# Sec Recon Analyses

Here we perform several analyses to assess the quality of secRecon.

In [1]:
import datetime
import pickle
import networkx as nx

import re
import pandas as pd
import numpy as np
from Bio import Entrez
import Request_Utilis
from google_sheet import GoogleSheet
from collections import defaultdict

# Plotting libraries
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import plotly.express as px
#from venn import venn
#from upsetplot import UpSet
import matplotlib.pyplot as plt
from itertools import product

# Warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
##### ----- Generate datasets from Google Sheet ----- #####

# Inputs for the GoogleSheet client: credential file and spreadsheet ID.
# NOTE(review): the previous comment labelled this ID as the
# "CHO Network Reconstruction + Recon3D_v3" sheet, while the variable name
# refers to Sec Recon -- confirm which description is correct.
KEY_FILE_PATH = 'credentials.json'
Sec_Recon_SPREADSHEET_ID = '1L6qQQs48OdFd-mJcVqov_rSDoV90Ta0kib6UpL81OJQ'

# Names of the two worksheets to pull from the spreadsheet.
sec_genes_sheet, ontology_sheet = 'SecRecon', 'Ontology'

# Open the spreadsheet once, then read both worksheets
# (SecRecon first, Ontology second).
sec_recon_gsheet_file = GoogleSheet(Sec_Recon_SPREADSHEET_ID, KEY_FILE_PATH)
sec_genes, ontology = (
    sec_recon_gsheet_file.read_google_sheet(sheet_name)
    for sheet_name in (sec_genes_sheet, ontology_sheet)
)

### Identification of secRecon genes in the CRISPR CHO whole-genome library

In [3]:
import requests
import urllib.parse
import pandas as pd
import json
import time
from tqdm import tqdm

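UCSC Genome Browser page for assembly GCF_003668045.3, the genome the BLAT searches below are run against: https://genome.ucsc.edu/h/GCF_003668045.3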

In [4]:
# Function to perform BLAT search for multiple sequences in FASTA format
def perform_blat_batch(manifest_names, sequences, db="hub_2667129_GCF_003668045.3", query_type="DNA", hgsid="2315154894_wzNAPcVmz3ZZYdvOnvPkXeKxPesW", max_retries=3, timeout=120):
    """Send a batch of sequences to the UCSC hgBlat CGI and return the JSON reply.

    The sequences are joined into one FASTA string (one record per
    name/sequence pair), URL-encoded and passed as the ``userSeq`` query
    parameter of a GET request.

    Args:
        manifest_names: Names used as FASTA headers. Paired with ``sequences``
            via ``zip``, so surplus items in the longer iterable are ignored.
        sequences: Sequences to align, in the same order as ``manifest_names``.
        db: Value of the ``db`` query parameter (URL-encoded before use).
        query_type: Value of the ``type`` query parameter.
        hgsid: Value of the ``hgsid`` query parameter.
            NOTE(review): the default is a hard-coded id, presumably tied to
            one browser session -- confirm it is still valid before a long run.
        max_retries: Maximum number of request attempts.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            fails the attempt instead of blocking the notebook indefinitely.

    Returns:
        Whatever ``response.json()`` yields on the first successful attempt,
        or ``None`` when every attempt failed.
    """
    # Create a FASTA formatted string for the sequences
    fasta_sequences = ''.join(f">{name}\n{seq}\n" for name, seq in zip(manifest_names, sequences))
    encoded_fasta_sequences = urllib.parse.quote(fasta_sequences)
    encoded_db = urllib.parse.quote(db)

    # Construct the URL for BLAT search with hgsid
    url = f"https://genome.ucsc.edu/cgi-bin/hgBlat?hgsid={hgsid}&userSeq={encoded_fasta_sequences}&type={query_type}&db={encoded_db}&output=json"

    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses
        except requests.exceptions.RequestException as e:
            print(f"Request failed (attempt {attempt + 1}): {e}")
        else:
            # Decode separately from the transport step: the JSON error raised
            # by requests can also be a RequestException, which previously made
            # undecodable replies show up as "Request failed".
            try:
                return response.json()
            except ValueError as e:
                print(f"JSON Decode Error (attempt {attempt + 1}): {e}")

        # Pause between attempts only; there is nothing to wait for after the last one.
        if attempt + 1 < max_retries:
            time.sleep(1)

    print(f"Failed to retrieve BLAT results after {max_retries} attempts for sequences: {sequences}")
    return None

# Function to parse and filter BLAT results, keeping only specified values
def parse_blat_results(results, min_score=20):
    """Reduce a BLAT JSON reply to a list of small per-alignment dicts.

    Args:
        results: Decoded reply, or ``None``. Alignments are read from its
            ``'blat'`` entry; a missing entry is treated as no alignments.
        min_score: Lower bound (inclusive) applied to element 0 of each
            alignment row.
            NOTE(review): element 0 is presumably the PSL ``matches`` count
            rather than a separate score -- confirm against the reply's field list.

    Returns:
        One dict per retained alignment with keys ``seqName``, ``strand``,
        ``tName``, ``tStart`` and ``tEnd`` taken from row positions 9, 8, 13,
        15 and 16 respectively. Empty list when ``results`` is ``None``.
    """
    if results is None:
        return []

    return [
        {
            "seqName": hit[9],
            "strand": hit[8],
            "tName": hit[13],
            "tStart": hit[15],
            "tEnd": hit[16],
        }
        for hit in results.get('blat', [])
        if hit[0] >= min_score
    ]

In [None]:
# Load CRISPR library dataset
# NOTE(review): `guide_rna` is assigned again from the pre-processed workbook in
# a later cell, so on a top-to-bottom run this frame is replaced before it is used.
guide_rna = pd.read_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates.xlsx')

# Load guide RNA with NT dataset
# (presumably "NT" = non-targeting guides, whose names start with 'NT_' -- confirm)
guide_rna_ntg = pd.read_excel('Data/guide_rna_lib/CRISPRa_library_manifest_NTG_without_duplicates.xlsx')

# Load TFs from IPA dataset: only spreadsheet column B of the 'IPA_results_85-TF' sheet is read
ipa_tfs = pd.read_excel('Data/IPA_analysis/TF_secrecon_lists.xlsx', sheet_name = 'IPA_results_85-TF', usecols='B')

In [5]:
# Workbook holding the pre-processed CRISPR library, including the
# 'tName' / 'tStart' / 'tEnd' coordinate columns inspected below
preprocessed_dataset_path = 'Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_pre_processed.xlsx'

# Load it into `guide_rna` (replacing any frame already bound to that name)
guide_rna = pd.read_excel(preprocessed_dataset_path)

# Report, per coordinate column, how many guides have no value yet
print("Initial status of 'tName', 'tStart', 'tEnd':")
print(guide_rna[['tName', 'tStart', 'tEnd']].isna().sum())

Initial status of 'tName', 'tStart', 'tEnd':
tName     19108
tStart    19108
tEnd      19108
dtype: int64


In [6]:
# Collect sequences that need BLAT search: every guide whose 'tName' is still empty.
# A boolean mask picks them in one pass instead of walking the frame row by row.
needs_blat = guide_rna['tName'].isnull()
sequences_to_blat = guide_rna.loc[needs_blat, 'guide'].tolist()
manifest_names_to_blat = guide_rna.loc[needs_blat, 'Manifest Name'].tolist()

# Perform BLAT in batches
batch_size = 25  # Adjust the batch size as needed
all_blat_results = []
# (names, sequences) of every batch that produced no usable reply, kept so the
# affected guides can be inspected or retried instead of being lost silently.
# NOTE(review): in the captured run each reported failed batch contains guides
# padded with 'N' -- worth checking whether those cause the non-JSON reply.
failed_batches = []
for i in tqdm(range(0, len(sequences_to_blat), batch_size), desc="Processing BLAT searches"):
    batch_sequences = sequences_to_blat[i:i+batch_size]
    batch_names = manifest_names_to_blat[i:i+batch_size]
    blat_result = perform_blat_batch(batch_names, batch_sequences)
    if not blat_result:
        failed_batches.append((batch_names, batch_sequences))
        continue
    all_blat_results.extend(parse_blat_results(blat_result, min_score=20))

if failed_batches:
    print(f"{len(failed_batches)} BLAT batch(es) returned no result; see `failed_batches` to retry them.")

Processing BLAT searches:   1%|               | 4/765 [00:29<1:34:40,  7.46s/it]

Request failed (attempt 1): Expecting value: line 2 column 9 (char 9)
Request failed (attempt 2): Expecting value: line 2 column 9 (char 9)
Request failed (attempt 3): Expecting value: line 2 column 9 (char 9)


Processing BLAT searches:   1%|               | 5/765 [00:55<2:55:16, 13.84s/it]

Failed to retrieve BLAT results after three attempts for sequences: ['CTTATATACGGTCCTAATGT', 'TTCCCTATTTGGTTCATTAC', 'CCCTGTGTCAAGTCTGAAGA', 'TTATATACGGTCCTAATGTT', 'GGTTCATTACTGGTTTTGAA', 'CGGTCCTAATGTTGGGATTA', 'CCCAGTACTGAGGTCAGCTT', 'NNNNNNNNNGGGGGCCGCGC', 'CAGGAAGACGGACGTGCGGC', 'AGAGCTTCCTGGCACAGCGT', 'CGGGGAGCATCGAAGGCGGG', 'GGAAGGGTGTCCCAAACCAG', 'GGTTGGTCCCTCTGCGTGAC', 'CCAGGAAGACGGACGTGCGG', 'CGTGCGGCGGGGAGCATCGA', 'TAAGTGAAAGCTGCATGGGA', 'AGCTGCATGGGAAGGACTGT', 'GTGTGTCATAAGGAGTGATC', 'GAAAATGCCCTATGGCACTA', 'CACATACACACCTAATGAAG', 'ACCTTTGCTTTTAAAAAGAG', 'GAAGGAGACATTCATTCATG', 'GCCCTATAAAGAAAACAAAC', 'TGTAACTATACATAATATTT', 'CATCCCCTCCTCCCACAGTT']
Request failed (attempt 1): Expecting value: line 2 column 24 (char 24)
Request failed (attempt 2): Expecting value: line 2 column 24 (char 24)
Request failed (attempt 3): Expecting value: line 2 column 24 (char 24)


Processing BLAT searches:   1%|               | 6/765 [01:20<3:42:38, 17.60s/it]

Failed to retrieve BLAT results after three attempts for sequences: ['CAGAGGCAGGCCCGATGAAG', 'ATAAATAAAAACCCTCAGTG', 'AAGTTTAATCACAGAGAGAG', 'GCCAGGAGCCCATTCCTGGG', 'CGGAGAGATGGCCTCGTAGG', 'GATCCTGCTGCTGCTGAACG', 'CATAGCAGCAGCAGACTTAG', 'CACTGTTTTTTTTTTGCCGG', 'ACTGTTTTTTTTTTGCCGGG', 'TCACTGTTTTTTTTTTGCCG', 'GTTTACTGATGGAGAGGATG', 'GGCCTTTTTAAAAAGACCCC', 'AGCCTCCGAGAGGAGAGGGG', 'GCCTCCGAGAGGAGAGGGGT', 'AGAATCCTGTGGTCATGGTG', 'TGGTCATGGTGAGGAAAGCC', 'GCCTCTGCCTCTGACGTTTA', 'TTGGCATGTGCCATCCTGGT', 'GTAGAAATTGGGTAAGGTGA', 'TCTTGGCCCAGGGCCCAAAG', 'CCGGCTGGGCTCAGGGACCT', 'CACACCTGCTTCTTGGCCCA', 'NNNNNNNNNGGGAGAAGCAG', 'CAACCGTATTCCTGCCTGTT', 'AGGCAAAGCCGTACCACCAG']


Processing BLAT searches:   1%|▏              | 8/765 [01:34<2:32:25, 12.08s/it]

Request failed (attempt 1): Expecting value: line 2 column 3 (char 3)
Request failed (attempt 2): Expecting value: line 2 column 3 (char 3)
Request failed (attempt 3): Expecting value: line 2 column 3 (char 3)


Processing BLAT searches:   1%|▏              | 9/765 [01:58<3:19:07, 15.80s/it]

Failed to retrieve BLAT results after three attempts for sequences: ['GCTTCGAGCCCACTCAGTCA', 'NNNNNNNNNNNNNNNACAGG', 'TGCGCCTGGAGAAAAAGTAG', 'AGACCTTGGGTAGGAGCAGA', 'CTCCTTCTGGAAGATACCCA', 'GAGACCTTGGGTAGGAGCAG', 'CTGGAGAAAAAGTAGGGGGC', 'CTGCGCCTGGAGAAAAAGTA', 'GAGAAAGCGGCATAGGCTGG', 'CAGGGGCATCGCCCACCCGC', 'CCGTCCGCCCGTCGGCCCGC', 'CGCAGTAACTAGAAGTGCAG', 'TGGATATCAAAGAAGGGCCC', 'CGGGGTCTTTTTCACAGGGC', 'GCCAGGGCTAACATCAGAAG', 'CAGCAGGGAAGTTTGGCAGT', 'TAGACGCCCCTCCTACCGGT', 'TATTTACCGAGAACCCGGTG', 'GTTTGGCAGTGGGCTGCGGT', 'CTAACCTGGCCCAGATCTGT', 'TCTTGCACCCAAAAATGCCC', 'TCTCCTTCGTGGTCCCAACA', 'NNNNNNNNNNNNNNNNNNAC', 'NNNNNNNNNACAGGAGGAAG', 'CCAGTATAACCCCACCAAAC']


Processing BLAT searches:   2%|▎             | 15/765 [02:40<1:40:21,  8.03s/it]

Request failed (attempt 1): Expecting value: line 2 column 11 (char 11)
Request failed (attempt 2): Expecting value: line 2 column 11 (char 11)
Request failed (attempt 3): Expecting value: line 2 column 11 (char 11)


Processing BLAT searches:   2%|▎             | 16/765 [03:03<2:36:59, 12.58s/it]

Failed to retrieve BLAT results after three attempts for sequences: ['CTTTAATCCCAGGTTTCCTT', 'AAAGCAGAGGCAGATCACAC', 'TGAGTTTTTTTTTATCGTGT', 'CAGGAATCTCTGTGAGGCTA', 'TCACTCAGAGACAGAGACTG', 'TTGGACCACACACCACAATT', 'TATTAATTTCATGGAGGGGT', 'TAATTTCATGGAGGGGTTGG', 'CCCAGATCCGGTTAGCACTC', 'NNNNNNNNNNNNNNNCCGGG', 'ACATGTGTTCTCCAGAAGCG', 'NNNNNNNNNNNNNNNNNNNC', 'NNNNNNNNNNNNNNNNNNCC', 'ACACACACAGTTAAGCCGCG', 'TGGGCCGAGGGCGGGGCAGC', 'ACACAGATGGGCTAATCCCT', 'GGGCCGAGGGCGGGGCAGCT', 'GTATGTCCTGGATATCCCTA', 'AATTTGAGCACGTGTGCAGC', 'ATTTGAGCACGTGTGCAGCA', 'GGTCACTCTTCTAGAGGGCC', 'ATCACAAGCAAATTATTATG', 'TCACAAGCAAATTATTATGA', 'GACAGTACAAATGTCACAGG', 'CAAATGTCACAGGAGGAGTC']
Request failed (attempt 1): Expecting value: line 2 column 15 (char 15)
Request failed (attempt 2): Expecting value: line 2 column 15 (char 15)
Request failed (attempt 3): Expecting value: line 2 column 15 (char 15)


Processing BLAT searches:   2%|▎             | 17/765 [03:27<3:17:07, 15.81s/it]

Failed to retrieve BLAT results after three attempts for sequences: ['TACTCCTCAATGGTGCTGGC', 'TCACAATTCTCCCTGGCCAC', 'TATGGATGCTTGGTTGTATG', 'TTCCAGATAAGTTGTATAAC', 'CATGGGTCTATACTCCTCAA', 'CTTGGTTGTATGGGGCGTGC', 'CGGCAGGGAAAGAAAAGAAA', 'TTCTAATGTAACTAAGACTC', 'AACTAAGACTCAGGAAGAAC', 'GGCTCAAGCTGCTCACTGCT', 'TTCTGCCTTTTAGATATGCC', 'GAAGATAAATGATGAGAACA', 'ACTTTCAGCATCCAGTTATT', 'NNNNNNNNNNNNNNNNTTTC', 'NNNNNNTTTCTGGATTCTAT', 'ATCTTTTCAATAGTCAAGAC', 'GAGCACTGGATGTAATATAG', 'TAGCTATGCTTTATCAGCAC', 'GTGCAGAAGGGGAGCCGGGC', 'CGCCCACACCGCTGTCTGTG', 'ATATAGTGGAGACTGGGGCT', 'CCAGGGCTCGCAGCCCGCGG', 'GAGCCGCTGCTGTGCAGAAG', 'TCTAGCCGATTGTTAACCAC', 'CTGTAGCTCTGGAGCTACGG']


Processing BLAT searches: 100%|█████████████| 765/765 [1:30:47<00:00,  7.12s/it]


In [7]:
# Update the dataframe with the BLAT results

# Build the 'Manifest Name' -> row label lookup once, rather than comparing the
# whole column against every single hit. `setdefault` keeps the first row for a
# repeated name, which is the row the previous `.index[0]` lookup selected.
row_label_by_name = {}
for row_label, manifest_name in zip(guide_rna.index, guide_rna['Manifest Name']):
    row_label_by_name.setdefault(manifest_name, row_label)

# Hits whose name is not present in the table are collected and reported
# instead of aborting the whole update with an IndexError.
unmatched_names = []
for result in all_blat_results:
    idx = row_label_by_name.get(result['seqName'])
    if idx is None:
        unmatched_names.append(result['seqName'])
        continue
    # When a guide has several retained alignments, later ones overwrite
    # earlier ones, so the last hit in `all_blat_results` is the one stored.
    guide_rna.at[idx, 'tName'] = result['tName']
    guide_rna.at[idx, 'tStart'] = result['tStart']
    guide_rna.at[idx, 'tEnd'] = result['tEnd']

if unmatched_names:
    print(f"{len(unmatched_names)} BLAT hit(s) did not match any 'Manifest Name' and were skipped.")

In [8]:
# Report, per coordinate column, how many guides are still without a value
# after the BLAT results were written back
print("Final status of 'tName', 'tStart', 'tEnd':")
print(guide_rna[['tName', 'tStart', 'tEnd']].isna().sum())

Final status of 'tName', 'tStart', 'tEnd':
tName     505
tStart    505
tEnd      505
dtype: int64


In [9]:
# Persist the updated guide table without the row index. The target is the same
# pre-processed workbook the table was read from, so that file is overwritten;
# a later run then only has to BLAT the guides whose 'tName' is still empty.
guide_rna.to_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_pre_processed.xlsx', index=False)

In [None]:
# Guides of the CRISPR library whose target gene is listed in secRecon.
# 'gene_lower' is target_name with every 'gene-' occurrence removed; despite the
# column name, no case conversion is applied. The helper column stays on
# `guide_rna` and is removed only from the resulting subset.
guide_rna['gene_lower'] = guide_rna['target_name'].str.replace('gene-', '')
common_genes_sec_recon = (
    guide_rna
    .loc[guide_rna['gene_lower'].isin(sec_genes['CHO GENE SYMBOL'])]
    .drop(columns=['gene_lower'])
)

In [None]:
# Find common genes between CRISPR library and IPA Analysis
# 'gene_upper' is target_name with 'gene-' removed and upper-cased, compared
# against the 'Upstream Regulator' column of the IPA table.
guide_rna['gene_upper'] = guide_rna['target_name'].str.replace('gene-', '').str.upper()
common_genes_ipa = guide_rna[guide_rna['gene_upper'].isin(ipa_tfs['Upstream Regulator'])]
# 'gene_lower' only exists when the secRecon matching cell has been run first;
# errors='ignore' keeps this cell from failing with a KeyError when it has not.
common_genes_ipa = common_genes_ipa.drop(columns=['gene_upper', 'gene_lower'], errors='ignore')

In [None]:
# Combine datasets: stack the secRecon-matched and IPA-matched guides into one
# frame with a fresh 0..n-1 index. No de-duplication is applied, so a guide
# present in both subsets appears twice.
final_df = pd.concat([common_genes_sec_recon, common_genes_ipa], ignore_index=True)

In [None]:
# Filter NT dataset

# Rows whose 'Name' begins with the 'NT_' prefix
nt_condition = guide_rna_ntg['Name'].str.startswith('NT_')

# Rows whose 'Name' equals a 'Manifest Name' retained in final_df
filter_condition = guide_rna_ntg['Name'].isin(final_df['Manifest Name'])

# A row is kept when at least one of the two conditions holds
combined_condition = filter_condition | nt_condition

# Subset of the NTG manifest selected by the combined mask
filtered_guide_rna_ntg = guide_rna_ntg.loc[combined_condition]

In [None]:
# Show the filtered NTG manifest as the cell's output
filtered_guide_rna_ntg

In [None]:
# Comparison of genes in the CRISPR library and secRecon
# Each count is the number of distinct values in the respective column.

print(f' Total genes in CRISPR library: {len(guide_rna.target_name.unique())}')
print(f' Genes in CRISPR library covered by secRecon: {len(common_genes_sec_recon.target_name.unique())}')
# Same printed text as before, now a real f-string like the two lines above
print(f" Total CHO genes in secRecon: {len(sec_genes['CHO GENE SYMBOL'].unique())}")

In [None]:
# Comparison of genes in the CRISPR library and IPA analysis
# Each count is the number of distinct values in the respective column.

print(f' Total genes in CRISPR library: {len(guide_rna.target_name.unique())}')
print(f' Genes in CRISPR library covered by IPA analysis: {len(common_genes_ipa.target_name.unique())}')
# Same printed text as before, now a real f-string like the two lines above
print(f" Total CHO genes in IPA TFs: {len(ipa_tfs['Upstream Regulator'].unique())}")

In [None]:
# Export the three result tables as Excel workbooks, omitting the row index:
# guides matched to secRecon, guides matched to the IPA list, and the filtered
# NTG manifest.
common_genes_sec_recon.to_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_sec_genes.xlsx', index=False)
common_genes_ipa.to_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_ipa_tfs.xlsx', index=False)
filtered_guide_rna_ntg.to_excel('Data/guide_rna_lib/CRISPRa_library_manifest_NTG_filtered_secgenes_and_IPATFs.xlsx', index=False)

In [None]:
# Plain-text dump of the guide table in its current state; it includes the
# helper columns 'gene_lower' / 'gene_upper' if the matching cells were run.
print(guide_rna)