# secRecon Analyses

Here we perform different analyses to test the quality of secRecon.

In [3]:
import datetime
import pickle
import networkx as nx

import re
import pandas as pd
import numpy as np
from Bio import Entrez
import Request_Utilis
from google_sheet import GoogleSheet
from collections import defaultdict

# Plotting libraries
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import plotly.express as px
#from venn import venn
#from upsetplot import UpSet
import matplotlib.pyplot as plt
from itertools import product

# Warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [4]:
##### ----- Generate datasets from Google Sheet ----- #####
# Reads two worksheets ('SecRecon' and 'Ontology') from a Google Sheet through
# the GoogleSheet helper imported from google_sheet.

#Credential file
# NOTE(review): presumably a service-account key file — keep it out of version control.
KEY_FILE_PATH = 'credentials.json'

#CHO Network Reconstruction + Recon3D_v3 Google Sheet ID
# NOTE(review): the description above does not match the variable name
# (Sec_Recon_SPREADSHEET_ID) — confirm which spreadsheet this ID points to.
Sec_Recon_SPREADSHEET_ID = '1L6qQQs48OdFd-mJcVqov_rSDoV90Ta0kib6UpL81OJQ'

# Initialize the GoogleSheet object
sec_recon_gsheet_file = GoogleSheet(Sec_Recon_SPREADSHEET_ID, KEY_FILE_PATH)

# Read data from SecRecon
sec_genes_sheet = 'SecRecon'
ontology_sheet = 'Ontology'

# NOTE(review): later cells index sec_genes by column name (e.g. 'MOUSE GENE SYMBOL'),
# so read_google_sheet presumably returns a pandas DataFrame — confirm.
sec_genes = sec_recon_gsheet_file.read_google_sheet(sec_genes_sheet)
ontology = sec_recon_gsheet_file.read_google_sheet(ontology_sheet)

### 1. Network Analysis

In this section we use the networks generated in the `Network_visualization` notebook to visualize experimental data from a CHO High vs Low dataset.

In [None]:
# Both differential-expression tables live in the same supplementary workbook
cho_vs_plasma_workbook = 'Data/cho_vs_plasma/1-s2.0-S1096717624000521-mmc3.xlsx'

# Proteome sheet
cho_vs_plasma_prot = pd.read_excel(cho_vs_plasma_workbook, sheet_name='Proteome DE proteins')

# Transcriptome sheet
cho_vs_plasma_rna = pd.read_excel(cho_vs_plasma_workbook, sheet_name='Transcriptome DE')

In [None]:
# Read gene_dict from the pickle file
# The cells below read gene_dict[gene]['systems'] and write gene_dict[gene]['Expression'].
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only load a
# gene_dict.pkl that comes from a trusted source.

with open('gene_dict.pkl', 'rb') as f:
    gene_dict = pickle.load(f)

In [None]:
# Translate the mouse gene symbols of the proteome table into human gene symbols

# Lookup built from the secRecon sheet: mouse symbol -> human symbol
dict_mouse_human = {
    mouse_symbol: human_symbol
    for mouse_symbol, human_symbol in zip(sec_genes['MOUSE GENE SYMBOL'], sec_genes['GENE SYMBOL'])
}

# Symbols that are not keys of the lookup end up as NaN in 'Human_Genes'
cho_vs_plasma_prot['Human_Genes'] = cho_vs_plasma_prot['Mmus_Genes'].map(dict_mouse_human)

In [None]:
# Filter rows where 'Human_Genes' is not NaN, i.e. keep only the proteins whose
# mouse symbol was found in the mouse-to-human lookup
filtered_df = cho_vs_plasma_prot[cho_vs_plasma_prot['Human_Genes'].notna()]

In [None]:
# Display the rows that received a human gene symbol
filtered_df

In [None]:
# Attach the fold change of each mapped protein to its entry in gene_dict
for _, protein in filtered_df.iterrows():
    symbol = protein['Human_Genes']
    if symbol not in gene_dict:
        continue
    # Stored under 'Expression'; the network plot reads it to size the nodes
    gene_dict[symbol]['Expression'] = float(protein['FC PCD/CHO'])

In [None]:
# Load the network stored as GraphML
G = nx.read_graphml('Network/sec_recon_network.graphml')

# Node layout: the 'x' and 'y' node attributes of the graph, cast to float
pos = {}
for node_id, attrs in G.nodes(data=True):
    pos[node_id] = (float(attrs['x']), float(attrs['y']))

In [None]:
# Draw the network with every node rendered as a pie chart: one equally sized
# wedge per system the gene belongs to, pie radius scaled by the gene's
# 'Expression' value.

# RGBA colour of each system
system_colors = {
    'Protein conformation': (1.0, 0.6, 0.0, 1.0),
    'Post-translational modifications': (0.1, 0.9, 0.1, 1.0),
    'Proteostasis': (0.3, 0.3, 0.9, 1.0),
    'Translocation': (0.4, 0.7, 0.9, 1.0),
    'Vesicle trafficking': (0.7, 0.7, 0.3, 1.0)
}

# Pie radius = Expression * radius_coeff + min_radius
radius_coeff = 0.000265
min_radius = 0.0000001  # radius used for genes without a usable 'Expression' value

# Initialize the figure and axis
fig, ax = plt.subplots(figsize=(40, 40))

# Only plot edges whose weight is at least 1
edgelist = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] >= 1]

# Draw edges
nx.draw_networkx_edges(G, pos, edgelist=edgelist, edge_color='lightgrey')

# Draw nodes as pie charts
for node, (x, y) in pos.items():
    systems = gene_dict[node]['systems']

    # One colour per system of this gene
    colors = [system_colors[system] for system in systems]

    # Genes that received no fold change have no 'Expression' key (KeyError);
    # a non-numeric value raises TypeError. Both fall back to the minimum radius.
    # Any other exception is allowed to surface instead of being silently swallowed.
    try:
        radius = gene_dict[node]['Expression'] * radius_coeff + min_radius
    except (KeyError, TypeError):
        radius = min_radius
    ax.pie([1]*len(systems), colors=colors, radius=radius, center=(x, y), wedgeprops=dict(edgecolor='black', linewidth=0.5))

# Extent of the node coordinates
x_values, y_values = zip(*pos.values())
min_x, max_x = min(x_values), max(x_values)
min_y, max_y = min(y_values), max(y_values)

# Set new axis limits (node extent plus a 0.1 margin on every side)
ax.set_xlim(min_x - 0.1, max_x + 0.1)
ax.set_ylim(min_y - 0.1, max_y + 0.1)

# Legend: one patch per system colour
legend_patches = [mpatches.Patch(color=color, label=category) for category, color in system_colors.items()]
plt.legend(handles=legend_patches, prop={'size': 35}, loc='lower left', bbox_to_anchor=(0.9, 0.6))
plt.subplots_adjust(right=0.75)

plt.savefig('Network/secrecon_network_systems.png', dpi=300, bbox_inches='tight')

# Display the plot
plt.show()

### 2. Identification of secRecon genes in CRISPR CHO whole genome library

In [5]:
import requests
import urllib.parse
import pandas as pd
import json
import time
from tqdm import tqdm

In [6]:
# Function to perform BLAT search

def perform_blat(sequence, db="hub_2667129_GCF_003668045.3", query_type="DNA", hgsid="2315154894_wzNAPcVmz3ZZYdvOnvPkXeKxPesW", max_retries=3, timeout=30, retry_delay=1):
    """Query the UCSC hgBlat endpoint for one sequence and return the decoded JSON reply.

    Args:
        sequence: Query sequence; URL-encoded and sent as the ``userSeq`` parameter.
        db: Value of the ``db`` parameter (URL-encoded before sending).
        query_type: Value of the ``type`` parameter.
        hgsid: Value of the ``hgsid`` parameter.
            NOTE(review): this looks like a browser session id that may expire —
            confirm the default is still valid before a long run.
        max_retries: Total number of requests attempted before giving up.
        timeout: Seconds to wait for the server on each attempt, so that a stalled
            connection cannot block the calling loop indefinitely.
        retry_delay: Seconds to sleep between two attempts.

    Returns:
        The decoded JSON response, or ``None`` when every attempt failed
        (a message with the last error is printed in that case).
    """
    # URL encode the sequence and database identifier
    encoded_sequence = urllib.parse.quote(sequence)
    encoded_db = urllib.parse.quote(db)

    # Construct the URL for BLAT search with hgsid
    url = f"https://genome.ucsc.edu/cgi-bin/hgBlat?hgsid={hgsid}&userSeq={encoded_sequence}&type={query_type}&db={encoded_db}&output=json"

    last_error = None
    for attempt in range(max_retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            return response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a reply body that is not valid JSON
            last_error = e
            # No point in waiting after the final attempt
            if attempt < max_retries - 1:
                time.sleep(retry_delay)

    print(f"Failed to retrieve BLAT results after {max_retries} attempt(s) for sequence: {sequence} (last error: {last_error!r})")
    return None
    
def parse_blat_results(results, min_score=20):
    """Keep the BLAT alignments that reach ``min_score`` and extract their target location.

    Args:
        results: Decoded BLAT reply — a dict whose ``'blat'`` entry is a list of
            alignment rows (sequences indexed by column position) — or ``None``
            when the search failed.
        min_score: Minimum value of column 0 of an alignment row for it to be kept.
            NOTE(review): in UCSC PSL column order, column 0 is the number of
            matching bases — confirm that is the intended "score".

    Returns:
        A list with one dict per retained alignment, holding the keys
        ``strand``, ``tName``, ``tStart`` and ``tEnd`` (columns 8, 13, 15 and 16).
        Empty when ``results`` is ``None``/empty or has no ``'blat'`` rows.
    """
    # A failed search yields None; treat it as "no alignments" rather than
    # raising AttributeError on .get
    if not results:
        return []

    alignments = results.get('blat') or []

    # Filter alignments based on score and keep specific values
    return [
        {
            "strand": alignment[8],
            "tName": alignment[13],
            "tStart": alignment[15],
            "tEnd": alignment[16],
        }
        for alignment in alignments
        if alignment[0] >= min_score
    ]

In [7]:
# Load CRISPR library dataset
# (later cells read its 'Manifest Name', 'guide' and 'target_name' columns)
guide_rna = pd.read_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates.xlsx')

# Load guide RNA with NT dataset
# (its 'Name' column is compared with 'Manifest Name' and the 'NT_' prefix further down)
guide_rna_ntg = pd.read_excel('Data/guide_rna_lib/CRISPRa_library_manifest_NTG_without_duplicates.xlsx')

# Load TFs from IPA dataset; usecols='B' restricts the read to spreadsheet column B
# (the 'Upstream Regulator' column is used further down)
ipa_tfs = pd.read_excel('Data/IPA_analysis/TF_secrecon_lists.xlsx', sheet_name = 'IPA_results_85-TF', usecols='B')

In [None]:
# Find common genes between CRISPR library and secRecon
# This cell had been disabled by wrapping it in a string literal, but later cells
# still use `common_genes_sec_recon` (combined table, summary printout, Excel
# export), so it must run for the notebook to execute top to bottom.

# Target names with the 'gene-' prefix removed. The helper column stays on
# guide_rna because the IPA comparison cell drops 'gene_lower' from its own result.
guide_rna['gene_lower'] = guide_rna['target_name'].str.replace('gene-', '')
common_genes_sec_recon = guide_rna[guide_rna['gene_lower'].isin(sec_genes['CHO GENE SYMBOL'])]
common_genes_sec_recon = common_genes_sec_recon.drop(['gene_lower'], axis=1)

In [None]:
# Perform BLAT for each sequence and store results
# One HTTP request is issued per guide, so this cell is slow for a full library.

results = {}
for index, row in tqdm(guide_rna.iterrows(), total=guide_rna.shape[0], desc="Processing BLAT searches"):
    manifest_name = row['Manifest Name']
    guide_sequence = row['guide']
    blat_result = perform_blat(guide_sequence)
    # perform_blat returns None when every attempt failed; record "no hits" for
    # that guide instead of letting one failed lookup abort the whole run
    if blat_result is None:
        results[manifest_name] = []
        continue
    results[manifest_name] = parse_blat_results(blat_result, min_score=20)

In [9]:
# Copy the first retained BLAT hit of every guide into the guide_rna table

# New columns start out empty
for column in ('tName', 'tStart', 'tEnd'):
    guide_rna[column] = None

# Guides without an entry in results, or with an empty hit list, keep None
for row_label, manifest in guide_rna['Manifest Name'].items():
    hits = results.get(manifest)
    if not hits:
        continue
    first_hit = hits[0]
    for column in ('tName', 'tStart', 'tEnd'):
        guide_rna.at[row_label, column] = first_hit[column]

In [11]:
# Write the current guide_rna table to a separate "pre_processed" workbook (row index omitted)
guide_rna.to_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_pre_processed.xlsx', index=False)

In [None]:
# Find common genes between CRISPR library and IPA Analysis

# Target names without the 'gene-' prefix, upper-cased for the comparison with
# ipa_tfs['Upstream Regulator']
guide_rna['gene_upper'] = guide_rna['target_name'].str.replace('gene-', '').str.upper()
common_genes_ipa = guide_rna[guide_rna['gene_upper'].isin(ipa_tfs['Upstream Regulator'])]

# Remove the helper columns from the result. 'gene_lower' is only present when the
# secRecon comparison has added it to guide_rna; errors='ignore' keeps this cell
# from raising KeyError when it has not.
common_genes_ipa = common_genes_ipa.drop(columns=['gene_upper', 'gene_lower'], errors='ignore')

In [None]:
# Combine datasets: stack the secRecon hits and the IPA hits into one table.
# ignore_index=True renumbers the rows; concat does not de-duplicate, so a guide
# present in both inputs appears twice.
final_df = pd.concat([common_genes_sec_recon, common_genes_ipa], ignore_index=True)

In [None]:
# Reduce the NT manifest to the selected guides plus the 'NT_' entries

ntg_names = guide_rna_ntg['Name']

# Names that also occur as 'Manifest Name' in final_df
filter_condition = ntg_names.isin(final_df['Manifest Name'])

# Names beginning with 'NT_'
nt_condition = ntg_names.str.startswith('NT_')

# A row is kept when either condition holds
combined_condition = filter_condition | nt_condition
filtered_guide_rna_ntg = guide_rna_ntg.loc[combined_condition]

In [None]:
# Display the filtered NT manifest
filtered_guide_rna_ntg

In [None]:
# Comparison of genes in the CRISPR library and secRecon
# Each count is the number of distinct values in the respective column.

print(f' Total genes in CRISPR library: {len(guide_rna.target_name.unique())}')
print(f' Genes in CRISPR library covered by secRecon: {len(common_genes_sec_recon.target_name.unique())}')
print(f' Total CHO genes in secRecon:',len(sec_genes['CHO GENE SYMBOL'].unique()))

In [None]:
# Comparison of genes in the CRISPR library and IPA analysis
# Each count is the number of distinct values in the respective column.
# NOTE(review): the last label says "CHO genes", but the value counted is the distinct
# 'Upstream Regulator' entries of the IPA table — confirm the wording.

print(f' Total genes in CRISPR library: {len(guide_rna.target_name.unique())}')
print(f' Genes in CRISPR library covered by IPA analysis: {len(common_genes_ipa.target_name.unique())}')
print(f' Total CHO genes in IPA TFs:',len(ipa_tfs['Upstream Regulator'].unique()))

In [None]:
# Export the three filtered tables to Excel workbooks (row index omitted):
# secRecon hits, IPA hits, and the filtered NT manifest
common_genes_sec_recon.to_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_sec_genes.xlsx', index=False)
common_genes_ipa.to_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_ipa_tfs.xlsx', index=False)
filtered_guide_rna_ntg.to_excel('Data/guide_rna_lib/CRISPRa_library_manifest_NTG_filtered_secgenes_and_IPATFs.xlsx', index=False)

In [None]:
# Plain-text dump of the guide_rna table in its current state
print(guide_rna)