# Sec Recon Analyses

Here we perform several analyses to assess the quality of secRecon.

In [1]:
import datetime
import pickle
import tqdm.notebook as tqdm

import re
import pandas as pd
import numpy as np
from Bio import Entrez
import Request_Utilis
from google_sheet import GoogleSheet
from collections import defaultdict

import matplotlib.pyplot as plt
import plotly.express as px
from venn import venn
from upsetplot import UpSet
import matplotlib.pyplot as plt
from itertools import product

# Warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

In [2]:
##### ----- Generate datasets from Google Sheet ----- #####

# Credential file handed to the GoogleSheet wrapper
KEY_FILE_PATH = 'credentials.json'

# Google Sheet ID of the CHO Network Reconstruction + Recon3D_v3 workbook
Sec_Recon_SPREADSHEET_ID = '1DaAdZlvMYDqb7g31I5dw-ZCZH52Xj_W3FnQMFUzqmiQ'

# Worksheets to read from the secRecon spreadsheet
sec_genes_sheet, ontology_sheet = 'SecRecon', 'Ontology'

# Handle on the spreadsheet, built from its ID and the credential file
sec_recon_gsheet_file = GoogleSheet(Sec_Recon_SPREADSHEET_ID, KEY_FILE_PATH)

# Read the gene worksheet first, then the ontology worksheet
sec_genes, ontology = (
    sec_recon_gsheet_file.read_google_sheet(sheet_name)
    for sheet_name in (sec_genes_sheet, ontology_sheet)
)

### Network Analysis

In this section, we use the networks generated in the Network_visualization notebook to visualize experimental data from a CHO High vs Low dataset.

In [None]:
# Load the 'Proteome DE proteins' worksheet of the supplementary workbook
cho_vs_plasma_prot = pd.read_excel(
    'Data/cho_vs_plasma/1-s2.0-S1096717624000521-mmc3.xlsx',
    sheet_name='Proteome DE proteins',
)

In [None]:
# Map Mouse genes to Human genes
#
# Build a mouse -> human symbol lookup from the secRecon sheet and apply it to
# the 'Mmus_Genes' column. Values without an entry in the lookup become NaN in
# the new 'Human_Genes' column.
dict_mouse_human = dict(zip(sec_genes['MOUSE GENE SYMBOL'], sec_genes['GENE SYMBOL']))

# Map with the lookup built in this cell. The cell used to pass `gene_dict`,
# which is not defined anywhere above it, so a fresh "Restart & Run All"
# stopped here with a NameError while `dict_mouse_human` went unused.
# NOTE(review): if the pickled `gene_dict` loaded further down was the intended
# mapping, move that load above this cell instead - confirm with the author.
cho_vs_plasma_prot['Human_Genes'] = cho_vs_plasma_prot['Mmus_Genes'].map(dict_mouse_human)

In [None]:
# Display only the rows that received a value in 'Human_Genes'
cho_vs_plasma_prot['Human_Genes'].dropna()

In [None]:
# Read the dictionary from the pickle file
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# 'gene_dict.pkl' must come from a trusted source.
# NOTE(review): nothing above this cell defines `gene_dict`; any cell that
# needs it has to run after this one.
with open('gene_dict.pkl', 'rb') as f:
    gene_dict = pickle.load(f)

### 2. Identification of secRecon genes in CRISPR CHO whole genome library

In [3]:
# Load CRISPR library dataset
guide_rna = pd.read_excel('Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates.xlsx')

# Load guide RNA with NT dataset
guide_rna_ntg = pd.read_excel('Data/guide_rna_lib/CRISPRa_library_manifest_NTG_without_duplicates.xlsx')

# Load TFs from IPA dataset
ipa_tfs = pd.read_excel('Data/IPA_analysis/TF_secrecon_lists.xlsx', sheet_name = 'IPA_results_85-TF', usecols='B')

In [4]:
# Find common genes between CRISPR library and secRecon
guide_rna['gene_lower'] = guide_rna['target_name'].str.replace('gene-', '')
common_genes_sec_recon = guide_rna[guide_rna['gene_lower'].isin(sec_genes['CHO GENE SYMBOL'])]
common_genes_sec_recon = common_genes_sec_recon.drop(['gene_lower'], axis=1)

In [5]:
# Find common genes between CRISPR library and IPA Analysis
#
# The literal text 'gene-' is removed from 'target_name' and the result is
# upper-cased before matching against the IPA 'Upstream Regulator' column.
guide_rna['gene_upper'] = (
    guide_rna['target_name'].str.replace('gene-', '', regex=False).str.upper()
)
common_genes_ipa = guide_rna[guide_rna['gene_upper'].isin(ipa_tfs['Upstream Regulator'])]

# Remove the helper columns from the result. 'gene_lower' only exists when the
# secRecon cell has already run; errors='ignore' avoids a KeyError when this
# cell is executed on its own.
common_genes_ipa = common_genes_ipa.drop(
    columns=['gene_upper', 'gene_lower'], errors='ignore'
)

In [6]:
# Stack the secRecon hits on top of the IPA hits with a fresh 0..n-1 index
final_df = pd.concat(
    [common_genes_sec_recon, common_genes_ipa],
    ignore_index=True,
)

In [7]:
# Filter NT dataset

# Rows whose 'Name' occurs among the 'Manifest Name' values of final_df
filter_condition = guide_rna_ntg['Name'].isin(final_df['Manifest Name'])

# Rows whose 'Name' begins with 'NT_'
nt_condition = guide_rna_ntg['Name'].str.startswith('NT_')

# A row is kept when at least one of the two conditions holds
combined_condition = filter_condition | nt_condition

# Select the matching rows of the manifest
filtered_guide_rna_ntg = guide_rna_ntg.loc[combined_condition]

In [8]:
# Display the filtered manifest as the cell output
filtered_guide_rna_ntg

Unnamed: 0,Name,Sequence
0,NT_1,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGACACC...
1,NT_2,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGCGGTC...
2,NT_3,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGTCTTG...
3,NT_4,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGGTGAT...
4,NT_5,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGCGAGC...
...,...,...
110518,T_118958,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGTTGGT...
110519,T_118959,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGCTGAA...
110520,T_118960,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGCCGAG...
110521,T_118961,TTTCTTGGCTTTATATATCTTGTGGAAAGGACGAAACACCGGTGCC...


In [9]:
# Report how many CRISPR library targets are covered by secRecon

# Distinct 'target_name' values in the whole library
print(f' Total genes in CRISPR library: {len(guide_rna["target_name"].unique())}')
# Distinct 'target_name' values among the secRecon matches
print(f' Genes in CRISPR library covered by secRecon: {len(common_genes_sec_recon["target_name"].unique())}')
# Distinct 'CHO GENE SYMBOL' values in the secRecon sheet
print(' Total CHO genes in secRecon:', len(sec_genes['CHO GENE SYMBOL'].unique()))

 Total genes in CRISPR library: 13824
 Genes in CRISPR library covered by secRecon: 238
 Total CHO genes in secRecon: 1100


In [10]:
# Report how many CRISPR library targets are covered by the IPA analysis

# Distinct 'target_name' values in the whole library
print(f' Total genes in CRISPR library: {len(guide_rna["target_name"].unique())}')
# Distinct 'target_name' values among the IPA matches
print(f' Genes in CRISPR library covered by IPA analysis: {len(common_genes_ipa["target_name"].unique())}')
# Distinct 'Upstream Regulator' values in the IPA table
print(' Total CHO genes in IPA TFs:', len(ipa_tfs['Upstream Regulator'].unique()))

 Total genes in CRISPR library: 13824
 Genes in CRISPR library covered by IPA analysis: 26
 Total CHO genes in IPA TFs: 85


In [11]:
# Write the three result tables to Excel without the DataFrame index

# CRISPR library rows matching secRecon genes
common_genes_sec_recon.to_excel(
    'Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_sec_genes.xlsx',
    index=False,
)

# CRISPR library rows matching IPA TFs
common_genes_ipa.to_excel(
    'Data/guide_rna_lib/CRISPR_a_Library_sorted_file_without_duplicates_ipa_tfs.xlsx',
    index=False,
)

# Manifest rows kept by the NT / 'Manifest Name' filter
filtered_guide_rna_ntg.to_excel(
    'Data/guide_rna_lib/CRISPRa_library_manifest_NTG_filtered_secgenes_and_IPATFs.xlsx',
    index=False,
)