#### Imports

In [183]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import fdrcorrection
from ast import literal_eval
import itertools
import requests
from Bio import Entrez
import time
from tqdm import tqdm

#### Load Genes From Aggregated Results Files

In [184]:
aa_agg_results = "/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/aggregated_results_ccle_tcga_pcawg.csv"
aa = pd.read_csv(aa_agg_results)
aa["All genes"] = aa.apply(lambda x : x["All genes"].replace("\"", ""), axis = 1)
aa['All genes'] = aa['All genes'].apply(literal_eval) # convert to list

In [185]:
all_genes = []
for i, row in aa.iterrows():
    all_genes += row["All genes"]
all_genes = [gene for gene in all_genes if gene != '']
all_genes = list(set(all_genes))

In [186]:
len(all_genes)

16091

In [187]:
aa_full_gene_list_hg38 = "/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/refGene_hg38.txt"
aa_full_gene_list_hg19 = "/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/refGene_hg19.txt"

list1 = "/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/GRCh38/refGene.txt"
list2 = "/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/GRCh37/human_hg19_september_2011/Genes_July_2010_hg19.gff"
list3 = "/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/hg19/human_hg19_september_2011/Genes_July_2010_hg19.gff"

aa_full_gene_list_hg38 = pd.read_csv(aa_full_gene_list_hg38, sep = "\t", header = None)
aa_full_gene_list_hg19 = pd.read_csv(aa_full_gene_list_hg19, sep = "\t", header = None)

list1 = pd.read_csv(list1, sep = "\t", header = None)
list2 = pd.read_csv(list2, sep = "\t", header = None)
list3 = pd.read_csv(list3, sep = "\t", header = None)

list2[9] = list2.apply(lambda x : x[8].strip().split(";")[2].split("=")[1], axis = 1)
list3[9] = list3.apply(lambda x : x[8].strip().split(";")[2].split("=")[1], axis = 1)

In [188]:
aa_full_gene_list_hg38 = set(list(aa_full_gene_list_hg38[12]))
aa_full_gene_list_hg19 = set(list(aa_full_gene_list_hg19[12]))

list1 = set(list(list1[12]))
list2 = set(list(list2[9]))
list3 = set(list(list3[9]))

print("hg38:", len(aa_full_gene_list_hg38))
print("hg19:", len(aa_full_gene_list_hg19))
print("hg38_2:", len(list1))
print("hg19_2:", len(list2))
print("hg19_3:", len(list3))

aa_full_gene_list_full = set(list(aa_full_gene_list_hg38) + list(aa_full_gene_list_hg19) + list(list1) + list(list2) + list(list3))
print("Combined Gene List:", len(aa_full_gene_list_full))

hg38: 28286
hg19: 27325
hg38_2: 28278
hg19_2: 21824
hg19_3: 21841
Combined Gene List: 32247


#### Find Aliases Using HGNC

In [189]:
def fetch_previous_hgnc_symbols(gene_name):
    url = f"https://rest.genenames.org/fetch/symbol/{gene_name}"
    headers = {"Accept": "application/json"}
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        if "response" in data and data["response"]["numFound"] > 0:
            doc = data["response"]["docs"][0]
            aliases = doc.get('alias_symbol', [])
            prev = doc.get('prev_symbol', [])
            return aliases + prev
    else:
        print(f"Failed to retrieve data for {gene_name}, Status code: {response.status_code}")
    return []

In [191]:
gene_alias_mapping = {}

for gene in tqdm(aa_full_gene_list_full, desc="Processing genes"):
    aliases = fetch_previous_hgnc_symbols(gene)
    if aliases == ['']:
        aliases = []
    gene_alias_mapping[gene] = aliases

Processing genes: 100%|█████████████████████████████████████████████████████████| 32247/32247 [3:10:25<00:00,  2.82it/s]


#### Final Alias Mapper

In [192]:
gene_alias_mapping

{'SLC2A12': ['GLUT12', 'GLUT8'],
 'CEACAM20': ['UNQ9366'],
 'C7orf27': [],
 'PCBP4': ['MCG10', 'LIP4'],
 'MMP9': ['CLG4B'],
 'MIR9-3': ['hsa-mir-9-3', 'MIRN9-3'],
 'DLC1': ['HP', 'ARHGAP7', 'STARD12', 'DLC-1', 'p122-RhoGAP'],
 'RBM12': ['HRIHFB2091', 'KIAA0765', 'SWAN'],
 'CD300H': [],
 'SNORA36B': ['ACA36b'],
 'DIAPH1-AS1': [],
 'CEP89': ['FLJ14640', 'CCDC123'],
 'FURIN': ['SPC1', 'PCSK3', 'FUR', 'PACE'],
 'GJB5': ['CX31.1'],
 'CRB3': ['MGC17303'],
 'MRM1': ['FLJ22578'],
 'SLC38A8': ['SNAT8'],
 'C1QTNF1-AS1': [],
 'VAC14-AS1': [],
 'PHF20L1': ['CGI-72', 'FLJ13649', 'MGC64923', 'FLJ21615', 'TDRD20B'],
 'SFTPB': ['SP-B', 'SFTP3'],
 'NUDT18': ['FLJ22494', 'MTH3'],
 'SOGA1': [],
 'SNORA41B': [],
 'LINC01511': ['RP11-325I22.2'],
 'LOC442421': [],
 'CPSF2': ['KIAA1367', 'CPSF100'],
 'SKIDA1': ['FLJ45187', 'C10orf140'],
 'FAM213B': [],
 'SPRR2A': [],
 'PLEKHA9': [],
 'PCDHA4': ['CNR1', 'CRNR1', 'PCDH-ALPHA4', 'CNRN1'],
 'C6orf146': [],
 'PSMB8-AS1': ['XXbac-BPG246D15.8', 'TAP1-AS1', 'TAPSAR1

In [194]:
len(gene_alias_mapping)

32247

In [217]:
new = {}

In [222]:
for g in gene_alias_mapping:
    aliases = gene_alias_mapping[g]
    for a in aliases:
        all_genes = [g] + aliases
        all_genes.remove(a)
        new[a] = all_genes

In [223]:
len(new)

49689

#### Create DepMap Mapper

In [198]:
depmap = "/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/CRISPR_DepMap_Public_24Q2_Score_Chronos_subsetted_metadata.csv"

In [199]:
score_matrix = pd.read_csv(depmap)

  score_matrix = pd.read_csv(depmap)


In [200]:
score_matrix

Unnamed: 0,depmap_id,cell_line_display_name,lineage_1,lineage_2,lineage_3,lineage_5,lineage_6,lineage_4,A1BG,A1CF,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
0,ACH-001270,127399,Soft Tissue,Synovial Sarcoma,Synovial Sarcoma,,,,-0.125700,-0.190342,...,-0.560529,-0.266138,0.022924,0.021368,-0.096022,0.126506,-0.068497,0.075919,-0.352637,-0.662216
1,ACH-002680,170MGBA,CNS/Brain,Diffuse Glioma,Glioblastoma,Glioblastoma,,,-0.056783,-0.074939,...,-0.149722,-0.521723,0.072913,0.058573,-0.100639,-0.123846,-0.217436,-0.100587,-0.064954,-0.282006
2,ACH-002401,21MT2,Breast,Invasive Breast Carcinoma,Breast Invasive Ductal Carcinoma,,,,-0.039836,-0.116822,...,-0.261890,-0.458087,0.009890,0.145250,0.027379,-0.008181,-0.178623,-0.051439,-0.307155,-0.183515
3,ACH-002399,21NT,Breast,Invasive Breast Carcinoma,Breast Invasive Ductal Carcinoma,,,,0.013372,-0.082142,...,-0.315221,-0.431554,-0.200092,0.004946,-0.051205,-0.006521,-0.240958,-0.013675,-0.263179,-0.207334
4,ACH-000520,59M,Ovary/Fallopian Tube,Ovarian Epithelial Tumor,High-Grade Serous Ovarian Cancer,High Grade Serous,,,-0.222447,-0.096272,...,-0.333100,-0.556162,-0.151574,-0.308978,-0.178016,-0.031038,-0.394326,-0.105367,-0.159511,-0.135067
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1145,ACH-000146,THP1,Myeloid,Acute Myeloid Leukemia,Acute Myeloid Leukemia,M5,,,-0.142702,-0.110119,...,-0.300280,-0.182461,0.102202,-0.069822,-0.173859,-0.111945,0.098092,0.064468,0.003113,-0.179357
1146,ACH-000835,GCT,Soft Tissue,Undifferentiated Pleomorphic Sarcoma/Malignant...,Undifferentiated Pleomorphic Sarcoma/Malignant...,,,,-0.174303,-0.017148,...,-0.278384,-0.498170,0.024834,0.104500,-0.020534,-0.105610,-0.111710,0.042609,-0.238596,-0.414749
1147,ACH-001300,CHLA15,Peripheral Nervous System,Neuroblastoma,Neuroblastoma,,,,-0.210638,0.075649,...,-0.321568,-0.486237,-0.031050,0.193135,-0.337853,-0.035557,-0.274687,0.158534,-0.217785,-0.240230
1148,ACH-001301,COGN278,Peripheral Nervous System,Neuroblastoma,Neuroblastoma,,MYCN Amp,,-0.014705,-0.000141,...,-0.490207,-0.502931,0.398217,0.238408,0.007151,0.091789,-0.489645,-0.033089,-0.210309,-0.135954


In [201]:
depmap_genes = list(score_matrix.columns)
#depmap_genes.remove("Unnamed: 0")
depmap_genes.remove("depmap_id")
depmap_genes.remove("cell_line_display_name")
depmap_genes.remove("lineage_1")
depmap_genes.remove("lineage_2")
depmap_genes.remove("lineage_3")
depmap_genes.remove("lineage_4")
depmap_genes.remove("lineage_5")
depmap_genes.remove("lineage_6")
len(depmap_genes)

18435

In [204]:
missing = []
for g in depmap_genes:
    if g not in gene_alias_mapping and g not in new:
        missing.append(g)

In [205]:
len(missing)

262

In [210]:
missing_genes_mapping = {}

for gene in tqdm(missing, desc="Processing genes"):
    aliases = fetch_previous_hgnc_symbols(gene)
    if aliases == ['']:
        aliases = []
    missing_genes_mapping[gene] = aliases

Processing genes: 100%|███████████████████████████████████████████████████████████████| 262/262 [01:57<00:00,  2.23it/s]


In [211]:
missing_genes_mapping

{'ABTB3': ['FLJ33957', 'ABTB2B', 'BTBD11'],
 'ACTMAP': ['FLJ41131', 'C19orf54'],
 'ADGRD2': ['PGR24', 'GPR144'],
 'ADPRS': ['ARH3', 'FLJ20446', 'ADPRHL2'],
 'AQP7B': [],
 'ATOSA': ['FLJ10980', 'KIAA1370', 'FAM214A'],
 'ATOSB': ['FLJ11560', 'bA182N22.6', 'KIAA1539', 'FAM214B'],
 'ATP5MJ': ['MP68', 'MLQ', '6.8PL', 'C14orf2', 'ATP5MPL'],
 'ATP5MK': ['MGC14697', 'bA792D24.4', 'DAPIT', 'AGP', 'USMG5', 'ATP5MD'],
 'SPMIP1': ['ATP6V1FNB'],
 'BBLN': ['Hero9', 'MGC4639', 'EST00098', 'FLJ12823', 'C9orf16'],
 'BLTP1': ['FLJ21404', 'FSA', 'KIAA1371', 'Tweek', 'KIAA1109'],
 'BLTP2': ['DKFZp686M0843',
  'MGC111488',
  'BCOX1',
  'CT101',
  'BCOX',
  'FMP27',
  'Hob',
  'KIAA0100'],
 'BLTP3A': ['FLJ20302', 'dJ349A12.1', 'C6orf107', 'UHRF1BP1'],
 'BLTP3B': ['KIAA0701', 'SHIP164', 'UHRF1BP1L'],
 'BMAL1': ['MOP3', 'JAP3', 'PASD3', 'bHLHe5', 'ARNTL1', 'ARNTL'],
 'BMAL2': ['MOP9', 'CLIF', 'PASD9', 'bHLHe6', 'ARNTL2'],
 'BPNT2': ['FLJ20421', 'IMPA3', 'gPAPP', 'IMPAD1'],
 'BRME1': ['MGC11271', 'MEIOK21', 'C

In [212]:
new_missing = {}
for g in missing_genes_mapping:
    aliases = missing_genes_mapping[g]
    for a in aliases:
        all_genes = [g] + aliases
        all_genes.remove(a)
        new_missing[a] = all_genes

In [213]:
new_missing

{'FLJ33957': ['ABTB3', 'ABTB2B', 'BTBD11'],
 'ABTB2B': ['ABTB3', 'FLJ33957', 'BTBD11'],
 'BTBD11': ['ABTB3', 'FLJ33957', 'ABTB2B'],
 'FLJ41131': ['ACTMAP', 'C19orf54'],
 'C19orf54': ['ACTMAP', 'FLJ41131'],
 'PGR24': ['ADGRD2', 'GPR144'],
 'GPR144': ['ADGRD2', 'PGR24'],
 'ARH3': ['ADPRS', 'FLJ20446', 'ADPRHL2'],
 'FLJ20446': ['ADPRS', 'ARH3', 'ADPRHL2'],
 'ADPRHL2': ['ADPRS', 'ARH3', 'FLJ20446'],
 'FLJ10980': ['ATOSA', 'KIAA1370', 'FAM214A'],
 'KIAA1370': ['ATOSA', 'FLJ10980', 'FAM214A'],
 'FAM214A': ['ATOSA', 'FLJ10980', 'KIAA1370'],
 'FLJ11560': ['ATOSB', 'bA182N22.6', 'KIAA1539', 'FAM214B'],
 'bA182N22.6': ['ATOSB', 'FLJ11560', 'KIAA1539', 'FAM214B'],
 'KIAA1539': ['ATOSB', 'FLJ11560', 'bA182N22.6', 'FAM214B'],
 'FAM214B': ['ATOSB', 'FLJ11560', 'bA182N22.6', 'KIAA1539'],
 'MP68': ['ATP5MJ', 'MLQ', '6.8PL', 'C14orf2', 'ATP5MPL'],
 'MLQ': ['ATP5MJ', 'MP68', '6.8PL', 'C14orf2', 'ATP5MPL'],
 '6.8PL': ['ATP5MJ', 'MP68', 'MLQ', 'C14orf2', 'ATP5MPL'],
 'C14orf2': ['ATP5MJ', 'MP68', 'MLQ', '

In [216]:
for gene in missing:
    aliases = missing_genes_mapping[gene]
    for a in aliases:
        if a in gene_alias_mapping:
            print(gene)
            print(a, gene_alias_mapping[a])
        if a in new:
            print(gene)
            print(a, new[a])
        print()



ABTB3
BTBD11 []


ACTMAP
C19orf54 []


ADGRD2
GPR144 []



ADPRS
ADPRHL2 []


ATOSA
KIAA1370 []

ATOSA
FAM214A []



ATOSB
KIAA1539 []

ATOSB
FAM214B []




ATP5MJ
C14orf2 []

ATP5MJ
ATP5MPL []





ATP5MK
USMG5 []

ATP5MK
ATP5MD []

SPMIP1
ATP6V1FNB []





BBLN
C9orf16 []





BLTP1
KIAA1109 []








BLTP2
KIAA0100 []




BLTP3A
UHRF1BP1 []



BLTP3B
UHRF1BP1L []

BMAL1
MOP3 ['SBNO1', 'FLJ10701', 'FLJ10833', 'Sno']





BMAL1
ARNTL []





BMAL2
ARNTL2 []




BPNT2
IMPAD1 []



BRME1
C19orf57 []



SPMIP5
C10orf82 []



TOP6BL
C11orf80 []


RLIG1
C12orf29 []


FERRY3
C12orf4 []


REDIC1
C12orf40 []





MTNAP1
C17orf80 []


LIAT1
C17orf97 []


SPMAP1
C17orf98 []





ARK2N
C18orf25 []

SPMIP3
C1orf100 []


AIRIM
C1orf109 []



FIRRM
FLIP ['CFLAR', 'CASH', 'Casper', 'CLARP', 'FLAME', 'I-FLICE', 'MRIT', 'c-FLIP', 'cFLIP', 'CASP8AP1']


FIRRM
C1orf112 []



KPLCE
C1orf68 []


ADISSP
C20orf27 []



CIMIP1
C20orf85 []

EPCIP
B37 ['ATN1', 'D12S755E', 'DRPLA']




EPCIP
C21orf62 []


CE

In [232]:
mapper = {}

In [233]:
for g in depmap_genes:
    if g in gene_alias_mapping:
        mapper[g] = g
    elif g in new:
        mapper[g] = new[g][0]
    elif g in missing_genes_mapping:
        aliases = missing_genes_mapping[g]
        for a in aliases:
            if a in gene_alias_mapping:
                mapper[g] = a
            if a in new:
                mapper[g] = new[a][0]

In [234]:
mapper

{'A1BG': 'A1BG',
 'A1CF': 'A1CF',
 'A2M': 'A2M',
 'A2ML1': 'A2ML1',
 'A3GALT2': 'A3GALT2',
 'A4GALT': 'A4GALT',
 'A4GNT': 'A4GNT',
 'AAAS': 'AAAS',
 'AACS': 'AACS',
 'AADAC': 'AADAC',
 'AADACL2': 'AADACL2',
 'AADACL3': 'AADACL3',
 'AADACL4': 'AADACL4',
 'AADAT': 'AADAT',
 'AAGAB': 'AAGAB',
 'AAK1': 'AAK1',
 'AAMDC': 'AAMDC',
 'AAMP': 'AAMP',
 'AANAT': 'AANAT',
 'AAR2': 'AAR2',
 'AARD': 'AARD',
 'AARS1': 'AARS1',
 'AARS2': 'AARS2',
 'AARSD1': 'AARSD1',
 'AASDH': 'AASDH',
 'AASDHPPT': 'AASDHPPT',
 'AASS': 'AASS',
 'AATF': 'AATF',
 'AATK': 'AATK',
 'ABAT': 'ABAT',
 'ABCA1': 'ABCA1',
 'ABCA10': 'ABCA10',
 'ABCA12': 'ABCA12',
 'ABCA13': 'ABCA13',
 'ABCA2': 'ABCA2',
 'ABCA3': 'ABCA3',
 'ABCA4': 'ABCA4',
 'ABCA5': 'ABCA5',
 'ABCA6': 'ABCA6',
 'ABCA7': 'ABCA7',
 'ABCA8': 'ABCA8',
 'ABCA9': 'ABCA9',
 'ABCB1': 'ABCB1',
 'ABCB10': 'ABCB10',
 'ABCB11': 'ABCB11',
 'ABCB4': 'ABCB4',
 'ABCB5': 'ABCB5',
 'ABCB6': 'ABCB6',
 'ABCB7': 'ABCB7',
 'ABCB8': 'ABCB8',
 'ABCB9': 'ABCB9',
 'ABCC1': 'ABCC1',
 'AB

In [235]:
len(mapper)

18420

In [236]:
len(depmap_genes)

18435

#### Create Final Mapper

In [249]:
data = []
for gene in mapper:
    curr = []
    curr.append(mapper[gene])
    curr.append(gene)
    data.append(curr)
df = pd.DataFrame(data)
df.columns = ["Gene Name", "DepMap Gene Name"]

In [251]:
df.to_csv("/mnt/c/Users/Owner/OneDrive/Documents/BENG_Senior_Design/DepMap/data/coamp_gene_mapper_12.21.24.tsv", sep = "\t", index = False)