In [157]:
import numpy as np
import pandas as pd
# additional filter for subset_col
AML_cell_lines = pd.read_csv("cell-line-selector.csv")['displayName'].to_list()
AML_cell_lines.sort()

## Create list of synonyms dictionary

In [66]:
with open("cellosaurus.txt") as f:
    content_list = f.readlines()

# print the list
#print(content_list[:50])

# remove new line characters
content_list = [x.strip() for x in content_list]
#print(content_list[:50])

In [67]:
# extract cell line names
cellosaurus_names = []

for line in content_list:
    if "ID   " in line:
            ID = line.replace("ID   ","")
            cellosaurus_names.append(["ID",ID])
    if "SY   " in line:
            SY = line.replace("SY   ","")
            cellosaurus_names.append(["SY",SY])

In [68]:
# remove cell line names with no synonyms
cellosaurus_names_2 = []
for ind, line in enumerate(cellosaurus_names):
    if line[0] == "SY":
        continue
    if ind == len(cellosaurus_names) -1:
        break
    if cellosaurus_names[ind+1][0] == "SY":
        cellosaurus_names_2.append(line)
        cellosaurus_names_2.append(cellosaurus_names[ind+1])

In [83]:
# create dictionary mapping main cell line identifier to list of synonyms
cellosaurus_dict = {}
for line in cellosaurus_names_2:
    if line[0] == "SY":
        continue
    if line[0] == "ID":
        ID_ind = cellosaurus_names_2.index(line)
        SY_ind = ID_ind + 1
        
        ID = line[1]
        
        synonyms = cellosaurus_names_2[SY_ind]
        synonyms = synonyms[1].split(";")
        synonyms = [x.strip(' ') for x in synonyms]
        
        cellosaurus_dict[ID] = synonyms

In [92]:
# create csv for storage
cellosaurus_df = pd.DataFrame([cellosaurus_dict.keys(),cellosaurus_dict.values()]).T
cellosaurus_df.columns = ["ID","synonyms"]
cellosaurus_df.to_csv("cellosaurus_synonyms.csv",index=False)

## Find all possible AML cell line synonyms

In [158]:
AML_all_synonyms = []

# loop AML cell line name from DepMap
for AML_cell in AML_cell_lines:
    #loop through synonyms dictionary
    for key in cellosaurus_dict:
        synonyms = cellosaurus_dict[key]
        
        # check if cell name matches key
        if AML_cell == key:
            synonyms.insert(0, key)
            #print("matched by key")
            print(synonyms)
            # append to list of synonyms
            AML_all_synonyms.append(synonyms)
            
        # check if cell name is in synonym values
        elif AML_cell in synonyms:
            synonyms.insert(0, key)
            #print("matched by synonyms")
            print(synonyms)
            # append to list of synonyms
            AML_all_synonyms.append(synonyms)

['AML-193', 'AML-193', 'AML193']
['BDCM', 'BDCM', 'B-cell with DC Morphology']
['CESS', 'CESS', 'Cess']
['CMK-11-5', 'CMK-11-5', 'CMK 11-5', 'CMK11-5', 'CMK115']
['CMK-86', 'CMK-86', 'CMK86']
['GDM-1', 'GDM-1', 'GDM1']
['HD-MY-Z', 'HD-MY-Z', 'HD-MyZ', 'HDMYZ']
['HEL', 'HEL', 'Hel', 'GM06141', 'GM06141B', 'Human ErythroLeukemia']
['HEL-30', 'HEL-30', 'Hel-30', 'HEL/30', 'HEL30', 'HEL', 'c3H-Epidermal cell Line-30']
['HEL 92.1.7', 'HEL 92.1.7', 'HEL92.1.7', 'HEL-92.1.7', 'HEL-92-1-7', 'HEL-92_1_7', 'HEL-92', 'HEL9217']
['HL-60', 'HL-60', 'HL 60', 'HL.60', 'HL60', 'Human Leukemia-60']
['HNT-34', 'HNT-34', 'HNT34']
['Kasumi-1', 'Kasumi-1', 'KASUMI-1', 'Kasumi 1', 'KASUMI1', 'Kasumi1']
['Kasumi-6', 'Kasumi-6', 'KASUMI-6', 'KASUMI6']
['KG-1', 'KG-1', 'KG1']
['KG1', 'KG1', 'Kidney-Gli1']
['KMOE-2', 'KMOE-2', 'KMOE2', 'Kmoe2']
['KO52', 'KO52', 'K052', 'K-052']
['KY821', 'KY821', 'KY-821']
['M-07e', 'M-07e', 'M-07E', 'M-O7e', 'M07-e', 'M07e', 'Mo7e', 'MO7e', 'M07E', 'MO7E']
['ME-1 [Human leukem

In [159]:
# flatten list of lists
AML_all_synonyms_ = sum(AML_all_synonyms, [])
# remove duplicates
AML_all_synonyms_ = list(set(AML_all_synonyms_))

## Filter AML cell lines

In [146]:
AML_cell_lines_search = AML_all_synonyms_

In [140]:
# import dataset and extract cell types
assay_data = pd.read_csv("AML_assays_with_embeddings.csv")
unique_cells = assay_data["Cell Type"].unique().tolist()
unique_cells_ = [cell for cell in unique_cells if isinstance(cell, str)]
unique_cells_.sort()

In [171]:
# overlap of chEMBL cell types and cellosaurus synonyms for AML cell lines from DepMap
AML_cell_types = list(set(unique_cells_) & set(AML_all_synonyms_))
AML_cell_types

In [175]:
pd.DataFrame(AML_cell_types).to_csv("AML_cell_types.csv",index=False)