# The goal of this notebook is to merge the gene expression differences with the drug information and embeddings.

In [51]:
! uv pip install boto3 cmapPy

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m2 packages[0m [2min 364ms[0m[0m


# Here we gather the data from our personal Google Drive.

In [52]:
import gzip
import os
import shutil

import pandas as pd
from cmapPy.pandasGEXpress import parse
from google.colab import drive
from google.colab import userdata

# Mount Google Drive so the LINCS files stored there are reachable from Colab.
drive.mount('/content/drive')

path_doc = '/content/drive/MyDrive/GSE70138_Broad_LINCS_sig_info_2017-03-06.txt'
gz_file_path = '/content/drive/MyDrive/GSE70138.gctx.gz'
gctx_file_path = '/content/drive/MyDrive/GSE70138.gctx'

# Decompress the archive only when the .gctx is not already on Drive. If neither
# the .gctx nor the .gz exists, gzip.open raises FileNotFoundError, which is the
# right outcome: nothing below can run without the expression matrix.
if os.path.exists(gctx_file_path):
    print("File already decompressed")
else:
    # Write to a temporary name and rename at the end so that an interrupted
    # run never leaves a truncated .gctx that a later run would pick up.
    partial_path = gctx_file_path + '.part'
    with gzip.open(gz_file_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_path, gctx_file_path)
    print("File decompressed successfully")

# Expression matrix as parsed by cmapPy; its transpose is later joined to the
# signature table on `sig_id`.
gctoo = parse.parse(gctx_file_path)
df = gctoo.data_df

# The signature-info file is read as gzip even though its name ends in .txt.
# Errors are reported and then re-raised: continuing without `doc_df` would
# only fail later (or silently reuse a stale `doc_df` from a previous run).
try:
    doc_df = pd.read_csv(path_doc, compression='gzip', sep='\t')
    print("Fichier d'information lu avec succès !")
except FileNotFoundError:
    print(f"Erreur: Fichier non trouvé à l'emplacement '{path_doc}'.")
    raise
except Exception as e:
    print(f"Une erreur est survenue lors de la lecture du fichier: {e}")
    raise

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File decompressed successfully
Fichier d'information lu avec succès !


# Here we merge the drug information onto the gene expression differences.

In [None]:
# Inner join: each signature row of `doc_df` is paired with its expression
# profile, looked up by `sig_id` in the index of the transposed matrix.
merged_df = doc_df.merge(
    df.T,
    how='inner',
    left_on='sig_id',
    right_index=True,
)

In [None]:
# Preview the first five merged rows.
merged_df.head(n=5)

Unnamed: 0,sig_id,pert_id,pert_iname,pert_type,cell_id,pert_idose,pert_itime,distil_id,780,7849,...,79874,100289678,6376,79716,11033,4034,399664,54869,90379,60
0,LJP005_A375_24H:A03,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A03|LJP005_A375_24H_X2_...,-0.154526,-1.16548,...,-0.558054,-0.56127,-1.142208,0.105998,-0.567277,0.346656,0.299755,-0.729163,0.500796,0.456211
1,LJP005_A375_24H:A04,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A04|LJP005_A375_24H_X2_...,0.113874,-0.883416,...,-1.125492,-0.615619,-1.209933,-0.619656,-0.595497,-0.441346,-0.362601,-0.795284,-0.174053,1.291636
2,LJP005_A375_24H:A05,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A05|LJP005_A375_24H_X2_...,-0.038252,-0.183913,...,-0.667727,-0.543937,0.405091,-0.260954,-0.042831,-0.197509,0.8449,0.584811,-0.092387,-0.000192
3,LJP005_A375_24H:A06,DMSO,DMSO,ctl_vehicle,A375,-666,24 h,LJP005_A375_24H_X1_B19:A06|LJP005_A375_24H_X2_...,0.466993,-0.584319,...,-1.175768,-0.386227,-0.118282,-0.642835,-0.235527,-0.579607,0.924895,0.616022,0.115633,0.404643
4,LJP005_A375_24H:A07,BRD-K76908866,CP-724714,trt_cp,A375,10.0 um,24 h,LJP005_A375_24H_X1_B19:A07|LJP005_A375_24H_X2_...,-1.462477,-1.140991,...,-0.317219,0.150348,-0.526254,-0.416583,-0.450565,-1.079166,-0.162815,0.965545,-0.028699,-1.077882


The embedding file is available in the GitHub repository; you have to add it to your Drive.

# Here we add the embeddings of the molecules to the previously merged dataset.

In [59]:
import numpy as np
import warnings
# Silence only pandas' PerformanceWarning (e.g. the DataFrame-fragmentation
# notice) rather than every warning, so that genuine problems such as
# deprecations or dtype issues stay visible.
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
path_embedding = '/content/drive/MyDrive/embbedings_with_molecule_names.csv'
embedding_df = pd.read_csv(path_embedding)
# Index by molecule name, then transpose so that each molecule becomes a column.
embedding_df = embedding_df.set_index('pert_iname').T

def vectoriser_drogues(df):
  """Pack every column of ``df`` into a single array-valued cell.

  Args:
    df: DataFrame in which each column holds the values of one vector.

  Returns:
    A one-row DataFrame (row label 0) with one column per distinct column
    label of ``df``; each cell is a ``np.ndarray`` holding that column's
    values in row order. If a label occurs several times in ``df``, the
    last occurrence is kept. An empty DataFrame is returned when ``df``
    has no columns.
  """
  # Columns are read by position: with duplicated labels ``df[label]`` is a
  # DataFrame, and iterating it would yield labels instead of values.
  vectors = {
      label: [np.array(list(df.iloc[:, position]))]
      for position, label in enumerate(df.columns)
  }
  # Build the frame in one call rather than inserting columns one at a time,
  # which fragments the frame and is slow for many columns.
  return pd.DataFrame(vectors)
# Turn the one-row table of vectors into a two-column table:
# one row per molecule, with its name and its embedding vector.
vector_embedding = (
    vectoriser_drogues(embedding_df)
    .T
    .reset_index()
    .set_axis(['pert_iname', 'embedding'], axis=1)
)

In [60]:
# Keep only the signatures whose molecule has an embedding (inner join on name).
final_dataset = merged_df.merge(
    vector_embedding,
    on='pert_iname',
    how='inner',
)

Only a few genes have actually been measured by the machine; the rest are inferred computationally. To know whether a gene is a landmark (i.e. it has been measured by the machine), we found a document containing this information. We therefore keep only the columns of the genes that were measured by the machine. The landmark file is available on GitHub.

In [62]:
def filtering(landmark_path, df, meta_columns=None):
  """Keep the metadata columns and the landmark-gene columns of ``df``.

  Args:
    landmark_path: Path to a tab-separated gene-info file with at least the
      columns ``pr_gene_id`` and ``pr_is_lm``.
    df: DataFrame containing the metadata columns and one column per gene,
      named by the string form of the gene id.
    meta_columns: Non-gene columns to keep, in output order. Defaults to the
      signature metadata columns plus ``embedding``.

  Returns:
    ``df`` restricted to ``meta_columns`` followed by the columns of the genes
    flagged ``pr_is_lm == 1``, in the order they appear in the gene-info file.

  Raises:
    KeyError: If a requested metadata or landmark column is absent from ``df``.
  """
  if meta_columns is None:
    meta_columns = ['sig_id', 'pert_id', 'pert_iname', 'pert_type', 'cell_id',
                    'pert_idose', 'pert_itime', 'distil_id', 'embedding']

  landmark = pd.read_csv(landmark_path, sep='\t')
  # Keep only the rows flagged as landmark genes.
  landmark = landmark.loc[landmark['pr_is_lm'] == 1, ['pr_gene_id', 'pr_is_lm']]
  print(landmark)

  # Gene columns of `df` are addressed by the string form of the gene id.
  liste_gene_ids = [str(gene_id) for gene_id in landmark['pr_gene_id']]

  wanted = list(meta_columns) + liste_gene_ids
  # Fail with an explicit message instead of pandas' generic "not in index".
  missing = [col for col in wanted if col not in df.columns]
  if missing:
    raise KeyError(f"Columns missing from df: {missing}")
  return df[wanted]
# Gene-info file that flags which genes are landmarks.
path_landmark = '/content/drive/MyDrive/GSE70138_Broad_LINCS_gene_info_2017-03-06.txt'
final_filtered = filtering(landmark_path=path_landmark, df=final_dataset)

       pr_gene_id  pr_is_lm
0             780         1
1            7849         1
25           6193         1
43             23         1
49           9552         1
...           ...       ...
12184        5467         1
12223        2767         1
12224       23038         1
12286       57048         1
12321       79716         1

[978 rows x 2 columns]


In [63]:
# Persist the filtered dataset to Drive in Parquet format.
final_filtered.to_parquet(
    path='/content/drive/MyDrive/filtered_final_dataset.parquet',
)