# Map `.cram` files info to treatment

### 1. Map `.cram` file names to treatment

Read the file that maps each file name to its sample accession ID:

In [1]:
# Build {file-name stem: [sample accession ID]} from the whitespace-separated map file.
with open("../../body/1raw/EGAD00001004583/Sample_File.map") as f:
    mapped_IDs = {}
    for line in f:
        # Skip blank lines (e.g. an empty line at the end of the file);
        # unpacking them into four names would raise ValueError.
        if not line.strip():
            continue
        # Each record has four fields; only the 2nd (sample accession ID) and
        # the 3rd (file name) are used.
        _, sample, file, _ = line.split()
        # Key on the part of the file name before the first "."
        mapped_IDs[file.split(".")[0]] = [sample]

In [2]:
# Show the mapping built so far: file-name stem -> [sample accession ID]
mapped_IDs

{'20619_4#1': ['EGAN00001437315'],
 '16223_3': ['EGAN00001437316'],
 '16223_4': ['EGAN00001437317'],
 '16223_5': ['EGAN00001437318'],
 '18058_1': ['EGAN00001437318'],
 '16224_8': ['EGAN00001437319'],
 '21850_7': ['EGAN00001938908'],
 '21826_5': ['EGAN00001938909'],
 '21826_6': ['EGAN00001938911'],
 '21823_7': ['EGAN00001938912'],
 '21822_7': ['EGAN00001938914'],
 '21822_8': ['EGAN00001938915'],
 '21866_8': ['EGAN00001938917'],
 '21781_8#1': ['EGAN00001938918'],
 '21850_8': ['EGAN00001938920'],
 '21826_7': ['EGAN00001938921'],
 '23031_3#3': ['EGAN00001938922'],
 '23031_4#3': ['EGAN00001938922'],
 '23031_5#3': ['EGAN00001938922'],
 '21826_8': ['EGAN00001938923'],
 '21823_8': ['EGAN00001938924'],
 '23125_5#3': ['EGAN00001938925'],
 '23125_6#3': ['EGAN00001938925'],
 '23125_7#3': ['EGAN00001938925'],
 '23125_8#3': ['EGAN00001938925'],
 '19041_6#1': ['EGAN00001437404'],
 '19058_7#1': ['EGAN00001437404'],
 '19058_8#1': ['EGAN00001437404'],
 '19190_6#1': ['EGAN00001437404'],
 '19041_6#2': ['E

For each pair `(file name, accession ID)`, find the corresponding `.xml` file and add the sample ID to the list. The sample ID is split into the sample ID itself and the subsample ID.

In [12]:
# Extend every entry with the sample ID parts taken from the <TITLE> line(s) of
# the sample XML file named after its accession ID.
for file, sample in mapped_IDs.items():
    accession = sample[0]
    title_parts = []
    with open('../../body/1raw/EGAD00001004583/samples/{}.sample.xml'.format(accession)) as f:
        for line in f:
            if line.strip().startswith("<TITLE>"):
                ID = line.replace("<TITLE>", "").replace("</TITLE>", "")
                # The title text is split on "_" (sample ID and, when present,
                # subsample ID).
                title_parts += ID.strip().split("_")
    # Rebuild the entry from the accession ID instead of appending to the
    # current list, so re-running this cell does not add the title parts twice.
    # Re-assigning an existing key is safe while iterating over the dict.
    mapped_IDs[file] = [accession] + title_parts

In [14]:
# Inspect the mapping after the <TITLE> parts were added to each entry
mapped_IDs

{'20619_4#1': ['EGAN00001437315', 'MSM0.11', 's1'],
 '16223_3': ['EGAN00001437316', 'MSM0.11', 's2'],
 '16223_4': ['EGAN00001437317', 'MSM0.11', 's3'],
 '16223_5': ['EGAN00001437318', 'MSM0.4', 's1'],
 '18058_1': ['EGAN00001437318', 'MSM0.4', 's1'],
 '16224_8': ['EGAN00001437319', 'MSM0.4', 's2'],
 '21850_7': ['EGAN00001938908', 'MSM0.86', 's1'],
 '21826_5': ['EGAN00001938909', 'MSM0.86', 's2'],
 '21826_6': ['EGAN00001938911', 'MSM0.82', 's1'],
 '21823_7': ['EGAN00001938912', 'MSM0.82', 's2'],
 '21822_7': ['EGAN00001938914', 'MSM0.99', 's1'],
 '21822_8': ['EGAN00001938915', 'MSM0.99', 's3'],
 '21866_8': ['EGAN00001938917', 'MSM0.109', 's1'],
 '21781_8#1': ['EGAN00001938918', 'MSM0.109', 's2'],
 '21850_8': ['EGAN00001938920', 'MSM0.92', 's1'],
 '21826_7': ['EGAN00001938921', 'MSM0.92', 's2'],
 '23031_3#3': ['EGAN00001938922', 'MSM0.92', 's3'],
 '23031_4#3': ['EGAN00001938922', 'MSM0.92', 's3'],
 '23031_5#3': ['EGAN00001938922', 'MSM0.92', 's3'],
 '21826_8': ['EGAN00001938923', 'MSM0.95'

For files without subsamples add '-' as the last element of the list:

In [15]:
# Entries that ended up with fewer than three elements get "-" appended as a
# placeholder last element.
for fields in mapped_IDs.values():
    if len(fields) >= 3:
        continue
    fields.append("-")

In [16]:
# Inspect the mapping after "-" was appended to entries with fewer than three elements
mapped_IDs

{'20619_4#1': ['EGAN00001437315', 'MSM0.11', 's1'],
 '16223_3': ['EGAN00001437316', 'MSM0.11', 's2'],
 '16223_4': ['EGAN00001437317', 'MSM0.11', 's3'],
 '16223_5': ['EGAN00001437318', 'MSM0.4', 's1'],
 '18058_1': ['EGAN00001437318', 'MSM0.4', 's1'],
 '16224_8': ['EGAN00001437319', 'MSM0.4', 's2'],
 '21850_7': ['EGAN00001938908', 'MSM0.86', 's1'],
 '21826_5': ['EGAN00001938909', 'MSM0.86', 's2'],
 '21826_6': ['EGAN00001938911', 'MSM0.82', 's1'],
 '21823_7': ['EGAN00001938912', 'MSM0.82', 's2'],
 '21822_7': ['EGAN00001938914', 'MSM0.99', 's1'],
 '21822_8': ['EGAN00001938915', 'MSM0.99', 's3'],
 '21866_8': ['EGAN00001938917', 'MSM0.109', 's1'],
 '21781_8#1': ['EGAN00001938918', 'MSM0.109', 's2'],
 '21850_8': ['EGAN00001938920', 'MSM0.92', 's1'],
 '21826_7': ['EGAN00001938921', 'MSM0.92', 's2'],
 '23031_3#3': ['EGAN00001938922', 'MSM0.92', 's3'],
 '23031_4#3': ['EGAN00001938922', 'MSM0.92', 's3'],
 '23031_5#3': ['EGAN00001938922', 'MSM0.92', 's3'],
 '21826_8': ['EGAN00001938923', 'MSM0.95'

Write to a `.csv` file: 

In [64]:
import csv

# Save the mapping as CSV: a header row, then one row per file name holding the
# file name followed by the elements of its list.
# NOTE(review): the header written here (filename, sample_accession, sample_id,
# sample_num) differs from the columns the later merge cell selects from this
# same path (accession, sample, subsample, treatment) — confirm which version
# of the file downstream cells expect.
with open("../../body/1raw/mapped_filenames.csv", "w", newline="") as f:
    # csv.writer quotes fields that contain commas or quotes; "\n" keeps the
    # line endings the hand-written version produced.
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["filename", "sample_accession", "sample_id", "sample_num"])
    for file, sample in mapped_IDs.items():
        writer.writerow([file] + sample)

### 2. Merge mapped file names with their info

In [18]:
import pandas as pd

Read the table with info for each `.cram` file:

In [19]:
# Load the `ls -l`-style listing of the .cram files; the file has no header row.
# Fields are split on runs of whitespace, so space-padded (aligned) columns do
# not turn into extra empty fields as they would with a single-space separator.
crams = pd.read_csv("../../body/1raw/Crams.txt", sep=r"\s+", header=None)
# Assigning the names afterwards fails loudly if the number of parsed columns
# is not exactly nine.
crams.columns = ['status', 'number', 'user', 'group', 'size', 'month', 'day', 'time', 'kostya_file']
crams.head()

Unnamed: 0,status,number,user,group,size,month,day,time,kostya_file
0,-rw-r--r--,1,popadin,gr-fe,1.7G,Sep,4,16:17,EGAF00002339775/22156_8#16.cram
1,-rw-r--r--,1,popadin,gr-fe,1.4G,Sep,4,16:17,EGAF00002339256/22210_2#12.cram
2,-rw-r--r--,1,popadin,gr-fe,1.8G,Sep,4,16:17,EGAF00002339954/22101_4#5.cram
3,-rw-r--r--,1,popadin,gr-fe,33G,Sep,4,16:17,EGAF00002339160/21898_6.cram
4,-rw-r--r--,1,popadin,gr-fe,5.9G,Sep,4,16:17,EGAF00002338479/23627_5#1.cram


Extract file names and add them as a new column:

In [20]:
# The pattern has three capture groups: directory prefix, file name stem and
# the ".cram" suffix. Only the middle group (column 1) is kept.
path_parts = crams['kostya_file'].str.extract(r'(.+/)(.+)(\.cram)')
crams['filename'] = path_parts[1]
crams.head()

Unnamed: 0,status,number,user,group,size,month,day,time,kostya_file,filename
0,-rw-r--r--,1,popadin,gr-fe,1.7G,Sep,4,16:17,EGAF00002339775/22156_8#16.cram,22156_8#16
1,-rw-r--r--,1,popadin,gr-fe,1.4G,Sep,4,16:17,EGAF00002339256/22210_2#12.cram,22210_2#12
2,-rw-r--r--,1,popadin,gr-fe,1.8G,Sep,4,16:17,EGAF00002339954/22101_4#5.cram,22101_4#5
3,-rw-r--r--,1,popadin,gr-fe,33G,Sep,4,16:17,EGAF00002339160/21898_6.cram,21898_6
4,-rw-r--r--,1,popadin,gr-fe,5.9G,Sep,4,16:17,EGAF00002338479/23627_5#1.cram,23627_5#1


Read the `.csv` file created above into a `pandas` dataframe:

In [21]:
# Load the file-name -> sample mapping table from CSV.
# NOTE(review): the output recorded for this cell has the columns filename,
# accession, sample, subsample, treatment, whereas the cell that writes this
# path emits the header filename,sample_accession,sample_id,sample_num —
# confirm which version of the file is expected here before re-running the
# notebook from top to bottom.
treatment = pd.read_csv("../../body/1raw/mapped_filenames.csv")
treatment.head()

Unnamed: 0,filename,accession,sample,subsample,treatment
0,16520_2#90,EGAN00001437324,MSM0.1,s1,Acetaldehyde (12 mM)
1,16521_2#90,EGAN00001437324,MSM0.1,s1,Acetaldehyde (12 mM)
2,16524_2#90,EGAN00001437324,MSM0.1,s1,Acetaldehyde (12 mM)
3,16525_2#90,EGAN00001437324,MSM0.1,s1,Acetaldehyde (12 mM)
4,16527_2#90,EGAN00001437324,MSM0.1,s1,Acetaldehyde (12 mM)


Merge the two dataframes on the `filename` column, keeping all rows from both tables (`how='outer'`), so that no row of `crams` is dropped:

In [22]:
# Columns kept in the merged table, in output order.
output_columns = ['kostya_file', 'filename', 'size', 'accession', 'sample', 'subsample', 'treatment']
# Outer join on the file name: rows from both tables are retained.
merged_df = crams.merge(treatment, how='outer', on='filename')[output_columns]
merged_df.head()

Unnamed: 0,kostya_file,filename,size,accession,sample,subsample,treatment
0,EGAF00002339775/22156_8#16.cram,22156_8#16,1.7G,EGAN00001938950,MSM0.105,s3,DMS (0.078 mM)
1,EGAF00002339256/22210_2#12.cram,22210_2#12,1.4G,EGAN00001437534,MSM0.68,s3,Glycidamide (475 uM)
2,EGAF00002339954/22101_4#5.cram,22101_4#5,1.8G,EGAN00001938961,MSM0.101,s1,Formaldehyde (85.5 uM)
3,EGAF00002339160/21898_6.cram,21898_6,33G,EGAN00001938938,MSM0.126,s2,Temozolomide (200 uM)
4,EGAF00002338479/23627_5#1.cram,23627_5#1,5.9G,EGAN00001939019,SIGmut6993676,-,NaCl Control (0.003%)


Write the resulting dataframe to a `.csv` file:

In [None]:
# Save the merged table as CSV; with to_csv defaults the row index is written
# as the first column.
merged_df.to_csv("../../body/1raw/treatment_mapped_to_kostya_files.csv")