In [1]:
import pandas as pd
import subprocess
import re
import json
import watermark
import requests
import bs4

import glob

import scipy.stats as st
import numpy as np

from functools import reduce

In [2]:
probe_df = pd.read_csv('../../new_raw_data/GSE11300/GPL6759.ndf.gz', compression='gzip', sep='\t')

In [3]:
probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


In [4]:
len(probe_df)

392778

In [5]:
experimental_probe_df = probe_df.loc[probe_df['PROBE_CLASS']=='experimental']

In [6]:
seq_probe_dict = {s: p for p, s in zip(experimental_probe_df['PROBE_ID'].values, experimental_probe_df['SEQ_ID'].values)}

In [7]:
with open('./microarray_probe_alignment.sam', 'r') as f:
    lines = f.readlines()
    single_alignments = [l for l in lines if l.split()[-1] == 'NH:i:1']

In [8]:
align_dict = {}
for al in single_alignments:
    s = al.split()
    align_dict[s[0]] = s[2]

### Step 6: Load QC'ed data and assign the correct genes for each probe

In [9]:
qc_rma = pd.read_csv('../microarray_QC/QC_probe_rma_values.csv')
qc_rma.head()

Unnamed: 0.1,Unnamed: 0,C0_GSM285570.xys,C0_GSM656230.xys,C10_GSM285578.xys,C10_GSM285591.xys,C12_GSM285579.xys,C12_GSM285592.xys,C12_GSM656237.xys,C14_GSM285580.xys,C14_GSM285593.xys,...,S3_GSM285542.xys,S3_GSM285555.xys,S3_GSM285562.xys,S6_GSM285543.xys,S6_GSM285556.xys,S6_GSM285563.xys,S9_GSM285544.xys,S9_GSM285564.xys,S9_GSM647653.xys,S9_GSM647654.xys
0,TETRA00S0000001,10.830266,9.83693,5.805547,6.005496,6.202794,6.23738,6.314438,7.423571,7.507645,...,9.125377,8.647941,8.126324,10.720722,9.868662,7.614758,9.664811,8.535958,10.010385,10.104864
1,TETRA00S0000002,8.623682,5.570322,4.974599,5.436244,5.051285,5.577026,5.250233,4.974993,5.747498,...,7.443906,8.055853,5.551413,8.174586,6.869377,5.811808,6.470432,5.692521,7.72228,6.830825
2,TETRA00S0000003,6.035942,4.757383,4.82981,4.835038,5.185818,4.862489,4.651688,4.920573,4.636333,...,7.582175,6.712158,5.595456,9.59954,8.572935,5.34532,8.511582,5.509824,8.745682,8.708604
3,TETRA00S0000004,7.119374,6.687419,6.570743,6.682946,7.834773,7.978473,6.970125,8.211853,7.956452,...,7.107718,7.508923,7.805419,7.174651,6.685836,7.606569,7.336635,7.772938,6.703545,6.826793
4,TETRA00S0000005,11.386933,11.193811,11.229506,10.873188,10.801168,10.834801,11.130466,11.207738,11.009172,...,10.859625,11.047529,10.865832,10.806576,10.705724,10.943747,11.259886,11.069086,11.345458,11.425561


In [10]:
qc_rma['PROBE_ID'] = [seq_probe_dict[s] for s in qc_rma['Unnamed: 0'].values]

In [11]:
qc_rma['TTHERM_ID'] = [align_dict[p] if p in align_dict.keys() else 'NA' for p in qc_rma['PROBE_ID'].values]

In [12]:
aligned_qc_rma = qc_rma.loc[qc_rma['TTHERM_ID'] != 'NA']
len(aligned_qc_rma)

21415

In [13]:
len(qc_rma)

23997

In [14]:
aligned_qc_rma = aligned_qc_rma.rename(columns={c: c.split('.')[0] for c in aligned_qc_rma.columns})

In [15]:
aligned_qc_rma.head()

Unnamed: 0.1,Unnamed: 0,C0_GSM285570,C0_GSM656230,C10_GSM285578,C10_GSM285591,C12_GSM285579,C12_GSM285592,C12_GSM656237,C14_GSM285580,C14_GSM285593,...,S3_GSM285562,S6_GSM285543,S6_GSM285556,S6_GSM285563,S9_GSM285544,S9_GSM285564,S9_GSM647653,S9_GSM647654,PROBE_ID,TTHERM_ID
0,TETRA00S0000001,10.830266,9.83693,5.805547,6.005496,6.202794,6.23738,6.314438,7.423571,7.507645,...,8.126324,10.720722,9.868662,7.614758,9.664811,8.535958,10.010385,10.104864,TETRAP00000004,TTHERM_00000010
1,TETRA00S0000002,8.623682,5.570322,4.974599,5.436244,5.051285,5.577026,5.250233,4.974993,5.747498,...,5.551413,8.174586,6.869377,5.811808,6.470432,5.692521,7.72228,6.830825,TETRAP00000019,TTHERM_00000020
2,TETRA00S0000003,6.035942,4.757383,4.82981,4.835038,5.185818,4.862489,4.651688,4.920573,4.636333,...,5.595456,9.59954,8.572935,5.34532,8.511582,5.509824,8.745682,8.708604,TETRAP00000043,TTHERM_00000030
3,TETRA00S0000004,7.119374,6.687419,6.570743,6.682946,7.834773,7.978473,6.970125,8.211853,7.956452,...,7.805419,7.174651,6.685836,7.606569,7.336635,7.772938,6.703545,6.826793,TETRAP00000052,TTHERM_00000040
4,TETRA00S0000005,11.386933,11.193811,11.229506,10.873188,10.801168,10.834801,11.130466,11.207738,11.009172,...,10.865832,10.806576,10.705724,10.943747,11.259886,11.069086,11.345458,11.425561,TETRAP00000063,TTHERM_000000045


In [16]:
aligned_qc_rma.loc[aligned_qc_rma.duplicated(subset=['PROBE_ID'])]

Unnamed: 0.1,Unnamed: 0,C0_GSM285570,C0_GSM656230,C10_GSM285578,C10_GSM285591,C12_GSM285579,C12_GSM285592,C12_GSM656237,C14_GSM285580,C14_GSM285593,...,S3_GSM285562,S6_GSM285543,S6_GSM285556,S6_GSM285563,S9_GSM285544,S9_GSM285564,S9_GSM647653,S9_GSM647654,PROBE_ID,TTHERM_ID


All probes are unique!

In [17]:
list(aligned_qc_rma.columns[1:-2])

['C0_GSM285570',
 'C0_GSM656230',
 'C10_GSM285578',
 'C10_GSM285591',
 'C12_GSM285579',
 'C12_GSM285592',
 'C12_GSM656237',
 'C14_GSM285580',
 'C14_GSM285593',
 'C14_GSM656238',
 'C16_GSM285582',
 'C16_GSM285595',
 'C16_GSM656239',
 'C18_GSM285583',
 'C18_GSM285596',
 'C18_GSM656240',
 'C4_GSM285574',
 'C4_GSM285588',
 'C4_GSM656234',
 'C6_GSM285575',
 'C6_GSM656232',
 'C8_GSM285576',
 'C8_GSM285590',
 'Lh_GSM283691',
 'Lh_GSM284360',
 'Lh_GSM284364',
 'Ll_GSM283687',
 'Ll_GSM284355',
 'Ll_GSM284362',
 'Lm_GSM283690',
 'Lm_GSM284357',
 'Lm_GSM284363',
 'S0_GSM285363',
 'S0_GSM285554',
 'S0_GSM285561',
 'S0_GSM647651',
 'S0_GSM647652',
 'S15_GSM285559',
 'S15_GSM285566',
 'S24_GSM285547',
 'S24_GSM285560',
 'S3_GSM285542',
 'S3_GSM285555',
 'S3_GSM285562',
 'S6_GSM285543',
 'S6_GSM285556',
 'S6_GSM285563',
 'S9_GSM285544',
 'S9_GSM285564',
 'S9_GSM647653',
 'S9_GSM647654']

In [18]:
with open('../../new_raw_data/microarray_accessions_all.json', 'r') as f:
    # Including single REP measurement for C-15m (GSM656231) even though there are no replicates for it
    # in order to replicate the 2011 analysis
    # Format: keys are the physiological phase; values are the geo accessions for each microarray
    all_geo = json.load(f)

In [19]:
all_geo

{'Ll': ['GSM283687', 'GSM284355', 'GSM284362'],
 'Lm': ['GSM283690', 'GSM284357', 'GSM284363'],
 'Lh': ['GSM283691', 'GSM284360', 'GSM284364'],
 'S0': ['GSM285363',
  'GSM285554',
  'GSM285561',
  'GSM647244',
  'GSM647651',
  'GSM647652'],
 'S3': ['GSM285542', 'GSM285555', 'GSM285562'],
 'S6': ['GSM285543', 'GSM285556', 'GSM285563'],
 'S9': ['GSM285544', 'GSM285557', 'GSM285564', 'GSM647653', 'GSM647654'],
 'S12': ['GSM285545', 'GSM285558', 'GSM285565'],
 'S15': ['GSM285546', 'GSM285559', 'GSM285566'],
 'S24': ['GSM285547', 'GSM285560', 'GSM285567', 'GSM647245'],
 'C0': ['GSM285570', 'GSM285586', 'GSM656230'],
 'C15m': ['GSM656231'],
 'C2': ['GSM285572', 'GSM285587', 'GSM656233'],
 'C4': ['GSM285574', 'GSM285588', 'GSM656234'],
 'C6': ['GSM285575', 'GSM285589', 'GSM656232'],
 'C8': ['GSM285576', 'GSM285590', 'GSM656236'],
 'C10': ['GSM285578', 'GSM285591', 'GSM656235'],
 'C12': ['GSM285579', 'GSM285592', 'GSM656237'],
 'C14': ['GSM285580', 'GSM285593', 'GSM656238'],
 'C16': ['GSM28558

In [20]:
col_names = []

for key in all_geo:
    for val in all_geo[key]:
        col_names.append(f'{key}_{val}')

In [21]:
ordered_columns = [c for c in col_names if c in aligned_qc_rma.columns[1:-2]]

In [22]:
tidy_aligned_qc_rma_df = aligned_qc_rma[['TTHERM_ID'] + list(ordered_columns)]
tidy_aligned_qc_rma_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_00000010,5.066343,4.767264,5.010981,6.139047,4.619361,4.751761,5.81855,5.342529,5.48375,...,6.314438,7.423571,7.507645,7.417087,7.147801,7.74793,7.093641,7.672685,7.51129,6.890117
1,TTHERM_00000020,4.696881,4.638401,4.956299,6.942556,5.101252,4.730307,8.45769,4.526411,4.9083,...,5.250233,4.974993,5.747498,5.252167,5.210531,7.083187,5.252222,5.037613,5.495281,5.013987
2,TTHERM_00000030,4.654278,4.537105,4.928739,5.063991,4.584168,4.91188,5.935311,4.51947,4.757861,...,4.651688,4.920573,4.636333,4.883712,4.779395,4.744335,4.51314,4.838428,4.961475,4.65334
3,TTHERM_00000040,7.811975,7.646583,7.445984,7.874682,7.614192,7.439959,7.162609,7.429543,7.21277,...,6.970125,8.211853,7.956452,7.436405,7.208571,7.504531,6.901063,7.587402,7.378368,7.177293
4,TTHERM_000000045,9.633489,9.977124,10.027529,9.720665,9.605762,10.225542,10.279608,10.459966,10.693337,...,11.130466,11.207738,11.009172,10.615417,11.038938,11.009222,10.216348,11.099187,11.172276,10.561021


### Step 7: Correct the .ndf file with correct gene assignments (never actually use this for anything)

In [23]:
ttids = []
for probe_id in experimental_probe_df['PROBE_ID'].values:
    try:
        ttid = align_dict[probe_id]
        ttids.append(ttid)
    except:
        ttids.append('NA')

In [24]:
corrected_probe_ndf = probe_df.loc[probe_df['PROBE_CLASS'] == 'experimental']
corrected_probe_ndf['SEQ_ID'] = ttids
corrected_probe_ndf.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  corrected_probe_ndf['SEQ_ID'] = ttids


Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TTHERM_00709600,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TTHERM_00529480,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TTHERM_00002620,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TTHERM_01013320,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


In [25]:
corrected_probe_ndf = corrected_probe_ndf.loc[corrected_probe_ndf['SEQ_ID'] != 'NA']
len(corrected_probe_ndf)

297488

In [26]:
corrected_probe_ndf.to_csv('./2021_Tetrahymena_expr_corrected.ndf', index=False, sep='\t')

### Step 8: Get geometric means for genes that are probes multiple times as save dataframes for different phase groupings

In [27]:
tidy_aligned_qc_rma_df.loc[tidy_aligned_qc_rma_df.duplicated(subset=['TTHERM_ID'])]

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
20,TTHERM_000001243,11.016431,10.576109,11.173800,11.557930,10.874785,11.009191,11.579152,10.237133,11.100503,...,11.474562,11.976322,12.463951,11.953201,11.866761,12.216943,10.672487,11.850012,12.033058,11.565778
36,TTHERM_00001480,4.700491,4.667979,4.711559,4.965456,4.970090,4.744194,4.889514,4.570671,4.540975,...,5.703505,4.973763,4.866402,4.896428,5.165352,5.097879,5.367105,5.597338,5.377334,5.712692
38,TTHERM_000001490,5.614544,5.375390,4.966231,4.958224,5.198247,5.204828,6.051168,4.867997,5.422448,...,4.856101,5.227113,5.038306,5.330369,5.459958,5.343990,5.749558,5.495174,5.372136,5.926892
40,TTHERM_000001490,5.010986,4.956277,5.061556,4.780270,5.048285,4.927103,5.273838,4.521668,5.040086,...,4.651895,4.663449,4.740052,4.847667,4.919039,4.444621,4.991735,5.038364,4.817430,4.780232
41,TTHERM_000001490,6.374212,6.427538,5.387405,5.018113,6.071937,5.785633,7.171887,5.317020,5.407567,...,6.005697,5.610994,5.037020,6.605437,5.855979,5.013803,7.170814,5.363476,5.552600,6.953270
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23990,TTHERM_01165210,5.082752,5.450693,5.124898,5.332384,5.938026,5.191738,5.294790,4.774181,4.490938,...,4.659965,4.966076,4.455510,5.420018,4.571970,4.685785,4.818078,5.550386,4.539166,4.897801
23991,TTHERM_01165210,4.843806,5.624439,5.008276,5.299802,5.307850,5.063217,5.625071,4.808601,4.646916,...,4.713349,5.020553,4.612872,5.611077,4.539242,4.588483,5.020234,5.375437,4.643127,5.029853
23992,TTHERM_00648600,4.627832,4.896337,4.998963,4.676617,4.640689,5.172536,4.986456,4.772653,4.542141,...,4.485875,4.617993,4.664032,5.092341,4.641734,4.382280,4.707127,4.560315,4.843762,4.578259
23993,TTHERM_01165210,5.153146,5.888775,4.977209,4.945611,5.408725,4.574284,6.026624,4.775564,4.569627,...,4.935414,4.977147,4.558275,5.847145,4.504703,4.851697,4.574623,5.245583,4.499663,4.857914


In [28]:
aggregated_tidy_aligned_qc_rma_df = tidy_aligned_qc_rma_df.groupby('TTHERM_ID').aggregate(st.mstats.gmean).reset_index()

In [29]:
len(aggregated_tidy_aligned_qc_rma_df)

20038

### NO DATA LOSS ALIGNMENT METHOD

DESERIALIZE THE COMPLETE TETRA_ID TO TTHERM_IDS LIST DICT

In [30]:
import pickle

with open('tetra_ttherm_dict.pkl', 'rb') as file:
    tetra_ttherm_dict = pickle.load(file)
tetra_ttherm_dict

{'TETRA00S0021925': ['TTHERM_00709600',
  'TTHERM_000709619',
  'TTHERM_000709599'],
 'TETRA00S0012676': ['TTHERM_00529480'],
 'TETRA00S0000062': ['TTHERM_00002620'],
 'TETRA00S0006635': ['TTHERM_01013320'],
 'TETRA00S0018517': ['TTHERM_00455220'],
 'TETRA00S0002596': ['TTHERM_00794640'],
 'TETRA00S0005684': ['TTHERM_00966510', 'TTHERM_00966520'],
 'TETRA00S0021711': ['TTHERM_00841340'],
 'TETRA00S0010741': ['TTHERM_01287980'],
 'TETRA00S0010465': ['TTHERM_00312280'],
 'TETRA00S0016037': ['TTHERM_00147630'],
 'TETRA00S0024690': ['TTHERM_00089220'],
 'TETRA00S0013085': ['TTHERM_00471910'],
 'TETRA00S0005204': ['TTHERM_00209370'],
 'TETRA00S0020375': ['TTHERM_00586800'],
 'TETRA00S0008376': ['TTHERM_01084410'],
 'TETRA00S0001799': ['TTHERM_00758920'],
 'TETRA00S0010349': ['TTHERM_000052309'],
 'TETRA00S0024507': ['TTHERM_00083400'],
 'TETRA00S0017259': ['TTHERM_00329780'],
 'TETRA00S0003819': ['TTHERM_00859240'],
 'TETRA00S0015667': ['TTHERM_00127230'],
 'TETRA00S0000421': ['TTHERM_00001

All TETRA IDS ARE ACCOUNTED FOR, AS THE NUMBER OF TETRA IDS IN THE DF AND IN THE DICT ARE THE SAME

In [31]:
len(tetra_ttherm_dict.keys())

23997

In [32]:
tetra_ids = qc_rma.loc[:, 'Unnamed: 0'].values
tetra_ids[0:3]

array(['TETRA00S0000001', 'TETRA00S0000002', 'TETRA00S0000003'],
      dtype=object)

In [33]:
len(tetra_ids)

23997

CREATE A TETRA ID TO TTHERM ID DICTIONARY WITH TETRA IDS THAT MAP TO ONE TTHERM ID ONLY

In [34]:
master_tetra_ttherm_dict = {}
multiple_mappings = []

for tetra_id in tetra_ttherm_dict.keys():
    if len(tetra_ttherm_dict[tetra_id]) == 1:
        master_tetra_ttherm_dict[tetra_id] = tetra_ttherm_dict[tetra_id][0]
    elif len(tetra_ttherm_dict[tetra_id]) > 1:
        multiple_mappings.append(tetra_id)
    elif len(tetra_ttherm_dict[tetra_id]) < 1:
        print(F'NO TTHERM IDS FOR {tetra_id}')

In [35]:
len(multiple_mappings)

2033

In [36]:
len(master_tetra_ttherm_dict)

21964

In [37]:
qc_rma_signal_data_df = pd.read_csv('../microarray_QC/QC_probe_rma_values.csv')
qc_rma_signal_data_df.head()

Unnamed: 0.1,Unnamed: 0,C0_GSM285570.xys,C0_GSM656230.xys,C10_GSM285578.xys,C10_GSM285591.xys,C12_GSM285579.xys,C12_GSM285592.xys,C12_GSM656237.xys,C14_GSM285580.xys,C14_GSM285593.xys,...,S3_GSM285542.xys,S3_GSM285555.xys,S3_GSM285562.xys,S6_GSM285543.xys,S6_GSM285556.xys,S6_GSM285563.xys,S9_GSM285544.xys,S9_GSM285564.xys,S9_GSM647653.xys,S9_GSM647654.xys
0,TETRA00S0000001,10.830266,9.83693,5.805547,6.005496,6.202794,6.23738,6.314438,7.423571,7.507645,...,9.125377,8.647941,8.126324,10.720722,9.868662,7.614758,9.664811,8.535958,10.010385,10.104864
1,TETRA00S0000002,8.623682,5.570322,4.974599,5.436244,5.051285,5.577026,5.250233,4.974993,5.747498,...,7.443906,8.055853,5.551413,8.174586,6.869377,5.811808,6.470432,5.692521,7.72228,6.830825
2,TETRA00S0000003,6.035942,4.757383,4.82981,4.835038,5.185818,4.862489,4.651688,4.920573,4.636333,...,7.582175,6.712158,5.595456,9.59954,8.572935,5.34532,8.511582,5.509824,8.745682,8.708604
3,TETRA00S0000004,7.119374,6.687419,6.570743,6.682946,7.834773,7.978473,6.970125,8.211853,7.956452,...,7.107718,7.508923,7.805419,7.174651,6.685836,7.606569,7.336635,7.772938,6.703545,6.826793
4,TETRA00S0000005,11.386933,11.193811,11.229506,10.873188,10.801168,10.834801,11.130466,11.207738,11.009172,...,10.859625,11.047529,10.865832,10.806576,10.705724,10.943747,11.259886,11.069086,11.345458,11.425561


In [38]:
len(qc_rma_signal_data_df['Unnamed: 0'].values)

23997

In [39]:
len(tetra_ttherm_dict)

23997

MAP THE TETRA IDS WITH ONLY ONE TTHERM ID MAPPING TO THEIR TTHERM IDS

In [40]:
qc_rma_signal_data_df['TTHERM_ID'] = [tetra_ttherm_dict[tetra_id][0] if tetra_id not in multiple_mappings else 'MULTIPLE MAPPINGS' for tetra_id in qc_rma_signal_data_df['Unnamed: 0'].values]

In [41]:
qc_rma_signal_data_df.head()

Unnamed: 0.1,Unnamed: 0,C0_GSM285570.xys,C0_GSM656230.xys,C10_GSM285578.xys,C10_GSM285591.xys,C12_GSM285579.xys,C12_GSM285592.xys,C12_GSM656237.xys,C14_GSM285580.xys,C14_GSM285593.xys,...,S3_GSM285555.xys,S3_GSM285562.xys,S6_GSM285543.xys,S6_GSM285556.xys,S6_GSM285563.xys,S9_GSM285544.xys,S9_GSM285564.xys,S9_GSM647653.xys,S9_GSM647654.xys,TTHERM_ID
0,TETRA00S0000001,10.830266,9.83693,5.805547,6.005496,6.202794,6.23738,6.314438,7.423571,7.507645,...,8.647941,8.126324,10.720722,9.868662,7.614758,9.664811,8.535958,10.010385,10.104864,TTHERM_00000010
1,TETRA00S0000002,8.623682,5.570322,4.974599,5.436244,5.051285,5.577026,5.250233,4.974993,5.747498,...,8.055853,5.551413,8.174586,6.869377,5.811808,6.470432,5.692521,7.72228,6.830825,TTHERM_00000020
2,TETRA00S0000003,6.035942,4.757383,4.82981,4.835038,5.185818,4.862489,4.651688,4.920573,4.636333,...,6.712158,5.595456,9.59954,8.572935,5.34532,8.511582,5.509824,8.745682,8.708604,TTHERM_00000030
3,TETRA00S0000004,7.119374,6.687419,6.570743,6.682946,7.834773,7.978473,6.970125,8.211853,7.956452,...,7.508923,7.805419,7.174651,6.685836,7.606569,7.336635,7.772938,6.703545,6.826793,MULTIPLE MAPPINGS
4,TETRA00S0000005,11.386933,11.193811,11.229506,10.873188,10.801168,10.834801,11.130466,11.207738,11.009172,...,11.047529,10.865832,10.806576,10.705724,10.943747,11.259886,11.069086,11.345458,11.425561,TTHERM_000000045


In [42]:
cleaned_qc_rma_signal_data_df = qc_rma_signal_data_df.loc[qc_rma_signal_data_df['TTHERM_ID'] != 'MULTIPLE MAPPINGS']
len(cleaned_qc_rma_signal_data_df)

21964

In [43]:
ordered_columns

['Ll_GSM283687',
 'Ll_GSM284355',
 'Ll_GSM284362',
 'Lm_GSM283690',
 'Lm_GSM284357',
 'Lm_GSM284363',
 'Lh_GSM283691',
 'Lh_GSM284360',
 'Lh_GSM284364',
 'S0_GSM285363',
 'S0_GSM285554',
 'S0_GSM285561',
 'S0_GSM647651',
 'S0_GSM647652',
 'S3_GSM285542',
 'S3_GSM285555',
 'S3_GSM285562',
 'S6_GSM285543',
 'S6_GSM285556',
 'S6_GSM285563',
 'S9_GSM285544',
 'S9_GSM285564',
 'S9_GSM647653',
 'S9_GSM647654',
 'S15_GSM285559',
 'S15_GSM285566',
 'S24_GSM285547',
 'S24_GSM285560',
 'C0_GSM285570',
 'C0_GSM656230',
 'C4_GSM285574',
 'C4_GSM285588',
 'C4_GSM656234',
 'C6_GSM285575',
 'C6_GSM656232',
 'C8_GSM285576',
 'C8_GSM285590',
 'C10_GSM285578',
 'C10_GSM285591',
 'C12_GSM285579',
 'C12_GSM285592',
 'C12_GSM656237',
 'C14_GSM285580',
 'C14_GSM285593',
 'C14_GSM656238',
 'C16_GSM285582',
 'C16_GSM285595',
 'C16_GSM656239',
 'C18_GSM285583',
 'C18_GSM285596',
 'C18_GSM656240']

In [44]:
modified_ordered_columns = [col_name+'.xys' for col_name in ordered_columns]

In [45]:
column_list = list(qc_rma_signal_data_df.columns)

In [46]:
tidy_column_order_list = ['TTHERM_ID'] + modified_ordered_columns
tidy_column_order_list

['TTHERM_ID',
 'Ll_GSM283687.xys',
 'Ll_GSM284355.xys',
 'Ll_GSM284362.xys',
 'Lm_GSM283690.xys',
 'Lm_GSM284357.xys',
 'Lm_GSM284363.xys',
 'Lh_GSM283691.xys',
 'Lh_GSM284360.xys',
 'Lh_GSM284364.xys',
 'S0_GSM285363.xys',
 'S0_GSM285554.xys',
 'S0_GSM285561.xys',
 'S0_GSM647651.xys',
 'S0_GSM647652.xys',
 'S3_GSM285542.xys',
 'S3_GSM285555.xys',
 'S3_GSM285562.xys',
 'S6_GSM285543.xys',
 'S6_GSM285556.xys',
 'S6_GSM285563.xys',
 'S9_GSM285544.xys',
 'S9_GSM285564.xys',
 'S9_GSM647653.xys',
 'S9_GSM647654.xys',
 'S15_GSM285559.xys',
 'S15_GSM285566.xys',
 'S24_GSM285547.xys',
 'S24_GSM285560.xys',
 'C0_GSM285570.xys',
 'C0_GSM656230.xys',
 'C4_GSM285574.xys',
 'C4_GSM285588.xys',
 'C4_GSM656234.xys',
 'C6_GSM285575.xys',
 'C6_GSM656232.xys',
 'C8_GSM285576.xys',
 'C8_GSM285590.xys',
 'C10_GSM285578.xys',
 'C10_GSM285591.xys',
 'C12_GSM285579.xys',
 'C12_GSM285592.xys',
 'C12_GSM656237.xys',
 'C14_GSM285580.xys',
 'C14_GSM285593.xys',
 'C14_GSM656238.xys',
 'C16_GSM285582.xys',
 'C16_G

In [47]:
len(tidy_column_order_list)

52

In [48]:
len(list(qc_rma_signal_data_df.columns)) # one more column because conatins both tetra and ttherm ids now

53

In [49]:
tidy_cleaned_qc_rma_signal_data_df = cleaned_qc_rma_signal_data_df.loc[:, tidy_column_order_list]
tidy_cleaned_qc_rma_signal_data_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687.xys,Ll_GSM284355.xys,Ll_GSM284362.xys,Lm_GSM283690.xys,Lm_GSM284357.xys,Lm_GSM284363.xys,Lh_GSM283691.xys,Lh_GSM284360.xys,Lh_GSM284364.xys,...,C12_GSM656237.xys,C14_GSM285580.xys,C14_GSM285593.xys,C14_GSM656238.xys,C16_GSM285582.xys,C16_GSM285595.xys,C16_GSM656239.xys,C18_GSM285583.xys,C18_GSM285596.xys,C18_GSM656240.xys
0,TTHERM_00000010,5.066343,4.767264,5.010981,6.139047,4.619361,4.751761,5.81855,5.342529,5.48375,...,6.314438,7.423571,7.507645,7.417087,7.147801,7.74793,7.093641,7.672685,7.51129,6.890117
1,TTHERM_00000020,4.696881,4.638401,4.956299,6.942556,5.101252,4.730307,8.45769,4.526411,4.9083,...,5.250233,4.974993,5.747498,5.252167,5.210531,7.083187,5.252222,5.037613,5.495281,5.013987
2,TTHERM_00000030,4.654278,4.537105,4.928739,5.063991,4.584168,4.91188,5.935311,4.51947,4.757861,...,4.651688,4.920573,4.636333,4.883712,4.779395,4.744335,4.51314,4.838428,4.961475,4.65334
4,TTHERM_000000045,9.633489,9.977124,10.027529,9.720665,9.605762,10.225542,10.279608,10.459966,10.693337,...,11.130466,11.207738,11.009172,10.615417,11.038938,11.009222,10.216348,11.099187,11.172276,10.561021
5,TTHERM_00000062,4.674796,4.407296,4.803447,4.820582,4.334627,4.402841,4.734502,4.786774,4.825016,...,4.645308,4.48239,4.644225,4.534301,4.656172,4.528732,4.5469,4.517456,4.509302,4.440268


In [50]:
tidy_cleaned_qc_rma_signal_data_df = tidy_cleaned_qc_rma_signal_data_df.rename(columns={c: c.split('.')[0] for c in tidy_cleaned_qc_rma_signal_data_df.columns})
tidy_cleaned_qc_rma_signal_data_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_00000010,5.066343,4.767264,5.010981,6.139047,4.619361,4.751761,5.81855,5.342529,5.48375,...,6.314438,7.423571,7.507645,7.417087,7.147801,7.74793,7.093641,7.672685,7.51129,6.890117
1,TTHERM_00000020,4.696881,4.638401,4.956299,6.942556,5.101252,4.730307,8.45769,4.526411,4.9083,...,5.250233,4.974993,5.747498,5.252167,5.210531,7.083187,5.252222,5.037613,5.495281,5.013987
2,TTHERM_00000030,4.654278,4.537105,4.928739,5.063991,4.584168,4.91188,5.935311,4.51947,4.757861,...,4.651688,4.920573,4.636333,4.883712,4.779395,4.744335,4.51314,4.838428,4.961475,4.65334
4,TTHERM_000000045,9.633489,9.977124,10.027529,9.720665,9.605762,10.225542,10.279608,10.459966,10.693337,...,11.130466,11.207738,11.009172,10.615417,11.038938,11.009222,10.216348,11.099187,11.172276,10.561021
5,TTHERM_00000062,4.674796,4.407296,4.803447,4.820582,4.334627,4.402841,4.734502,4.786774,4.825016,...,4.645308,4.48239,4.644225,4.534301,4.656172,4.528732,4.5469,4.517456,4.509302,4.440268


In [51]:
tidy_cleaned_qc_rma_signal_data_df.shape

(21964, 52)

TAKE THE GEOMETRIC MEAN OF THE SIGNAL VALUES OF DUPLICATED TTHERMS

In [52]:
tidy_cleaned_qc_rma_signal_data_df.loc[tidy_cleaned_qc_rma_signal_data_df.duplicated(subset=['TTHERM_ID'])]

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
36,TTHERM_00001480,4.700491,4.667979,4.711559,4.965456,4.970090,4.744194,4.889514,4.570671,4.540975,...,5.703505,4.973763,4.866402,4.896428,5.165352,5.097879,5.367105,5.597338,5.377334,5.712692
38,TTHERM_000001490,5.614544,5.375390,4.966231,4.958224,5.198247,5.204828,6.051168,4.867997,5.422448,...,4.856101,5.227113,5.038306,5.330369,5.459958,5.343990,5.749558,5.495174,5.372136,5.926892
40,TTHERM_000001490,5.010986,4.956277,5.061556,4.780270,5.048285,4.927103,5.273838,4.521668,5.040086,...,4.651895,4.663449,4.740052,4.847667,4.919039,4.444621,4.991735,5.038364,4.817430,4.780232
41,TTHERM_000001490,6.374212,6.427538,5.387405,5.018113,6.071937,5.785633,7.171887,5.317020,5.407567,...,6.005697,5.610994,5.037020,6.605437,5.855979,5.013803,7.170814,5.363476,5.552600,6.953270
43,TTHERM_000001501,5.365149,5.457097,5.356373,4.985974,6.198803,5.591974,6.393816,5.573241,4.759123,...,5.438921,6.108846,4.856709,6.153831,4.829986,5.787445,6.411224,5.919497,5.773846,6.581112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23992,TTHERM_00648600,4.627832,4.896337,4.998963,4.676617,4.640689,5.172536,4.986456,4.772653,4.542141,...,4.485875,4.617993,4.664032,5.092341,4.641734,4.382280,4.707127,4.560315,4.843762,4.578259
23993,TTHERM_01165210,5.153146,5.888775,4.977209,4.945611,5.408725,4.574284,6.026624,4.775564,4.569627,...,4.935414,4.977147,4.558275,5.847145,4.504703,4.851697,4.574623,5.245583,4.499663,4.857914
23994,TTHERM_00673360,5.688543,5.696582,4.727555,5.669193,5.468421,4.628962,6.440657,4.642310,4.707874,...,4.644603,5.156057,4.725613,5.258865,4.886414,4.623322,4.808239,6.444813,4.823808,4.759183
23995,TTHERM_00673360,5.671534,5.422774,4.915194,5.256722,5.404735,4.569096,6.332531,4.852606,4.985492,...,4.794583,4.805228,4.451050,5.557238,5.226020,4.592799,4.818323,6.264985,4.697243,4.984032


In [53]:
aggregated_tidy_cleaned_qc_rma_signal_data_df = tidy_cleaned_qc_rma_signal_data_df.groupby('TTHERM_ID').aggregate(st.mstats.gmean).reset_index()
aggregated_tidy_cleaned_qc_rma_signal_data_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_000000045,9.633489,9.977124,10.027529,9.720665,9.605762,10.225542,10.279608,10.459966,10.693337,...,11.130466,11.207738,11.009172,10.615417,11.038938,11.009222,10.216348,11.099187,11.172276,10.561021
1,TTHERM_00000010,5.066343,4.767264,5.010981,6.139047,4.619361,4.751761,5.81855,5.342529,5.48375,...,6.314438,7.423571,7.507645,7.417087,7.147801,7.74793,7.093641,7.672685,7.51129,6.890117
2,TTHERM_00000020,4.696881,4.638401,4.956299,6.942556,5.101252,4.730307,8.45769,4.526411,4.9083,...,5.250233,4.974993,5.747498,5.252167,5.210531,7.083187,5.252222,5.037613,5.495281,5.013987
3,TTHERM_00000030,4.654278,4.537105,4.928739,5.063991,4.584168,4.91188,5.935311,4.51947,4.757861,...,4.651688,4.920573,4.636333,4.883712,4.779395,4.744335,4.51314,4.838428,4.961475,4.65334
4,TTHERM_00000062,4.674796,4.407296,4.803447,4.820582,4.334627,4.402841,4.734502,4.786774,4.825016,...,4.645308,4.48239,4.644225,4.534301,4.656172,4.528732,4.5469,4.517456,4.509302,4.440268


In [54]:
aggregated_tidy_cleaned_qc_rma_signal_data_df.shape

(20251, 52)

### ^NO DATA LOSS ALIGNMENT METHOD^

### MAPPING METHOD ANALYSIS

In [55]:
no_data_loss_unique_genes = []
data_loss_unique_genes = []

no_data_loss_genes = list(aggregated_tidy_cleaned_qc_rma_signal_data_df['TTHERM_ID'].values)
data_loss_genes = list(aggregated_tidy_aligned_qc_rma_df['TTHERM_ID'].values)

for gene in no_data_loss_genes:
    if gene not in data_loss_genes:
        no_data_loss_unique_genes.append(gene)
        
for gene in data_loss_genes:
    if gene not in no_data_loss_genes:
        data_loss_unique_genes.append(gene)
        
different_genes = no_data_loss_unique_genes + data_loss_unique_genes

In [56]:
len(different_genes)

2903

In [57]:
len(no_data_loss_unique_genes)

1558

In [58]:
len(data_loss_unique_genes)

1345

### ^MAPPING METHOD ANALYSIS^

In [59]:
growth = ['Ll', 'Lm', 'Lh']
starvation = ['S0', 'S3', 'S6', 'S9', 'S12', 'S15', 'S24']
conjugation = ['C0', 'C15m', 'C2', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14', 'C16', 'C18']
vegetative = growth + starvation

grow_cols = [c for c in aggregated_tidy_cleaned_qc_rma_signal_data_df.columns if c.split('_')[0] in growth]
starve_cols = [c for c in aggregated_tidy_cleaned_qc_rma_signal_data_df.columns if c.split('_')[0] in starvation]
sex_cols = [c for c in aggregated_tidy_cleaned_qc_rma_signal_data_df.columns if c.split('_')[0] in conjugation]
veg_cols = [c for c in aggregated_tidy_cleaned_qc_rma_signal_data_df.columns if c.split('_')[0] in vegetative]

In [60]:
aggregated_tidy_cleaned_qc_rma_signal_data_df_grow = aggregated_tidy_cleaned_qc_rma_signal_data_df[['TTHERM_ID'] + grow_cols]
aggregated_tidy_cleaned_qc_rma_signal_data_df_starve = aggregated_tidy_cleaned_qc_rma_signal_data_df[['TTHERM_ID'] + starve_cols]
aggregated_tidy_cleaned_qc_rma_signal_data_df_sex = aggregated_tidy_cleaned_qc_rma_signal_data_df[['TTHERM_ID'] + sex_cols]
aggregated_tidy_cleaned_qc_rma_signal_data_df_veg = aggregated_tidy_cleaned_qc_rma_signal_data_df[['TTHERM_ID'] + veg_cols]

In [61]:
aggregated_tidy_cleaned_qc_rma_signal_data_df.to_csv('./agg_tidy_2021aligned_qc_rma_expression_full.csv', index=False)
aggregated_tidy_cleaned_qc_rma_signal_data_df_grow.to_csv('./agg_tidy_2021aligned_qc_rma_expression_grow.csv', index=False)
aggregated_tidy_cleaned_qc_rma_signal_data_df_starve.to_csv('./agg_tidy_2021aligned_qc_rma_expression_starve.csv', index=False)
aggregated_tidy_cleaned_qc_rma_signal_data_df_veg.to_csv('./agg_tidy_2021aligned_qc_rma_expression_veg.csv', index=False)
aggregated_tidy_cleaned_qc_rma_signal_data_df_sex.to_csv('./agg_tidy_2021aligned_qc_rma_expression_sex.csv', index=False)

#### Now, go to ./gene_filtering.Rmd to finish the data pre-processing

In [62]:
%load_ext watermark

In [63]:
%watermark --iversions

numpy    : 1.23.5
bs4      : 4.12.0
watermark: 2.4.3
re       : 2.2.1
requests : 2.31.0
json     : 2.0.9
pandas   : 1.5.3
scipy    : 1.11.1

