In [11]:
import subprocess
import re
import json
import watermark
import requests
import bs4

import pandas as pd
import scipy.stats as st

from functools import reduce

## Step 0: install necessary tools

Install [hisat 2.2.1](https://daehwankimlab.github.io/hisat2/download/)

## Step 1: make probe fasta

Load the probe dataset

In [44]:
probe_df = pd.read_csv('../raw_data/2007-02-28_Tetrahymena_expr.ndf', sep='\t')

Take a look

In [45]:
probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


The seq ids are not unique

In [46]:
len(probe_df['SEQ_ID'].values) == len(probe_df['SEQ_ID'].unique())

False

Can find negative controls if need be

In [47]:
probe_df['DESIGN_NOTE'].unique()

array(['rank_selected', nan, '-', 'upper left fiducial', '0 cycles',
       '04 cycles', '08 cycles', '12 cycles', '16 cycles', '20 cycles',
       '24 cycles', '28 cycles', '32 cycles', '36 cycles', '40 cycles',
       '44 cycles', '48 cycles', '52 cycles', '56 cycles', '60 cycles',
       '64 cycles', '68 cycles', '72 cycles', '76 cycles', '80 cycles',
       '84 cycles', '88 cycles', '92 cycles', '96 cycles', '100 cycles',
       '104 cycles', 'synthesis control', 'upper center fiducial',
       'upper right fiducial', 'REPLICATE1', 'EMPTY',
       'upper right chip_id', '+', 'uniformity control',
       'vertical design_id', 'left center fiducial',
       'center cross fiducial', 'right center fiducial',
       'lower center fiducial', 'REPLICATE2', 'lower left fiducial',
       'lower right fiducial', 'lower left chip_id',
       'horizontal design_id'], dtype=object)

In [48]:
probe_df.loc[probe_df['DESIGN_NOTE'] == '-']

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
158,5314_0491_0001,NGS_CONTROLS,-,T/A,ARRAY_QC_A,ACGTCCCCCTCTGGaTGTTCATACGGTATG,10004,1100011,62205230,1,491,control:reseq_qc:synthesis,XENOSYNTH0093,12,5314,491,1
523,5314_0492_0002,NGS_CONTROLS,-,T/C,ARRAY_QC_A,ACGTCCCCCTCTGGcTGTTCATACGGTATG,10005,1100011,62205231,2,492,control:reseq_qc:synthesis,XENOSYNTH0094,12,5314,492,2
818,5314_0491_0003,NGS_CONTROLS,-,T/G,ARRAY_QC_A,ACGTCCCCCTCTGGgTGTTCATACGGTATG,10006,1100011,62205232,3,491,control:reseq_qc:synthesis,XENOSYNTH0095,12,5314,491,3
1190,5314_0492_0004,NGS_CONTROLS,-,T/T,ARRAY_QC_A,ACGTCCCCCTCTGGtTGTTCATACGGTATG,10007,1100011,62205233,4,492,control:reseq_qc:synthesis,XENOSYNTH0096,12,5314,492,4
6079,5314_0341_0017,NGS_CONTROLS,-,G/A,ARRAY_QC_C,GCGCGGCGTTGGACaTCTGACTAATACATCAA,10004,1100089,62205854,17,341,control:reseq_qc:synthesis,XENOSYNTH0717,90,5314,341,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389799,5314_0434_1016,NGS_CONTROLS,-,G/T,ARRAY_QC_C,ATGTGCGCGGCGTTtGACGTCTGACTAAT,10007,1100093,62205889,1016,434,control:reseq_qc:synthesis,XENOSYNTH0752,94,5314,434,1016
390294,5314_0661_1017,NGS_CONTROLS,-,A/A,LABEL_QC_B,ATTAGGCCCTTCGCaCGCAGCGGCGTGCG,10004,1100155,62206382,1017,661,control:reseq_qc:label,XENOLABEL0445,56,5314,661,1017
390675,5314_0662_1018,NGS_CONTROLS,-,A/C,LABEL_QC_B,ATTAGGCCCTTCGCcCGCAGCGGCGTGCG,10005,1100155,62206383,1018,662,control:reseq_qc:label,XENOLABEL0446,56,5314,662,1018
391056,5314_0661_1019,NGS_CONTROLS,-,A/G,LABEL_QC_B,ATTAGGCCCTTCGCgCGCAGCGGCGTGCG,10006,1100155,62206384,1019,661,control:reseq_qc:label,XENOLABEL0447,56,5314,661,1019


Example of many probes to a single id

In [49]:
probe_df.loc[probe_df['SEQ_ID']=='TETRA00S0021925']

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
12641,5314_0474_0034,BLOCK1,rank_selected,rank:14;score:346;uniq:13;count:37;freq:00;rul...,TETRA00S0021925,TAATAATTTAATAGCGGATAGTCGATAATGTCAAACAGCATTTAAA...,0,64456191,64456191,34,474,experimental,TETRAP00318579,850,5314,474,34
67117,5314_0450_0176,BLOCK1,rank_selected,rank:08;score:378;uniq:05;count:37;freq:00;rul...,TETRA00S0021925,TCAAATATGTACCCCATTACATATCAAACAATGATGAGTTAAAACC...,0,64456194,64456194,176,450,experimental,TETRAP00318582,1777,5314,450,176
82746,5314_0219_0217,BLOCK1,rank_selected,rank:12;score:353;uniq:00;count:37;freq:00;rul...,TETRA00S0021925,CTGTCAGCTCAATCTTCTTACTTTCTGATGGTCAGGACAATAATTC...,0,64456200,64456200,217,219,experimental,TETRAP00318588,5102,5314,219,217
128974,5314_0515_0337,BLOCK1,rank_selected,rank:05;score:423;uniq:06;count:37;freq:00;rul...,TETRA00S0021925,AATTTAGCAATATGAAATCAATAACTAGGCCAAGATATATGCACAA...,0,64456196,64456196,337,515,experimental,TETRAP00318584,2706,5314,515,337
149412,5314_0688_0390,BLOCK1,rank_selected,rank:02;score:487;uniq:09;count:37;freq:00;rul...,TETRA00S0021925,TCGTTTTAGAAATATAGATAGGGTGGAGTTACCAATAGATTGATTA...,0,64456190,64456190,390,688,experimental,TETRAP00318578,742,5314,688,390
203654,5314_0520_0532,BLOCK1,rank_selected,rank:13;score:349;uniq:00;count:37;freq:00;rul...,TETRA00S0021925,TTGATTAAAATGGCAATGATGCTTTCAATCAAAGTGATGATTTAGC...,0,64456192,64456192,532,520,experimental,TETRAP00318580,1133,5314,520,532
208163,5314_0358_0544,BLOCK1,rank_selected,rank:04;score:433;uniq:09;count:37;freq:00;rul...,TETRA00S0021925,GCAATTCATAACATAATTAGATTTAATTCTATCCATAGCCAACATC...,0,64456193,64456193,544,358,experimental,TETRAP00318581,1487,5314,358,544
208452,5314_0167_0545,BLOCK1,rank_selected,rank:03;score:466;uniq:10;count:37;freq:00;rul...,TETRA00S0021925,GAATAGCATAATAATGGATGGCATTATCATAGAGAGTATTGACAAG...,0,64456198,64456198,545,167,experimental,TETRAP00318586,3792,5314,167,545
221084,5314_0088_0578,BLOCK1,rank_selected,rank:11;score:353;uniq:08;count:37;freq:00;rul...,TETRA00S0021925,TGAATATATAGATGGTTAAAACAATATAATTTACGACTCGAATGAG...,0,64456197,64456197,578,88,experimental,TETRAP00318585,3168,5314,88,578


The PROBE_IDs are also not unique

In [50]:
len(probe_df['PROBE_ID'].values) == len(probe_df['PROBE_ID'].unique())

False

In [51]:
probe_df[probe_df.duplicated(subset=['PROBE_ID'])]

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
284,5314_0005_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000197,62205062,2,5,fiducial,CPK6,0,5314,5,2
285,5314_0007_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000198,62205063,2,7,fiducial,CPK6,0,5314,7,2
286,5314_0009_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000199,62205064,2,9,fiducial,CPK6,0,5314,9,2
287,5314_0011_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000200,62205065,2,11,fiducial,CPK6,0,5314,11,2
288,5314_0013_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000201,62205066,2,13,fiducial,CPK6,0,5314,13,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392507,5314_0118_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060094,63771045,3,31,encoded number,empty,15,5314,118,1024
392508,5314_0120_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060100,63771051,3,33,encoded number,empty,16,5314,120,1024
392509,5314_0122_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060106,63771057,3,35,encoded number,empty,17,5314,122,1024
392510,5314_0124_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060112,63771063,3,37,encoded number,empty,18,5314,124,1024


In [52]:
probe_df['PROBE_CLASS'].unique()

array(['experimental', nan, 'control:reseq_qc:synthesis', 'fiducial',
       'linker', 'synthesis', 'control:sample_tracking:A',
       'control:empty', 'encoded number', 'control:reseq_qc:label',
       'uniformity', 'control', 'control:sample_tracking:B'], dtype=object)

In [53]:
probe_df[probe_df.duplicated(subset=['PROBE_ID'])]['PROBE_CLASS'].unique()

array(['fiducial', 'linker', 'synthesis', 'control:empty',
       'encoded number', 'uniformity', 'control',
       'control:sample_tracking:B'], dtype=object)

These are all controls of various sorts, etc. and I can exclude them.

In [54]:
experimental_probe_df = probe_df.loc[probe_df['PROBE_CLASS']=='experimental']

In [55]:
experimental_probe_df['PROBE_CLASS'].unique()

array(['experimental'], dtype=object)

In [56]:
len(experimental_probe_df)

384999

In [57]:
experimental_probe_df['MISMATCH'].unique()

array([0])

Extract the probe ids and sequences to build a fasta file

In [58]:
probe_ids = experimental_probe_df['PROBE_ID'].values
probe_seqs = experimental_probe_df['PROBE_SEQUENCE'].values

Build the fasta file

In [59]:
# Emit one FASTA record per experimental probe: a '>PROBE_ID' header line, the
# probe sequence, and a blank separator line.
with open('./2007-02-28_microarray_experimental_probes.fna', 'w') as fasta_out:
    fasta_out.writelines(
        f">{probe_id}\n{sequence}\n\n"
        for probe_id, sequence in zip(probe_ids, probe_seqs)
    )

## Step 2: download microarray data and prepare for replicate filtering

Functions for downloading microarray data. Download Wei Miao's, Marty Gorovsky's, Yifan Liu's, and Ron Pearlman's microarrays.

In [2]:
def download_microarray_values(microarray_accession):
    """
    Download the full data table for one NCBI GEO accession.

    Two requests are made. The accession's landing page is scraped for the
    input element named "fulltable"; its onclick handler carries the
    query-string suffix that addresses the data view. That data view is then
    fetched and the text of its <pre> element is returned.

    Parameters
    ----------
    microarray_accession : str
        GEO accession, e.g. 'GSM283687'.

    Returns
    -------
    str
        Text of the <pre> element on the data-view page.

    Raises
    ------
    requests.HTTPError
        If either request comes back with an error status.
    requests.Timeout
        If the server stops responding for longer than the timeout.
    ValueError
        If the landing page has no usable "fulltable" button, or the data
        page has no <pre> element.
    """

    base1 = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc='
    base2 = 'https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc='
    # Without a timeout a stalled connection would hang the download loop forever.
    timeout_seconds = 60

    r1 = requests.get(base1 + microarray_accession, timeout=timeout_seconds)
    r1.raise_for_status()
    soup1 = bs4.BeautifulSoup(r1.text, 'html5lib')

    button = soup1.find("input", {"name": "fulltable"})
    if button is None or 'onclick' not in button.attrs:
        raise ValueError(
            f'No "fulltable" button found on the GEO page for {microarray_accession}'
        )

    # The onclick handler embeds the suffix as '&...' up to the closing quote.
    suffix_match = re.search(r"\&[^']+", button.attrs['onclick'])
    if suffix_match is None:
        raise ValueError(
            f'Could not extract the data link for {microarray_accession} '
            f'from onclick={button.attrs["onclick"]!r}'
        )

    data_url = base2 + microarray_accession + suffix_match.group()
    r2 = requests.get(data_url, timeout=timeout_seconds)
    print(data_url)
    r2.raise_for_status()

    soup2 = bs4.BeautifulSoup(r2.text, 'html5lib')
    table = soup2.find("pre")
    if table is None:
        raise ValueError(f'No <pre> data table found at {data_url}')
    return table.text

def download_and_write_tsv(growth_phase, accession):
    """
    Fetch the data table for one GEO accession and save it under ../raw_data/.

    The file is named '<growth_phase>_<accession>.tsv'; opening it in 'w' mode
    replaces any existing file of the same name.
    """
    table_text = download_microarray_values(accession)
    out_path = f'../raw_data/{growth_phase}_{accession}.tsv'
    with open(out_path, 'w') as handle:
        handle.write(table_text)

In [3]:
with open('../raw_data/microarray_accessions_all.json', 'r') as f:
    # Including single REP measurement for C-15m (GSM656231) even though there are no replicates for it
    # in order to replicate the 2011 analysis
    # Format: keys are the physiological phase; values are the geo accessions for each microarray
    all_geo = json.load(f)

In [4]:
all_geo

{'Ll': ['GSM283687', 'GSM284355', 'GSM284362'],
 'Lm': ['GSM283690', 'GSM284357', 'GSM284363'],
 'Lh': ['GSM283691', 'GSM284360', 'GSM284364'],
 'S0': ['GSM285363',
  'GSM285554',
  'GSM285561',
  'GSM647244',
  'GSM647651',
  'GSM647652'],
 'S3': ['GSM285542', 'GSM285555', 'GSM285562'],
 'S6': ['GSM285543', 'GSM285556', 'GSM285563'],
 'S9': ['GSM285544', 'GSM285557', 'GSM285564', 'GSM647653', 'GSM647654'],
 'S12': ['GSM285545', 'GSM285558', 'GSM285565'],
 'S15': ['GSM285546', 'GSM285559', 'GSM285566'],
 'S24': ['GSM285547', 'GSM285560', 'GSM285567', 'GSM647245'],
 'C0': ['GSM285570', 'GSM285586', 'GSM656230'],
 'C15m': ['GSM656231'],
 'C2': ['GSM285572', 'GSM285587', 'GSM656233'],
 'C4': ['GSM285574', 'GSM285588', 'GSM656234'],
 'C6': ['GSM285575', 'GSM285589', 'GSM656232'],
 'C8': ['GSM285576', 'GSM285590', 'GSM656236'],
 'C10': ['GSM285578', 'GSM285591', 'GSM656235'],
 'C12': ['GSM285579', 'GSM285592', 'GSM656237'],
 'C14': ['GSM285580', 'GSM285593', 'GSM656238'],
 'C16': ['GSM28558

In [5]:
for k, v in all_geo.items():
    for acc in v:
        print(k, acc)
        download_and_write_tsv(k, acc)

Ll GSM283687
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM283687&acc=GSM283687&id=20258&db=GeoDb_blob24
Ll GSM284355
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM284355&acc=GSM284355&id=20261&db=GeoDb_blob24
Ll GSM284362
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM284362&acc=GSM284362&id=20264&db=GeoDb_blob24
Lm GSM283690
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM283690&acc=GSM283690&id=20259&db=GeoDb_blob24
Lm GSM284357
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM284357&acc=GSM284357&id=20262&db=GeoDb_blob24
Lm GSM284363
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM284363&acc=GSM284363&id=20265&db=GeoDb_blob24
Lh GSM283691
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM283691&acc=GSM283691&id=20260&db=GeoDb_blob24
Lh GSM284360
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?view=data&acc=GSM284360&acc=GSM284360&id=20263&db=GeoDb_blob24
Lh GSM284364
htt

Define the sexual and vegetative phases

In [6]:
grow = ['Ll', 'Lm', 'Lh']
starve = ['S0', 'S3', 'S6', 'S9', 'S12', 'S15', 'S24']
sex = ['C0', 'C15m', 'C2', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14', 'C16', 'C18']

grow_cols = []
starve_cols = []
sex_cols = []

# Map every known phase label to the column list of its group, so that an
# unexpected label is reported instead of being silently filed under sex_cols.
phase_to_cols = {}
for phases, group_cols in ((grow, grow_cols), (starve, starve_cols), (sex, sex_cols)):
    for phase in phases:
        phase_to_cols[phase] = group_cols

# Column names follow the '<phase>_<accession>' pattern used for the TSV files.
for k, v in all_geo.items():
    if k not in phase_to_cols:
        raise ValueError(f'Unrecognized phase label {k!r} in all_geo')
    for acc in v:
        phase_to_cols[k].append(f'{k}_{acc}')

In [7]:
grow_cols

['Ll_GSM283687',
 'Ll_GSM284355',
 'Ll_GSM284362',
 'Lm_GSM283690',
 'Lm_GSM284357',
 'Lm_GSM284363',
 'Lh_GSM283691',
 'Lh_GSM284360',
 'Lh_GSM284364']

In [8]:
starve_cols

['S0_GSM285363',
 'S0_GSM285554',
 'S0_GSM285561',
 'S0_GSM647244',
 'S0_GSM647651',
 'S0_GSM647652',
 'S3_GSM285542',
 'S3_GSM285555',
 'S3_GSM285562',
 'S6_GSM285543',
 'S6_GSM285556',
 'S6_GSM285563',
 'S9_GSM285544',
 'S9_GSM285557',
 'S9_GSM285564',
 'S9_GSM647653',
 'S9_GSM647654',
 'S12_GSM285545',
 'S12_GSM285558',
 'S12_GSM285565',
 'S15_GSM285546',
 'S15_GSM285559',
 'S15_GSM285566',
 'S24_GSM285547',
 'S24_GSM285560',
 'S24_GSM285567',
 'S24_GSM647245']

In [9]:
sex_cols

['C0_GSM285570',
 'C0_GSM285586',
 'C0_GSM656230',
 'C15m_GSM656231',
 'C2_GSM285572',
 'C2_GSM285587',
 'C2_GSM656233',
 'C4_GSM285574',
 'C4_GSM285588',
 'C4_GSM656234',
 'C6_GSM285575',
 'C6_GSM285589',
 'C6_GSM656232',
 'C8_GSM285576',
 'C8_GSM285590',
 'C8_GSM656236',
 'C10_GSM285578',
 'C10_GSM285591',
 'C10_GSM656235',
 'C12_GSM285579',
 'C12_GSM285592',
 'C12_GSM656237',
 'C14_GSM285580',
 'C14_GSM285593',
 'C14_GSM656238',
 'C16_GSM285582',
 'C16_GSM285595',
 'C16_GSM656239',
 'C18_GSM285583',
 'C18_GSM285596',
 'C18_GSM656240']

Build dataframe of the original microarray data

In [13]:
# One two-column frame per microarray: the ID column is renamed to
# UNCORRECTED_SEQ_ID and the VALUE column (coerced to numeric) is renamed to
# '<phase>_<accession>'.
dfs = [
    pd.read_csv(f'../raw_data/{phase}_{accession}.tsv', sep='\t', skiprows=2)
    .assign(VALUE=lambda table: pd.to_numeric(table['VALUE']))
    .rename(columns={'ID_REF': 'UNCORRECTED_SEQ_ID', 'VALUE': f'{phase}_{accession}'})
    for phase, accessions in all_geo.items()
    for accession in accessions
]

# Successively join the per-array frames on the shared ID column
# (DataFrame.merge defaults to an inner join).
combined = reduce(
    lambda left, right: left.merge(right, on='UNCORRECTED_SEQ_ID'),
    dfs,
)

In [14]:
combined.head()

Unnamed: 0,UNCORRECTED_SEQ_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TETRA00S0000001,29.7032,21.7593,46.9154,76.6333,22.6343,33.6221,62.9082,45.9322,53.1154,...,111.2228,234.6018,249.6019,178.3625,181.5551,258.8981,159.2486,239.091,221.8999,143.8424
1,TETRA00S0000002,17.8175,20.0113,43.5554,132.3966,36.6707,38.5801,450.5282,18.8678,45.2883,...,55.4968,39.0713,76.1872,53.449,47.0664,161.0668,46.694,40.4292,56.0046,38.8021
2,TETRA00S0000003,18.5897,19.2872,46.7733,24.9495,18.6105,45.9071,85.8857,20.9437,39.8288,...,23.9787,34.7924,29.3843,28.5066,31.1215,33.2412,25.491,29.2651,38.943,27.4689
3,TETRA00S0000004,263.723,209.7968,154.8699,248.9202,203.7581,174.2286,154.9472,193.9711,156.9843,...,156.4422,385.7303,292.5566,198.1962,222.9066,257.7303,125.9521,243.9288,294.5302,152.4899
4,TETRA00S0000005,3424.0985,5661.3524,2033.0047,3834.3597,6898.2485,2373.6192,4655.6436,6966.6308,3685.3673,...,6830.2352,8284.1254,7618.1843,5835.2323,7590.1526,6924.2819,5427.9806,8522.6952,7160.2338,4996.3176


In [15]:
len(combined)

28064

This is the number of probes that Xiong et al. had at the start of their analysis (pre-filtering) in the 2011 TGN paper. Save to file.

In [16]:
combined.to_csv('./microarray_expression_uncorrected.csv', index=False)

## Step 3: use hisat 2 to align probes to 2006 genome and further verify that Xiong et al. didn't do this for their initial analysis

Note: running this on Mac OS.

In [18]:
sequence_df_2006 = pd.read_csv('../raw_data/2006_Tthermophila_CDS_and_protein_seqs.csv')
sequence_df_2006.head()

Unnamed: 0,NO,PREID,ID,CDS,Aa
0,1,>1.m00557,TTHERM_00000010,ATGTCTGCTGCAAATAGCCCTAAGGTTGGTAACATATGGGTAAAAT...,MSAANSPKVGNIWVKCQPHLDEINQNASSFKAEISNSLEPFESLHG...
1,2,>1.m00558,TTHERM_00000020,ATGAACAAACATAGTTTTTGGGTAAAAACAATTCCATCATTGGATG...,MNKHSFWVKTIPSLDAEKELFHVSAYICDSLENSQDQNQNGVLKIY...
2,3,>1.m00559,TTHERM_00000030,ATGTAAAGCTAAACGATTACTCAGAACAATCATGCTAATCCTTAAG...,MQSQTITQNNHANPQVWTVCNNCKDYDQDTVQFAGQISDSKDPFSS...
3,4,>1.m00560,TTHERM_00000040,ATGAAAAACTAACAATTACAATATATCGAATTATAAAAATTAGATA...,MKNQQLQYIELQKLDNLSSPSKSQESEQILNQINEARHFKKDGGYK...
4,5,>1.m00561,TTHERM_00000050,ATGGATAAATATTCATGCAAGTGTGGGCAAAAATACTTCACTAACT...,MDKYSCKCGQKYFTNSSLYNHIRLKHDNDLGMKQNILKMGRPLGTK...


In [19]:
ttids = sequence_df_2006['ID'].values

In [20]:
len(ttids)

27769

In [21]:
ttids[0]

'TTHERM_00000010'

In [22]:
blank_indices = [i for i, ttid in enumerate(ttids) if 'TTHERM' not in str(ttid)]

In [23]:
sequence_df_2006.iloc[blank_indices]

Unnamed: 0,NO,PREID,ID,CDS,Aa
549,550,>1.m10028,,ATGGTTATGGTAAATTTGATGATTTTATTTCACTAGATTGACTATA...,VMVNLMILFHQIDYNCQQTLHKLHLIIRTLRHCIDQLNFNLFLHIN...
550,551,>1.m10029,,ATGTTAGAATTTTTCAATGAAGATCGCTCTTAGATAAACGAGAGTT...,LEFFNEDRSQINESWLIKQNLNAILGIRVVLLIIVHAIWWFQIYNG...
551,552,>1.m10030,,ATGAATTGCTTACTATTCAAAATAAATTTGCATGAATATATTTATA...,NCLLFKINLHEYIYKNNKLQFFKSVKTNQITFFNKQLLQIKQVKYI...
552,553,>1.m10034,,ATGTTATTATAATTGCCCATAAATATATTATTTCTCATTAATTAAT...,LLQLPINILFLINQCSYALPFQEIETSILSIFDSAQISSFFLDKCQ...
553,554,>1.m10035,,ATGAGTTTAATGACATGGTTTTAAGGTAATTCTTTTAAAAAACCAG...,SLMTWFQGNSFKKPESNANIFILKDNKVINKLQYFYNKFLFSYLPN...
...,...,...,...,...,...
27634,27635,>96.m01488,,ATGGCTAGGGATAAACATTCAACAAACTTCTTTACCGGCAGATTCA...,ARDKHSTNFFTGRFISDAGIEQLDKYQYVGGAYSWLDNKMNGYWLW...
27755,27756,>99.m01410,,ATGAATTCAAATTTAGTCACATAAATTAGAACTTATGGTTGTTATA...,NSNLVTQIRTYGCYKSNMVFSIITYFIFLILIRLTIQFVTCEDIFN...
27756,27757,>99.m01411,,ATGATTGATTTATTTTGTTAATCTTTCACTTTAATGCTAATTTTTC...,IDLFCQSFTLMLIFLHRFVCLIALIMTRRLQFIRQNNNFILTSLQM...
27757,27758,>99.m01412,,ATGATCAAATAGTTTATAAATAAATTAATAAGAAAAATAAATTAAT...,IKQFINKLIRKINQFFVKYLSKICDQFKVEVKNFQISFKEIFYFNS...


In [24]:
sequence_df_2006.iloc[blank_indices[0]]['CDS']

'ATGGTTATGGTAAATTTGATGATTTTATTTCACTAGATTGACTATAATTGCTAACAAACTTTGCACAAACTACATTTGATTATTCGTACTCTTCGTCATTGTATTGATTAATTGAATTTTAATTTGTTTCTTCACATAAACTACTCTAAAGACTACCTTGTCTACTGTTGCTAGTAGAATCTTCATCTGAAGAGTCTCCAATTTCAAACTTAAAGATTAAATCTTCACGGTTTTTATTAATTTTAGCCTTAGAAAATGGAATGTAATAAAAGACATAGGGCACAATCAAGCATGGAGTTATCTCTACCAACGAAAAGAAAAATAAAAATACTTCAGATTCATTTTTAAATGAAGAGAAAATATAATCATCAGTGCAAACAATGCTTGGTTCTACTGAGTATTCTCTGA'

In [25]:
# Scratch cell: prints the indices that enumerate() yields for a three-item list.
# NOTE(review): nothing below appears to rely on this output; candidate for removal.
for i, l in enumerate(['a', 'b', 'c']):
    print(i)

0
1
2


Build CDS FASTA, taking TTHERM_IDs when available

In [82]:
# Set membership keeps the per-row "is this a blank-ID row?" test O(1); scanning
# the blank_indices list for every row made this loop quadratic.
blank_index_set = set(blank_indices)

with open('./Tthermophila_2006_CDS.fasta', 'w') as f:
    # Walk the three needed columns in lockstep instead of calling .iloc per row.
    records = zip(
        sequence_df_2006['ID'],
        sequence_df_2006['PREID'],
        sequence_df_2006['CDS'],
    )
    for i, (ttherm_id, raw_preid, cds) in enumerate(records):
        # Rows without a TTHERM ID fall back to PREID, minus its first
        # character (PREID values start with '>').
        header = raw_preid[1:] if i in blank_index_set else ttherm_id
        f.write(f'>{header}\n{cds}\n\n')


Bash command to index the 2006 _T. thermophila_ genome CDS fasta

In [83]:
index_genome_command = "hisat2-build -f ./Tthermophila_2006_CDS.fasta ttherm_2006"

In [84]:
index_genome_command.split()

['hisat2-build', '-f', './Tthermophila_2006_CDS.fasta', 'ttherm_2006']

In [85]:
r = subprocess.run(args=index_genome_command.split(), capture_output=True)

In [86]:
print(r.stdout.decode('utf-8'))

Building DifferenceCoverSample
  Building sPrime
  Building sPrimeOrder
  V-Sorting samples
  V-Sorting samples time: 00:00:00
  Allocating rank array
  Ranking v-sort output
  Ranking v-sort output time: 00:00:00
  Invoking Larsson-Sadakane on ranks
  Invoking Larsson-Sadakane on ranks time: 00:00:01
  Sanity-checking and returning
Building samples
Reserving space for 12 sample suffixes
Generating random suffixes
QSorting 12 sample offsets, eliminating duplicates
QSorting sample offsets, eliminating duplicates time: 00:00:00
Multikey QSorting 12 samples
  (Using difference cover)
  Multikey QSorting samples time: 00:00:00
Calculating bucket sizes
Splitting and merging
  Splitting and merging time: 00:00:00
Split 1, merged 7; iterating...
Splitting and merging
  Splitting and merging time: 00:00:00
Split 1, merged 0; iterating...
Splitting and merging
  Splitting and merging time: 00:00:00
Split 1, merged 1; iterating...
Splitting and merging
  Splitting and merging time: 00:00:00
Avg 

In [87]:
print(r.stderr.decode('utf-8'))

Settings:
  Output files: "ttherm_2006.*.ht2"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 4 (one in 16)
  FTable chars: 10
  Strings: unpacked
  Local offset rate: 3 (one in 8)
  Local fTable chars: 6
  Local sequence length: 57344
  Local sequence overlap between two consecutive indexes: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  ./Tthermophila_2006_CDS.fasta
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:01
  Time to read SNPs and splice sites: 00:00:00
Using parameters --bmax 9357617 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with these parameters: --bmax 9357617 --dcv 1024
Constructing suffix-ar

Bash command to align the probe sequences to the CDS regions

In [88]:
align_to_genome_command = """
hisat2 -f -x ttherm_2006 --no-hd
-U ./2007-02-28_microarray_experimental_probes.fna 
-S microarray_probe_alignment.sam"""

In [89]:
align_to_genome_command.split()

['hisat2',
 '-f',
 '-x',
 'ttherm_2006',
 '--no-hd',
 '-U',
 './2007-02-28_microarray_experimental_probes.fna',
 '-S',
 'microarray_probe_alignment.sam']

In [90]:
r2 = subprocess.run(args=align_to_genome_command.split(), capture_output=True)

In [91]:
print(r2.stdout.decode('utf-8'))




In [92]:
print(r2.stderr.decode('utf-8'))

384999 reads; of these:
  384999 (100.00%) were unpaired; of these:
    2775 (0.72%) aligned 0 times
    357570 (92.88%) aligned exactly 1 time
    24654 (6.40%) aligned >1 times
99.28% overall alignment rate



In [93]:
# A record counts as a single alignment when its line ends with the tag NH:i:1.
single_hit_pattern = re.compile(r'NH:i:1$')

with open('./microarray_probe_alignment.sam', 'r') as sam_file:
    lines = sam_file.readlines()

single_alignments = list(filter(single_hit_pattern.search, lines))

Sanity check that there are fewer single alignments than total alignments

In [94]:
len(lines)

436459

In [95]:
len(single_alignments)

357570

Take a look at the formatting

In [96]:
single_alignments[0].split()

['TETRAP00318583',
 '0',
 'TTHERM_00709600',
 '2262',
 '60',
 '60M',
 '*',
 '0',
 '0',
 'AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGCAAATATTTTAAGCC',
 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
 'AS:i:0',
 'XN:i:0',
 'XM:i:0',
 'XO:i:0',
 'XG:i:0',
 'NM:i:0',
 'MD:Z:60',
 'YT:Z:UU',
 'NH:i:1']

Build probe_id to ttherm_id dictionary

In [97]:
# First whitespace-separated SAM field is the probe ID, third is the reference
# sequence it aligned to.
align_dict = {
    fields[0]: fields[2]
    for fields in map(str.split, single_alignments)
}

In [98]:
align_dict['TETRAP00318583']

'TTHERM_00709600'

Build seq_id to probe_id dictionary

In [99]:
# NOTE(review): SEQ_ID is not unique in the probe table (several probes share one
# SEQ_ID), so this comprehension keeps only the last PROBE_ID seen for each SEQ_ID.
# Any lookup through this dict therefore represents a SEQ_ID by that single probe.
seq_probe_dict = {s: p for p, s in zip(experimental_probe_df['PROBE_ID'].values, experimental_probe_df['SEQ_ID'].values)}

In [100]:
seq_probe_dict['TETRA00S0021925']

'TETRAP00318587'

Build a seq to gene dict

In [101]:
# Forward map (seq ID -> gene) and reverse map (gene -> list of seq IDs), built
# only for seq IDs whose probe has an entry in align_dict.
seq_gene_dict = {}
gene_seq_dict = {}
for seq_id, probe_id in seq_probe_dict.items():
    if probe_id not in align_dict:
        continue
    gene = align_dict[probe_id]
    seq_gene_dict[seq_id] = gene
    gene_seq_dict.setdefault(gene, []).append(seq_id)

In [102]:
list(seq_gene_dict.items())[:5]

[('TETRA00S0021925', 'TTHERM_00709600'),
 ('TETRA00S0012676', 'TTHERM_00529480'),
 ('TETRA00S0002513', '133.m00873'),
 ('TETRA00S0000062', 'TTHERM_00002620'),
 ('TETRA00S0006635', 'TTHERM_01013320')]

In [103]:
list(gene_seq_dict.items())[:5]

[('TTHERM_00709600', ['TETRA00S0021925']),
 ('TTHERM_00529480', ['TETRA00S0012676']),
 ('133.m00873', ['TETRA00S0002513']),
 ('TTHERM_00002620', ['TETRA00S0000062']),
 ('TTHERM_01013320', ['TETRA00S0006635'])]

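Check that all mapped seq IDs are accounted for: the number of entries in `seq_gene_dict` should equal the total number of seq IDs listed across `gene_seq_dict`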

In [104]:
len(seq_gene_dict)

25936

In [105]:
# Total number of seq IDs listed across all genes in the reverse map.
probe_count = sum(map(len, gene_seq_dict.values()))
probe_count

25936

Assign the correct genes to each uncorrected probe seq id, NA if the probe did not uniquely align

In [106]:
# Look up each row's gene; rows whose seq ID has no entry get the string 'NA'.
combined['TTHERM_ID'] = [
    seq_gene_dict.get(seq_id, 'NA')
    for seq_id in combined['UNCORRECTED_SEQ_ID'].values
]

Take only the uniquely assigned probes

In [107]:
corrected = combined.loc[combined['TTHERM_ID'] != 'NA']

In [108]:
len(corrected)

25936

Rearrange the columns for ease of readability

In [109]:
cols = corrected.columns.tolist()
len(cols)

69

In [110]:
# Put TTHERM_ID first and drop UNCORRECTED_SEQ_ID, keeping the sample columns in
# their existing order. Selecting by name (rather than by position in the
# previous `cols` list) gives the same result when this cell is re-run after
# `corrected` has already been reordered.
cols = ['TTHERM_ID'] + [
    col for col in corrected.columns
    if col not in ('TTHERM_ID', 'UNCORRECTED_SEQ_ID')
]
len(cols)

68

In [111]:
corrected = corrected[cols]

In [112]:
corrected.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_00000010,29.7032,21.7593,46.9154,76.6333,22.6343,33.6221,62.9082,45.9322,53.1154,...,111.2228,234.6018,249.6019,178.3625,181.5551,258.8981,159.2486,239.091,221.8999,143.8424
1,TTHERM_00000020,17.8175,20.0113,43.5554,132.3966,36.6707,38.5801,450.5282,18.8678,45.2883,...,55.4968,39.0713,76.1872,53.449,47.0664,161.0668,46.694,40.4292,56.0046,38.8021
2,TTHERM_00000030,18.5897,19.2872,46.7733,24.9495,18.6105,45.9071,85.8857,20.9437,39.8288,...,23.9787,34.7924,29.3843,28.5066,31.1215,33.2412,25.491,29.2651,38.943,27.4689
3,TTHERM_00000040,263.723,209.7968,154.8699,248.9202,203.7581,174.2286,154.9472,193.9711,156.9843,...,156.4422,385.7303,292.5566,198.1962,222.9066,257.7303,125.9521,243.9288,294.5302,152.4899
4,TTHERM_00000050,3424.0985,5661.3524,2033.0047,3834.3597,6898.2485,2373.6192,4655.6436,6966.6308,3685.3673,...,6830.2352,8284.1254,7618.1843,5835.2323,7590.1526,6924.2819,5427.9806,8522.6952,7160.2338,4996.3176


In [113]:
len(corrected)

25936

Check which probes hit a given gene more than once

In [114]:
corrected.loc[corrected.duplicated(subset=['TTHERM_ID'])]

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
1207,TTHERM_00717930,75.7775,126.0664,43.3197,39.3295,111.9790,51.5452,156.4319,78.7681,58.6163,...,235.9610,122.9711,109.9178,109.5524,153.6120,95.4299,201.3398,322.9845,119.8086,215.7121
1625,TTHERM_00752040,2810.7037,1030.8372,2069.8273,1576.1447,1432.5064,1620.9497,3501.8041,1384.7408,1527.9438,...,650.8867,1398.7313,1917.4951,779.2060,2673.6085,2191.1520,1562.5729,1600.3009,2202.7796,2641.8855
2184,13.m04867,28.7663,76.9915,42.7282,47.4550,93.0183,64.8998,38.3750,112.5243,87.7249,...,150.7876,100.2525,94.1939,79.8841,74.8789,97.8898,121.5636,98.4834,125.0537,76.0857
2454,TTHERM_00794030,365.5791,349.5607,264.2913,304.4287,418.6636,280.6463,366.1915,310.0410,260.8275,...,289.8880,506.2560,537.2710,501.3893,414.9439,554.4350,557.8680,538.2794,476.4476,376.4066
2903,TTHERM_00166070,90.1609,83.2118,121.6644,123.0786,105.5498,169.0832,155.8975,163.0649,171.0901,...,110.4839,146.6798,98.5739,89.8804,288.4834,190.1911,200.4902,273.8369,214.2617,183.0940
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27974,TTHERM_00599850,28.8118,99.7446,47.6477,31.6915,123.1194,43.6060,200.1912,35.0597,23.0822,...,23.4992,54.8735,21.9014,42.5787,43.8989,24.5839,45.4470,166.6384,24.1837,41.9082
27988,TTHERM_00630230,16.6888,17.6519,36.4680,16.4334,15.5018,36.6257,16.4960,15.2728,28.2996,...,21.1549,22.0915,24.3591,29.4246,23.5811,21.4280,19.4448,26.2675,22.1755,21.4514
28004,TTHERM_00648600,18.9318,25.0699,41.6095,18.9020,22.3889,51.6352,26.2264,23.6071,29.0651,...,25.3970,28.9857,29.9709,45.4578,27.5162,23.3409,28.4426,27.8582,33.9110,29.6232
28005,TTHERM_00648620,15.5639,19.6646,59.2269,21.0092,19.1264,43.5750,31.6968,18.6004,39.0034,...,32.5252,29.6480,31.4117,65.2298,41.9184,25.3299,42.6067,38.0303,38.1776,36.5782


For genes that have multiple probes, take the geometric mean of the measurements. I think this is the best way to summarize data that has already been normalized (RMA, in this case).

In [115]:
corrected_geom_mean_replicate_probes = corrected.groupby('TTHERM_ID').aggregate(st.mstats.gmean).reset_index()

In [116]:
len(corrected_geom_mean_replicate_probes)

25767

In [117]:
corrected.loc[corrected['TTHERM_ID'] == 'TTHERM_00717930']

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
1204,TTHERM_00717930,115.9345,130.7261,46.7459,78.1046,170.7318,54.7165,260.197,111.8028,88.5908,...,353.8231,182.8143,206.1874,141.8276,287.5981,174.8431,275.1259,437.0536,263.4279,311.6361
1207,TTHERM_00717930,75.7775,126.0664,43.3197,39.3295,111.979,51.5452,156.4319,78.7681,58.6163,...,235.961,122.9711,109.9178,109.5524,153.612,95.4299,201.3398,322.9845,119.8086,215.7121


In [118]:
corrected_geom_mean_replicate_probes.loc[corrected_geom_mean_replicate_probes['TTHERM_ID'] == 'TTHERM_00717930']

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
17483,TTHERM_00717930,93.729539,128.37511,45.000204,55.423956,138.269217,53.107183,201.75012,93.842923,72.061536,...,288.943684,149.936238,150.544563,124.649725,210.186868,129.171435,235.358861,375.714704,177.653955,259.275293


Confirm that the geometric mean worked as intended

In [119]:
# Sanity check: the two Ll_GSM283687 probe values for TTHERM_00717930 should
# reproduce the aggregated value shown for that gene (93.729539).
ll_gsm283687_probe_values = [115.9345, 75.7775]
st.mstats.gmean(ll_gsm283687_probe_values)

93.72953949396107

Since it looks like Wei and Jie did not do an alignment against the 2006 CDS file or averaging prior to filtering, save the corrected dataframe so that it can be compared with the uncorrected one. The comparison is in gene_filtering.Rmd in this directory.

In [120]:
# Write the per-gene geometric-mean table to disk, without the row index.
corrected_csv_path = './microarray_expression_corrected_against_2006_genome.csv'
corrected_geom_mean_replicate_probes.to_csv(corrected_csv_path, index=False)

## Step 4: Return from gene filtering (see replicate_gene_filtering.Rmd)

Returning from gene_filtering.Rmd in this directory, it's clear that Wei and Jie proceeded with the uncorrected data set, filtered it against the median absolute range of expression per gene and also kept genes with a mean expression greater than the median. Pursuing this approach, I end up with the same number of genes kept (15,091). Now, to rename them:

In [121]:
# Load the filtered replicate dataset; the unnamed first CSV column holds the
# original (uncorrected) probe sequence IDs, so give it a descriptive name.
replicate_dataset = (
    pd.read_csv('./TGN_2011_replicate_expression_dataset.csv')
    .rename(columns={'Unnamed: 0': 'UNCORRECTED_SEQ_ID'})
)
replicate_dataset.head()

Unnamed: 0,UNCORRECTED_SEQ_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TETRA00S0000001,4.892546,4.44356,5.55199,6.2599,4.500439,5.071338,5.975176,5.521434,5.731058,...,6.797309,7.87407,7.963485,7.478669,7.504264,8.016241,7.315137,7.901416,7.793765,7.168345
1,TETRA00S0000002,4.155223,4.322743,5.44478,7.048722,5.196556,5.269785,8.815474,4.237854,5.501066,...,5.794333,5.288037,6.251477,5.740091,5.556626,7.331515,5.545165,5.337326,5.807473,5.278063
2,TETRA00S0000005,11.741508,12.466931,10.989398,11.90477,12.752014,11.212873,12.184765,12.766245,11.847593,...,12.73772,13.016134,12.895231,12.510574,12.889913,12.757449,12.4062,13.057094,12.805791,12.286649
3,TETRA00S0000007,4.338246,6.129996,7.063431,5.154676,6.171141,6.495439,7.149044,7.827519,7.825561,...,6.865911,5.832865,6.392171,5.814609,6.061297,7.035859,6.764728,7.832963,7.76658,6.648373
4,TETRA00S0000009,11.051141,12.064436,10.428769,10.641567,11.550626,10.440448,10.763831,12.040421,10.316591,...,11.377394,10.328885,10.308905,11.144679,10.427054,10.351164,11.824022,10.3018,10.402265,11.387048


In [122]:
# Translate each sequence ID to its TTHERM gene ID; IDs with no entry in
# seq_gene_dict keep their original sequence ID as a placeholder.
uncorrected_seq_ids = replicate_dataset['UNCORRECTED_SEQ_ID'].values
replicate_dataset['TTHERM_ID'] = [
    seq_gene_dict.get(seq_id, seq_id) for seq_id in uncorrected_seq_ids
]

In [123]:
# Preview the first rows to confirm the new TTHERM_ID column was added.
replicate_dataset.head(n=5)

Unnamed: 0,UNCORRECTED_SEQ_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240,TTHERM_ID
0,TETRA00S0000001,4.892546,4.44356,5.55199,6.2599,4.500439,5.071338,5.975176,5.521434,5.731058,...,7.87407,7.963485,7.478669,7.504264,8.016241,7.315137,7.901416,7.793765,7.168345,TTHERM_00000010
1,TETRA00S0000002,4.155223,4.322743,5.44478,7.048722,5.196556,5.269785,8.815474,4.237854,5.501066,...,5.288037,6.251477,5.740091,5.556626,7.331515,5.545165,5.337326,5.807473,5.278063,TTHERM_00000020
2,TETRA00S0000005,11.741508,12.466931,10.989398,11.90477,12.752014,11.212873,12.184765,12.766245,11.847593,...,13.016134,12.895231,12.510574,12.889913,12.757449,12.4062,13.057094,12.805791,12.286649,TTHERM_00000050
3,TETRA00S0000007,4.338246,6.129996,7.063431,5.154676,6.171141,6.495439,7.149044,7.827519,7.825561,...,5.832865,6.392171,5.814609,6.061297,7.035859,6.764728,7.832963,7.76658,6.648373,TTHERM_00000070
4,TETRA00S0000009,11.051141,12.064436,10.428769,10.641567,11.550626,10.440448,10.763831,12.040421,10.316591,...,10.328885,10.308905,11.144679,10.427054,10.351164,11.824022,10.3018,10.402265,11.387048,TTHERM_00001080


In [124]:
# Collect the positional indices of rows whose TTHERM_ID label does not
# contain 'TTHERM_', i.e. rows that kept their sequence ID as a label.
missing_gene_name_indices = []
for row_position, gene_label in enumerate(replicate_dataset['TTHERM_ID'].values):
    if 'TTHERM_' in str(gene_label):
        continue
    missing_gene_name_indices.append(row_position)

In [125]:
# Number of rows whose TTHERM_ID label does not contain 'TTHERM_'.
len(missing_gene_name_indices)

1239

In [126]:
# Display the rows at those positions (rows without a 'TTHERM_' label).
replicate_dataset.iloc[missing_gene_name_indices]

Unnamed: 0,UNCORRECTED_SEQ_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240,TTHERM_ID
15,TETRA00S0000027,11.259899,10.862308,10.822879,11.756290,11.112717,10.685120,11.850949,10.502872,10.784160,...,11.938681,12.458233,11.717692,11.896734,12.170458,10.589308,11.877426,12.011693,11.265666,TETRA00S0000027
166,TETRA00S0000333,7.805446,9.246343,7.564189,8.598994,9.779958,7.719495,9.178070,10.099211,8.644419,...,8.665926,8.434788,9.501262,8.683820,8.129070,10.438154,9.148891,8.292993,9.882434,TETRA00S0000333
168,TETRA00S0000336,12.074475,10.786071,10.695620,10.618766,10.027462,9.783726,11.804493,12.780572,11.989773,...,11.615949,12.055575,13.257304,12.947496,12.917300,13.721910,12.913025,13.046435,13.560062,TETRA00S0000336
183,TETRA00S0000359,4.308572,4.304394,5.248763,4.232630,4.256513,5.343020,4.523330,4.078473,5.242911,...,4.921008,5.169660,5.233436,6.781042,8.840538,6.525552,10.605322,11.174250,6.302056,TETRA00S0000359
184,TETRA00S0000363,8.484151,8.241670,7.782576,8.835275,8.346202,7.578139,8.834784,8.095225,7.658440,...,9.020691,8.852411,9.472072,8.445456,9.000405,8.836564,9.038028,9.233272,9.092585,TETRA00S0000363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15091,TETRA00S0028570,8.423005,8.635822,6.324922,8.239211,8.334476,5.115886,9.251002,6.044296,5.878716,...,8.028859,5.636555,7.935665,7.320909,6.398645,9.238629,9.465458,4.867684,9.352216,TETRA00S0028570
15092,TETRA00S0028581,7.356286,8.970582,5.659505,8.553240,9.319967,5.940202,9.163982,6.483297,5.426238,...,8.380416,5.145262,8.046201,8.487750,5.660198,9.191677,9.211191,5.961401,9.173900,TETRA00S0028581
15093,TETRA00S0028582,7.589011,8.784031,5.365770,8.051975,9.099739,5.751027,9.215854,6.099131,6.107472,...,8.087186,6.197997,8.106758,7.879441,6.211546,9.366154,9.160269,4.408399,8.898594,TETRA00S0028582
15094,TETRA00S0028713,10.097656,6.512287,9.366847,6.033203,5.252995,8.440906,8.814771,7.658445,7.331362,...,5.021546,5.265812,6.396490,6.522125,5.378567,6.084262,5.040099,5.081629,6.621671,TETRA00S0028713


It looks like there are 1239 probes that we just won't know what to do with once they are clustered somewhere, if they don't get thrown out by MCL.

In [127]:
# Rows whose TTHERM_ID has already appeared earlier in the frame
# (with pandas' default keep='first', the first occurrence is not flagged).
is_repeated_gene = replicate_dataset.duplicated(subset='TTHERM_ID')
replicate_dataset[is_repeated_gene]

Unnamed: 0,UNCORRECTED_SEQ_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240,TTHERM_ID
608,TETRA00S0001238,6.243698,6.97804,5.436951,5.29754,6.807084,5.687766,7.289391,6.29954,5.87323,...,6.942175,6.780281,6.775477,7.263147,6.576369,7.653489,8.335321,6.904588,7.752963,TTHERM_00717930
800,TETRA00S0001680,11.456716,10.009601,11.015295,10.622184,10.484326,10.662624,11.773883,10.4354,10.577376,...,10.449903,10.905007,9.605861,11.384573,11.097474,10.609708,10.644127,11.105109,11.367352,TTHERM_00752040
1252,TETRA00S0002536,8.51404,8.449399,8.045985,8.249961,8.709648,8.132609,8.516454,8.276315,8.026952,...,8.983723,9.069506,8.969787,8.696772,9.114875,9.12378,9.072211,8.896174,8.556148,TTHERM_00794030
12801,TETRA00S0023405,11.826661,10.5718,10.960391,11.0845,11.489998,10.866819,12.313717,10.930757,10.788747,...,11.793387,11.855897,10.194914,11.958765,12.094213,10.754072,11.88952,11.973632,11.064113,TTHERM_01570160
13372,TETRA00S0024443,14.415671,13.69547,14.096115,14.426257,14.415581,14.037657,14.732643,13.907661,14.011862,...,14.372267,14.350199,13.129094,14.239149,14.255666,13.542568,14.332072,14.345515,13.49451,TTHERM_01707350
13904,TETRA00S0025530,14.712615,14.42996,14.294241,14.791348,14.669765,14.520572,15.251428,14.472195,14.448989,...,14.995525,14.958738,13.87257,14.965279,14.914207,14.27442,14.777892,14.85124,14.257377,TTHERM_01213990
15013,TETRA00S0027808,14.817981,15.272203,13.614994,14.819747,15.282058,13.677392,14.334603,15.005176,13.542452,...,14.17479,14.178833,14.725922,14.088646,14.285652,14.937058,14.423238,14.324184,14.56794,TTHERM_00449430
15015,TETRA00S0027813,14.330142,13.881464,14.131626,14.029808,14.046341,13.715625,14.683371,13.936238,13.856879,...,14.334515,14.024484,12.771538,14.202922,14.043446,13.131129,13.932179,14.049413,13.354624,TTHERM_01751620
15018,TETRA00S0027820,15.52035,15.77095,15.065631,15.392029,15.66377,14.972751,14.910675,15.579011,14.400532,...,14.052286,13.918021,14.214374,14.060119,14.116867,14.510474,14.524008,14.422643,14.379518,TTHERM_00992730
15022,TETRA00S0027829,4.91254,5.362196,5.347432,4.651339,6.029479,6.094758,4.503043,6.969213,6.064126,...,9.913838,9.749554,11.343718,8.985948,9.233888,9.899112,8.308235,9.086821,8.933531,TTHERM_00794670


It looks like 21 probes in this dataset show up more than once. It's not clear to me what the CLR and MCL algorithms will think of non-unique node labels, so I would just rely on indices. No need to write an updated, relabeled file.

After going through a lot of trouble and successfully replicating their filtering, I'm having trouble getting the same number of nodes and edges from the CLR algorithm. What if they corrected the genes against the genome *after* they filtered the microarray?

In [128]:
# Reload the filtered replicate dataset from disk and label the unnamed first
# column as the probe identifier.
filtered = (
    pd.read_csv('TGN_2011_replicate_expression_dataset.csv')
    .rename(columns={'Unnamed: 0': 'Probe'})
)
filtered

Unnamed: 0,Probe,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TETRA00S0000001,4.892546,4.443560,5.551990,6.259900,4.500439,5.071338,5.975176,5.521434,5.731058,...,6.797309,7.874070,7.963485,7.478669,7.504264,8.016241,7.315137,7.901416,7.793765,7.168345
1,TETRA00S0000002,4.155223,4.322743,5.444780,7.048722,5.196556,5.269785,8.815474,4.237854,5.501066,...,5.794333,5.288037,6.251477,5.740091,5.556626,7.331515,5.545165,5.337326,5.807473,5.278063
2,TETRA00S0000005,11.741508,12.466931,10.989398,11.904770,12.752014,11.212873,12.184765,12.766245,11.847593,...,12.737720,13.016134,12.895231,12.510574,12.889913,12.757449,12.406200,13.057094,12.805791,12.286649
3,TETRA00S0000007,4.338246,6.129996,7.063431,5.154676,6.171141,6.495439,7.149044,7.827519,7.825561,...,6.865911,5.832865,6.392171,5.814609,6.061297,7.035859,6.764728,7.832963,7.766580,6.648373
4,TETRA00S0000009,11.051141,12.064436,10.428769,10.641567,11.550626,10.440448,10.763831,12.040421,10.316591,...,11.377394,10.328885,10.308905,11.144679,10.427054,10.351164,11.824022,10.301800,10.402265,11.387048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15091,TETRA00S0028570,8.423005,8.635822,6.324922,8.239211,8.334476,5.115886,9.251002,6.044296,5.878716,...,8.076541,8.028859,5.636555,7.935665,7.320909,6.398645,9.238629,9.465458,4.867684,9.352216
15092,TETRA00S0028581,7.356286,8.970582,5.659505,8.553240,9.319967,5.940202,9.163982,6.483297,5.426238,...,7.462817,8.380416,5.145262,8.046201,8.487750,5.660198,9.191677,9.211191,5.961401,9.173900
15093,TETRA00S0028582,7.589011,8.784031,5.365770,8.051975,9.099739,5.751027,9.215854,6.099131,6.107472,...,7.589005,8.087186,6.197997,8.106758,7.879441,6.211546,9.366154,9.160269,4.408399,8.898594
15094,TETRA00S0028713,10.097656,6.512287,9.366847,6.033203,5.252995,8.440906,8.814771,7.658445,7.331362,...,5.558534,5.021546,5.265812,6.396490,6.522125,5.378567,6.084262,5.040099,5.081629,6.621671


In [129]:
# Label each probe with its gene ID; probes that have no entry in
# seq_gene_dict are marked with the sentinel string 'NA'.
probe_ids = filtered['Probe'].values
filtered['TTHERM_ID'] = [seq_gene_dict.get(probe_id, 'NA') for probe_id in probe_ids]

In [130]:
# Display the frame with the newly added TTHERM_ID column.
filtered

Unnamed: 0,Probe,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240,TTHERM_ID
0,TETRA00S0000001,4.892546,4.443560,5.551990,6.259900,4.500439,5.071338,5.975176,5.521434,5.731058,...,7.874070,7.963485,7.478669,7.504264,8.016241,7.315137,7.901416,7.793765,7.168345,TTHERM_00000010
1,TETRA00S0000002,4.155223,4.322743,5.444780,7.048722,5.196556,5.269785,8.815474,4.237854,5.501066,...,5.288037,6.251477,5.740091,5.556626,7.331515,5.545165,5.337326,5.807473,5.278063,TTHERM_00000020
2,TETRA00S0000005,11.741508,12.466931,10.989398,11.904770,12.752014,11.212873,12.184765,12.766245,11.847593,...,13.016134,12.895231,12.510574,12.889913,12.757449,12.406200,13.057094,12.805791,12.286649,TTHERM_00000050
3,TETRA00S0000007,4.338246,6.129996,7.063431,5.154676,6.171141,6.495439,7.149044,7.827519,7.825561,...,5.832865,6.392171,5.814609,6.061297,7.035859,6.764728,7.832963,7.766580,6.648373,TTHERM_00000070
4,TETRA00S0000009,11.051141,12.064436,10.428769,10.641567,11.550626,10.440448,10.763831,12.040421,10.316591,...,10.328885,10.308905,11.144679,10.427054,10.351164,11.824022,10.301800,10.402265,11.387048,TTHERM_00001080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15091,TETRA00S0028570,8.423005,8.635822,6.324922,8.239211,8.334476,5.115886,9.251002,6.044296,5.878716,...,8.028859,5.636555,7.935665,7.320909,6.398645,9.238629,9.465458,4.867684,9.352216,
15092,TETRA00S0028581,7.356286,8.970582,5.659505,8.553240,9.319967,5.940202,9.163982,6.483297,5.426238,...,8.380416,5.145262,8.046201,8.487750,5.660198,9.191677,9.211191,5.961401,9.173900,
15093,TETRA00S0028582,7.589011,8.784031,5.365770,8.051975,9.099739,5.751027,9.215854,6.099131,6.107472,...,8.087186,6.197997,8.106758,7.879441,6.211546,9.366154,9.160269,4.408399,8.898594,
15094,TETRA00S0028713,10.097656,6.512287,9.366847,6.033203,5.252995,8.440906,8.814771,7.658445,7.331362,...,5.021546,5.265812,6.396490,6.522125,5.378567,6.084262,5.040099,5.081629,6.621671,


In [131]:
# Keep only the probes that were assigned a gene ID ('NA' marks unmapped probes).
has_gene_id = filtered['TTHERM_ID'].ne('NA')
filtered2 = filtered.loc[has_gene_id]

In [132]:
# Display the rows that remain after removing probes labelled 'NA'.
filtered2

Unnamed: 0,Probe,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240,TTHERM_ID
0,TETRA00S0000001,4.892546,4.443560,5.551990,6.259900,4.500439,5.071338,5.975176,5.521434,5.731058,...,7.874070,7.963485,7.478669,7.504264,8.016241,7.315137,7.901416,7.793765,7.168345,TTHERM_00000010
1,TETRA00S0000002,4.155223,4.322743,5.444780,7.048722,5.196556,5.269785,8.815474,4.237854,5.501066,...,5.288037,6.251477,5.740091,5.556626,7.331515,5.545165,5.337326,5.807473,5.278063,TTHERM_00000020
2,TETRA00S0000005,11.741508,12.466931,10.989398,11.904770,12.752014,11.212873,12.184765,12.766245,11.847593,...,13.016134,12.895231,12.510574,12.889913,12.757449,12.406200,13.057094,12.805791,12.286649,TTHERM_00000050
3,TETRA00S0000007,4.338246,6.129996,7.063431,5.154676,6.171141,6.495439,7.149044,7.827519,7.825561,...,5.832865,6.392171,5.814609,6.061297,7.035859,6.764728,7.832963,7.766580,6.648373,TTHERM_00000070
4,TETRA00S0000009,11.051141,12.064436,10.428769,10.641567,11.550626,10.440448,10.763831,12.040421,10.316591,...,10.328885,10.308905,11.144679,10.427054,10.351164,11.824022,10.301800,10.402265,11.387048,TTHERM_00001080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15049,TETRA00S0027885,11.332984,7.989309,7.583811,6.325467,8.112645,7.273272,7.121557,6.199861,6.066453,...,5.079160,4.926706,5.644903,5.064956,5.064577,7.788410,4.870764,4.620229,6.965380,TTHERM_01071450
15050,TETRA00S0027886,11.627667,8.295184,7.489789,6.595529,8.086054,7.240285,7.154241,6.347698,5.858260,...,5.046068,5.132836,5.925114,5.157820,5.276132,7.988043,5.210654,5.528121,7.409787,TTHERM_01071450
15054,TETRA00S0027890,8.720702,8.033177,6.383371,6.196040,7.299660,7.066853,7.350990,7.867450,6.150529,...,8.347612,7.079649,9.708094,8.387417,8.216887,10.937766,8.023092,8.998098,10.663220,TTHERM_00716150
15058,TETRA00S0027899,5.267880,4.780226,5.316606,4.366476,4.400169,5.599380,4.168570,5.095629,5.101516,...,4.875067,4.895996,4.751067,5.033749,4.919512,5.016679,4.894289,5.250386,5.165787,TTHERM_00076990


In [133]:
# Drop the 'Probe' column and move TTHERM_ID to the front.
# Columns are chosen by name instead of by position so that re-running this
# cell is a no-op: a positional slice such as `columns[1:-1]` only works while
# 'Probe' is first and 'TTHERM_ID' is last, and on any re-run it would silently
# drop the last sample column.
sample_columns = [
    column for column in filtered2.columns
    if column not in ('Probe', 'TTHERM_ID')
]
filtered2 = filtered2[['TTHERM_ID'] + sample_columns]

In [134]:
# Display the reordered frame (TTHERM_ID first, followed by the sample columns).
filtered2

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Ll_GSM284362,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_00000010,4.892546,4.443560,5.551990,6.259900,4.500439,5.071338,5.975176,5.521434,5.731058,...,6.797309,7.874070,7.963485,7.478669,7.504264,8.016241,7.315137,7.901416,7.793765,7.168345
1,TTHERM_00000020,4.155223,4.322743,5.444780,7.048722,5.196556,5.269785,8.815474,4.237854,5.501066,...,5.794333,5.288037,6.251477,5.740091,5.556626,7.331515,5.545165,5.337326,5.807473,5.278063
2,TTHERM_00000050,11.741508,12.466931,10.989398,11.904770,12.752014,11.212873,12.184765,12.766245,11.847593,...,12.737720,13.016134,12.895231,12.510574,12.889913,12.757449,12.406200,13.057094,12.805791,12.286649
3,TTHERM_00000070,4.338246,6.129996,7.063431,5.154676,6.171141,6.495439,7.149044,7.827519,7.825561,...,6.865911,5.832865,6.392171,5.814609,6.061297,7.035859,6.764728,7.832963,7.766580,6.648373
4,TTHERM_00001080,11.051141,12.064436,10.428769,10.641567,11.550626,10.440448,10.763831,12.040421,10.316591,...,11.377394,10.328885,10.308905,11.144679,10.427054,10.351164,11.824022,10.301800,10.402265,11.387048
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15049,TTHERM_01071450,11.332984,7.989309,7.583811,6.325467,8.112645,7.273272,7.121557,6.199861,6.066453,...,6.265186,5.079160,4.926706,5.644903,5.064956,5.064577,7.788410,4.870764,4.620229,6.965380
15050,TTHERM_01071450,11.627667,8.295184,7.489789,6.595529,8.086054,7.240285,7.154241,6.347698,5.858260,...,6.697903,5.046068,5.132836,5.925114,5.157820,5.276132,7.988043,5.210654,5.528121,7.409787
15054,TTHERM_00716150,8.720702,8.033177,6.383371,6.196040,7.299660,7.066853,7.350990,7.867450,6.150529,...,9.585347,8.347612,7.079649,9.708094,8.387417,8.216887,10.937766,8.023092,8.998098,10.663220
15058,TTHERM_00076990,5.267880,4.780226,5.316606,4.366476,4.400169,5.599380,4.168570,5.095629,5.101516,...,4.625650,4.875067,4.895996,4.751067,5.033749,4.919512,5.016679,4.894289,5.250386,5.165787


In [135]:
# Save the gene-labelled, filtered dataset without the row index.
corrected_replicate_csv_path = 'TGN_2011_replicate_expression_dataset_2006_corrected.csv'
filtered2.to_csv(corrected_replicate_csv_path, index=False)

Did they only use the 60 microarrays that are on the TFGD??

In [136]:
# Columns to leave out of the reduced dataset: seven sample columns plus the
# TTHERM_ID label column, so only the probe ID and remaining samples are kept.
exclude = [
    'S0_GSM647244',
    'S0_GSM647651',
    'S0_GSM647652',
    'S9_GSM647653',
    'S9_GSM647654',
    'S24_GSM647245',
    'C15m_GSM656231',
    'TTHERM_ID',
]

columns = list(filter(lambda column_name: column_name not in exclude, filtered.columns))

In [137]:
# Restrict the frame to the retained columns and report how many there are.
filtered3 = filtered.loc[:, columns]
filtered3.shape[1]

61

In [138]:
# Save the reduced-column dataset without the row index.
sixty_chip_csv_path = 'TGN_2011_replicate_expression_dataset_60_chips.csv'
filtered3.to_csv(sixty_chip_csv_path, index=False)

In [139]:
# Load the watermark IPython extension so the %watermark magic is available.
%load_ext watermark

In [140]:
# Print the versions of the imported packages, for reproducibility.
%watermark --iversions

watermark: 2.3.0
scipy    : 1.7.3
pandas   : 1.3.5
bs4      : 4.10.0
json     : 2.0.9
re       : 2.2.1
requests : 2.27.1

