In [1]:
import subprocess
import re
import json
import watermark
import requests
import bs4

import glob

import scipy.stats as st
import numpy as np

from functools import reduce

## Step 0: install necessary tools

Install [hisat 2.2.1](https://daehwankimlab.github.io/hisat2/download/)

## Step 1: make probe fasta

In [2]:
import pandas as pd

Load the probe dataset

In [4]:
probe_df = pd.read_csv('../../raw_data/2007-02-28_Tetrahymena_expr.ndf', sep='\t')

Take a look

In [5]:
probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


The seq ids are not unique

In [6]:
len(probe_df['SEQ_ID'].values) == len(probe_df['SEQ_ID'].unique())

False

Can find negative controls if need be

In [7]:
probe_df['DESIGN_NOTE'].unique()

array(['rank_selected', nan, '-', 'upper left fiducial', '0 cycles',
       '04 cycles', '08 cycles', '12 cycles', '16 cycles', '20 cycles',
       '24 cycles', '28 cycles', '32 cycles', '36 cycles', '40 cycles',
       '44 cycles', '48 cycles', '52 cycles', '56 cycles', '60 cycles',
       '64 cycles', '68 cycles', '72 cycles', '76 cycles', '80 cycles',
       '84 cycles', '88 cycles', '92 cycles', '96 cycles', '100 cycles',
       '104 cycles', 'synthesis control', 'upper center fiducial',
       'upper right fiducial', 'REPLICATE1', 'EMPTY',
       'upper right chip_id', '+', 'uniformity control',
       'vertical design_id', 'left center fiducial',
       'center cross fiducial', 'right center fiducial',
       'lower center fiducial', 'REPLICATE2', 'lower left fiducial',
       'lower right fiducial', 'lower left chip_id',
       'horizontal design_id'], dtype=object)

In [8]:
probe_df.loc[probe_df['DESIGN_NOTE'] == '-']

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
158,5314_0491_0001,NGS_CONTROLS,-,T/A,ARRAY_QC_A,ACGTCCCCCTCTGGaTGTTCATACGGTATG,10004,1100011,62205230,1,491,control:reseq_qc:synthesis,XENOSYNTH0093,12,5314,491,1
523,5314_0492_0002,NGS_CONTROLS,-,T/C,ARRAY_QC_A,ACGTCCCCCTCTGGcTGTTCATACGGTATG,10005,1100011,62205231,2,492,control:reseq_qc:synthesis,XENOSYNTH0094,12,5314,492,2
818,5314_0491_0003,NGS_CONTROLS,-,T/G,ARRAY_QC_A,ACGTCCCCCTCTGGgTGTTCATACGGTATG,10006,1100011,62205232,3,491,control:reseq_qc:synthesis,XENOSYNTH0095,12,5314,491,3
1190,5314_0492_0004,NGS_CONTROLS,-,T/T,ARRAY_QC_A,ACGTCCCCCTCTGGtTGTTCATACGGTATG,10007,1100011,62205233,4,492,control:reseq_qc:synthesis,XENOSYNTH0096,12,5314,492,4
6079,5314_0341_0017,NGS_CONTROLS,-,G/A,ARRAY_QC_C,GCGCGGCGTTGGACaTCTGACTAATACATCAA,10004,1100089,62205854,17,341,control:reseq_qc:synthesis,XENOSYNTH0717,90,5314,341,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389799,5314_0434_1016,NGS_CONTROLS,-,G/T,ARRAY_QC_C,ATGTGCGCGGCGTTtGACGTCTGACTAAT,10007,1100093,62205889,1016,434,control:reseq_qc:synthesis,XENOSYNTH0752,94,5314,434,1016
390294,5314_0661_1017,NGS_CONTROLS,-,A/A,LABEL_QC_B,ATTAGGCCCTTCGCaCGCAGCGGCGTGCG,10004,1100155,62206382,1017,661,control:reseq_qc:label,XENOLABEL0445,56,5314,661,1017
390675,5314_0662_1018,NGS_CONTROLS,-,A/C,LABEL_QC_B,ATTAGGCCCTTCGCcCGCAGCGGCGTGCG,10005,1100155,62206383,1018,662,control:reseq_qc:label,XENOLABEL0446,56,5314,662,1018
391056,5314_0661_1019,NGS_CONTROLS,-,A/G,LABEL_QC_B,ATTAGGCCCTTCGCgCGCAGCGGCGTGCG,10006,1100155,62206384,1019,661,control:reseq_qc:label,XENOLABEL0447,56,5314,661,1019


Example of many probes to a single id

In [9]:
probe_df.loc[probe_df['SEQ_ID']=='TETRA00S0021925']

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
12641,5314_0474_0034,BLOCK1,rank_selected,rank:14;score:346;uniq:13;count:37;freq:00;rul...,TETRA00S0021925,TAATAATTTAATAGCGGATAGTCGATAATGTCAAACAGCATTTAAA...,0,64456191,64456191,34,474,experimental,TETRAP00318579,850,5314,474,34
67117,5314_0450_0176,BLOCK1,rank_selected,rank:08;score:378;uniq:05;count:37;freq:00;rul...,TETRA00S0021925,TCAAATATGTACCCCATTACATATCAAACAATGATGAGTTAAAACC...,0,64456194,64456194,176,450,experimental,TETRAP00318582,1777,5314,450,176
82746,5314_0219_0217,BLOCK1,rank_selected,rank:12;score:353;uniq:00;count:37;freq:00;rul...,TETRA00S0021925,CTGTCAGCTCAATCTTCTTACTTTCTGATGGTCAGGACAATAATTC...,0,64456200,64456200,217,219,experimental,TETRAP00318588,5102,5314,219,217
128974,5314_0515_0337,BLOCK1,rank_selected,rank:05;score:423;uniq:06;count:37;freq:00;rul...,TETRA00S0021925,AATTTAGCAATATGAAATCAATAACTAGGCCAAGATATATGCACAA...,0,64456196,64456196,337,515,experimental,TETRAP00318584,2706,5314,515,337
149412,5314_0688_0390,BLOCK1,rank_selected,rank:02;score:487;uniq:09;count:37;freq:00;rul...,TETRA00S0021925,TCGTTTTAGAAATATAGATAGGGTGGAGTTACCAATAGATTGATTA...,0,64456190,64456190,390,688,experimental,TETRAP00318578,742,5314,688,390
203654,5314_0520_0532,BLOCK1,rank_selected,rank:13;score:349;uniq:00;count:37;freq:00;rul...,TETRA00S0021925,TTGATTAAAATGGCAATGATGCTTTCAATCAAAGTGATGATTTAGC...,0,64456192,64456192,532,520,experimental,TETRAP00318580,1133,5314,520,532
208163,5314_0358_0544,BLOCK1,rank_selected,rank:04;score:433;uniq:09;count:37;freq:00;rul...,TETRA00S0021925,GCAATTCATAACATAATTAGATTTAATTCTATCCATAGCCAACATC...,0,64456193,64456193,544,358,experimental,TETRAP00318581,1487,5314,358,544
208452,5314_0167_0545,BLOCK1,rank_selected,rank:03;score:466;uniq:10;count:37;freq:00;rul...,TETRA00S0021925,GAATAGCATAATAATGGATGGCATTATCATAGAGAGTATTGACAAG...,0,64456198,64456198,545,167,experimental,TETRAP00318586,3792,5314,167,545
221084,5314_0088_0578,BLOCK1,rank_selected,rank:11;score:353;uniq:08;count:37;freq:00;rul...,TETRA00S0021925,TGAATATATAGATGGTTAAAACAATATAATTTACGACTCGAATGAG...,0,64456197,64456197,578,88,experimental,TETRAP00318585,3168,5314,88,578


The PROBE_IDs are also not unique

In [10]:
len(probe_df['PROBE_ID'].values) == len(probe_df['PROBE_ID'].unique())

False

In [11]:
probe_df[probe_df.duplicated(subset=['PROBE_ID'])]

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
284,5314_0005_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000197,62205062,2,5,fiducial,CPK6,0,5314,5,2
285,5314_0007_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000198,62205063,2,7,fiducial,CPK6,0,5314,7,2
286,5314_0009_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000199,62205064,2,9,fiducial,CPK6,0,5314,9,2
287,5314_0011_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000200,62205065,2,11,fiducial,CPK6,0,5314,11,2
288,5314_0013_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000201,62205066,2,13,fiducial,CPK6,0,5314,13,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392507,5314_0118_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060094,63771045,3,31,encoded number,empty,15,5314,118,1024
392508,5314_0120_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060100,63771051,3,33,encoded number,empty,16,5314,120,1024
392509,5314_0122_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060106,63771057,3,35,encoded number,empty,17,5314,122,1024
392510,5314_0124_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060112,63771063,3,37,encoded number,empty,18,5314,124,1024


In [12]:
probe_df['PROBE_CLASS'].unique()

array(['experimental', nan, 'control:reseq_qc:synthesis', 'fiducial',
       'linker', 'synthesis', 'control:sample_tracking:A',
       'control:empty', 'encoded number', 'control:reseq_qc:label',
       'uniformity', 'control', 'control:sample_tracking:B'], dtype=object)

In [13]:
probe_df[probe_df.duplicated(subset=['PROBE_ID'])]['PROBE_CLASS'].unique()

array(['fiducial', 'linker', 'synthesis', 'control:empty',
       'encoded number', 'uniformity', 'control',
       'control:sample_tracking:B'], dtype=object)

These are all controls of various sorts, etc. and I can exclude them.

In [14]:
experimental_probe_df = probe_df.loc[probe_df['PROBE_CLASS']=='experimental']

In [15]:
experimental_probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


In [16]:
len(probe_df)

392778

In [17]:
len(experimental_probe_df)

384999

In [18]:
experimental_probe_df['MISMATCH'].unique()

array([0])

Extract the probe ids and sequences to build a fasta file

In [19]:
probe_ids = experimental_probe_df['PROBE_ID'].values
probe_seqs = experimental_probe_df['PROBE_SEQUENCE'].values

Build the fasta file

In [20]:
with open('./2007-02-28_microarray_experimental_probes.fna', 'w') as f:
    for i, p in zip(probe_ids, probe_seqs):
        f.write(f">{i}\n")
        f.write(f"{p}\n\n")

## Step 2: use hisat 2 to align probes to newest genome

Note: running this on Mac OS.

Bash command to index the 2021 _T. thermophila_ genome CDS fasta

In [21]:
index_genome_command = "hisat2-build -f ../../raw_data/Tthermophila_MAC_CDS_2021.fasta ttherm_2021"

In [22]:
index_genome_command.split()

['hisat2-build',
 '-f',
 '../../raw_data/Tthermophila_MAC_CDS_2021.fasta',
 'ttherm_2021']

In [23]:
r = subprocess.run(args=index_genome_command.split(), capture_output=True)

In [24]:
print(r.stdout.decode('utf-8'))

Building DifferenceCoverSample
  Building sPrime
  Building sPrimeOrder
  V-Sorting samples
  V-Sorting samples time: 00:00:00
  Allocating rank array
  Ranking v-sort output
  Ranking v-sort output time: 00:00:00
  Invoking Larsson-Sadakane on ranks
  Invoking Larsson-Sadakane on ranks time: 00:00:01
  Sanity-checking and returning
Building samples
Reserving space for 12 sample suffixes
Generating random suffixes
QSorting 12 sample offsets, eliminating duplicates
QSorting sample offsets, eliminating duplicates time: 00:00:00
Multikey QSorting 12 samples
  (Using difference cover)
  Multikey QSorting samples time: 00:00:00
Calculating bucket sizes
Splitting and merging
  Splitting and merging time: 00:00:00
Avg bucket size: 6.06217e+06 (target: 9093253)
Getting block 1 of 8
  Reserving size (9093254) for bucket 1
  Calculating Z arrays for bucket 1
  Entering block accumulator loop for bucket 1:
  bucket 1: 10%
  bucket 1: 20%
  bucket 1: 30%
  bucket 1: 40%
  bucket 1: 50%
  bucket 1:

In [25]:
print(r.stderr.decode('utf-8'))

Settings:
  Output files: "ttherm_2021.*.ht2"
  Line rate: 6 (line is 64 bytes)
  Lines per side: 1 (side is 64 bytes)
  Offset rate: 4 (one in 16)
  FTable chars: 10
  Strings: unpacked
  Local offset rate: 3 (one in 8)
  Local fTable chars: 6
  Local sequence length: 57344
  Local sequence overlap between two consecutive indexes: 1024
  Endianness: little
  Actual local endianness: little
  Sanity checking: disabled
  Assertions: disabled
  Random seed: 0
  Sizeofs: void*:8, int:4, long:8, size_t:8
Input files DNA, FASTA:
  ../../raw_data/Tthermophila_MAC_CDS_2021.fasta
Reading reference sizes
  Time reading reference sizes: 00:00:00
Calculating joined length
Writing header
Reserving space for joined string
Joining reference sequences
  Time to join reference sequences: 00:00:01
  Time to read SNPs and splice sites: 00:00:00
Using parameters --bmax 9093254 --dcv 1024
  Doing ahead-of-time memory usage test
  Passed!  Constructing with these parameters: --bmax 9093254 --dcv 1024
Const

Bash command to align the probe sequences to the CDS regions

In [26]:
align_to_genome_command = """
hisat2 -f -x ttherm_2021 --no-hd
-U ./2007-02-28_microarray_experimental_probes.fna 
-S microarray_probe_alignment.sam"""

In [27]:
align_to_genome_command.split()

['hisat2',
 '-f',
 '-x',
 'ttherm_2021',
 '--no-hd',
 '-U',
 './2007-02-28_microarray_experimental_probes.fna',
 '-S',
 'microarray_probe_alignment.sam']

In [28]:
r2 = subprocess.run(args=align_to_genome_command.split(), capture_output=True)

In [29]:
print(r2.stdout.decode('utf-8'))




In [30]:
print(r2.stderr.decode('utf-8'))

384999 reads; of these:
  384999 (100.00%) were unpaired; of these:
    77143 (20.04%) aligned 0 times
    297488 (77.27%) aligned exactly 1 time
    10368 (2.69%) aligned >1 times
79.96% overall alignment rate



In [31]:
with open('./microarray_probe_alignment.sam', 'r') as f:
    lines = f.readlines()
    single_alignments = [l for l in lines if l.split()[-1] == 'NH:i:1']

Sanity check that there are fewer single alignments than total alignments

In [32]:
len(lines)

405651

In [33]:
lines[0]

'TETRAP00318583\t0\tTTHERM_00709600\t807\t60\t60M\t*\t0\t0\tAGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGCAAATATTTTAAGCC\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:0\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:60\tYT:Z:UU\tNH:i:1\n'

In [34]:
test = []
for l in lines:
    if l.split()[2] == 'TTHERM_000486279':
        test.append(l.split()[-1])

In [35]:
test

['NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1']

In [36]:
re.search(r'NH:i:1$', test[0])

<re.Match object; span=(0, 6), match='NH:i:1'>

In [37]:
len(single_alignments)

297488

In [38]:
test = []
for l in single_alignments:
    if l.split()[2] == 'TTHERM_000486279':
        test.append((l.split()[0], l.split()[2]))

In [39]:
print(test)

[('TETRAP00177701', 'TTHERM_000486279'), ('TETRAP00177699', 'TTHERM_000486279'), ('TETRAP00177705', 'TTHERM_000486279'), ('TETRAP00177703', 'TTHERM_000486279'), ('TETRAP00177702', 'TTHERM_000486279'), ('TETRAP00177704', 'TTHERM_000486279'), ('TETRAP00177697', 'TTHERM_000486279'), ('TETRAP00177700', 'TTHERM_000486279')]


In [40]:
single_alignments[0].split()

['TETRAP00318583',
 '0',
 'TTHERM_00709600',
 '807',
 '60',
 '60M',
 '*',
 '0',
 '0',
 'AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGCAAATATTTTAAGCC',
 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
 'AS:i:0',
 'XN:i:0',
 'XM:i:0',
 'XO:i:0',
 'XG:i:0',
 'NM:i:0',
 'MD:Z:60',
 'YT:Z:UU',
 'NH:i:1']

Take a look at the formatting

In [41]:
single_alignments[0].split()

['TETRAP00318583',
 '0',
 'TTHERM_00709600',
 '807',
 '60',
 '60M',
 '*',
 '0',
 '0',
 'AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGCAAATATTTTAAGCC',
 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
 'AS:i:0',
 'XN:i:0',
 'XM:i:0',
 'XO:i:0',
 'XG:i:0',
 'NM:i:0',
 'MD:Z:60',
 'YT:Z:UU',
 'NH:i:1']

Build probe_id to ttherm_id dictionary

In [42]:
single_aligned_probes = [l.split()[0] for l in single_alignments]

In [43]:
align_dict = {}
for al in single_alignments:
    s = al.split()
    align_dict[s[0]] = s[2]

In [44]:
align_dict['TETRAP00177701']

'TTHERM_000486279'

Build seq_id to probe_id dictionary

In [45]:
seq_probe_dict = {s: p for p, s in zip(experimental_probe_df['PROBE_ID'].values, experimental_probe_df['SEQ_ID'].values)}

In [46]:
probe_seq_dict = {p: s for p, s in zip(experimental_probe_df['PROBE_ID'].values, experimental_probe_df['SEQ_ID'].values)}

In [47]:
probe_seq_dict['TETRAP00177701']

'TETRA00S0012296'

In [48]:
seq_probe_dict['TETRA00S0021925']

'TETRAP00318587'

Build a seq to gene dict

In [49]:
for probe, seq in probe_seq_dict.items():
    if probe in align_dict.keys():
        if probe in [t[0] for t in test]:
            print(align_dict[probe])

TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279


In [50]:
seq_gene_dict = {}
gene_seq_dict = {}
for probe, seq in probe_seq_dict.items():
#     print(probe, seq)
    if probe in align_dict.keys():
        genes = seq_gene_dict.get(seq, [])
        gene = align_dict[probe]
        genes.append(gene)
        seq_gene_dict[seq] = genes
        
#         seq_gene_dict[seq] = align_dict[probe]
        
        seqs = gene_seq_dict.get(align_dict[probe], [])
        seqs.append(seq)
        gene_seq_dict[align_dict[probe]] = seqs

In [51]:
list(seq_gene_dict.items())[:5]

[('TETRA00S0021925',
  ['TTHERM_00709600',
   'TTHERM_00709600',
   'TTHERM_000709619',
   'TTHERM_00709600',
   'TTHERM_00709600',
   'TTHERM_00709600',
   'TTHERM_000709599',
   'TTHERM_000709599',
   'TTHERM_000709619',
   'TTHERM_000709619']),
 ('TETRA00S0012676',
  ['TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480']),
 ('TETRA00S0000062',
  ['TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620']),
 ('TETRA00S0006635',
  ['TTHERM_01013320',
   'TTHERM_01013320',
   'TTHERM_01013320',
   'TTHERM_01013320',
   

In [52]:
list(gene_seq_dict.items())[:5]

[('TTHERM_00709600',
  ['TETRA00S0021925',
   'TETRA00S0021925',
   'TETRA00S0021925',
   'TETRA00S0021925',
   'TETRA00S0021925']),
 ('TTHERM_00529480',
  ['TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676']),
 ('TTHERM_00002620',
  ['TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062']),
 ('TTHERM_01013320',
  ['TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETR

Check that all probes are accounted for

In [53]:
len(seq_gene_dict)

23997

In [54]:
probe_count = 0
for v in gene_seq_dict.values():
    probe_count += len(v)
probe_count

297488

In [55]:
gene_seq_dict['TTHERM_000486279']

['TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296']

In [56]:
gene_seq_dict['TTHERM_00486270']

['TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296']

In [57]:
seq_gene_dict['TETRA00S0012296']

['TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_00486270',
 'TTHERM_00486270',
 'TTHERM_000486279',
 'TTHERM_00486270',
 'TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_00486270',
 'TTHERM_00486270',
 'TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_00486270']

In [58]:
gene_seq_dict['TTHERM_00321680']

['TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026']

In [59]:
seq_gene_dict['TETRA00S0011026']

['TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680']

There are cases where the measurement, as identified by the sequence (which is what we have in the GEO dataset), actually hits multiple genes in the current genome. We can't use these at all because we don't know which gene they're supposed to be hitting. Hence, we can use the probe IDs directly (align_dict) with the newly QC'ed microarray data

## Step 3: Load QC'ed data and correct the gene calls 

In [60]:
qc_rma = pd.read_csv('../microarray_QC/QC_probe_rma_values.csv')
qc_rma.head()

Unnamed: 0.1,Unnamed: 0,C0_GSM285570.xys,C0_GSM285586.xys,C0_GSM656230.xys,C10_GSM285578.xys,C10_GSM285591.xys,C12_GSM285579.xys,C12_GSM285592.xys,C12_GSM656237.xys,C14_GSM285580.xys,...,S3_GSM285542.xys,S3_GSM285555.xys,S3_GSM285562.xys,S6_GSM285543.xys,S6_GSM285556.xys,S6_GSM285563.xys,S9_GSM285544.xys,S9_GSM285564.xys,S9_GSM647653.xys,S9_GSM647654.xys
0,TETRA00S0000001,10.806513,11.138391,9.882465,5.94222,6.139343,6.230995,6.432716,6.445105,7.540559,...,9.176775,8.68717,8.114607,10.645298,9.888239,7.690063,9.659395,8.541027,10.080228,10.118915
1,TETRA00S0000002,8.583234,11.779033,5.605695,4.952352,5.570084,5.137474,5.778136,5.346029,4.975534,...,7.458952,8.128677,5.653863,8.175701,6.951447,5.889877,6.553791,5.770251,7.812986,6.93978
2,TETRA00S0000003,6.025072,7.702754,4.742815,4.80348,4.818365,5.19589,4.86651,4.611808,4.897123,...,7.575806,6.778402,5.568907,9.521188,8.563302,5.364381,8.466416,5.508847,8.719552,8.750582
3,TETRA00S0000004,7.111901,7.54114,6.695119,6.580019,6.78257,7.864984,8.118854,7.031177,8.234402,...,7.136329,7.592605,7.831403,7.179601,6.730384,7.635805,7.364327,7.835547,6.767574,6.911135
4,TETRA00S0000005,13.19868,12.724825,13.392404,13.127016,12.632725,12.658993,12.69613,12.845982,13.065571,...,12.263783,12.71262,12.643596,12.461448,12.707265,13.058492,12.865136,13.112395,12.890727,12.919449


In [61]:
qc_rma['PROBE_ID'] = [seq_probe_dict[s] for s in qc_rma['Unnamed: 0'].values]

In [65]:
qc_rma['TTHERM_ID'] = [align_dict[p] if p in align_dict.keys() else 'NA' for p in qc_rma['PROBE_ID'].values]

In [66]:
aligned_qc_rma = qc_rma.loc[qc_rma['TTHERM_ID'] != 'NA']
len(aligned_qc_rma)

21415

In [67]:
len(qc_rma)

28064

In [68]:
aligned_qc_rma.head()

Unnamed: 0.1,Unnamed: 0,C0_GSM285570.xys,C0_GSM285586.xys,C0_GSM656230.xys,C10_GSM285578.xys,C10_GSM285591.xys,C12_GSM285579.xys,C12_GSM285592.xys,C12_GSM656237.xys,C14_GSM285580.xys,...,S3_GSM285562.xys,S6_GSM285543.xys,S6_GSM285556.xys,S6_GSM285563.xys,S9_GSM285544.xys,S9_GSM285564.xys,S9_GSM647653.xys,S9_GSM647654.xys,PROBE_ID,TTHERM_ID
0,TETRA00S0000001,10.806513,11.138391,9.882465,5.94222,6.139343,6.230995,6.432716,6.445105,7.540559,...,8.114607,10.645298,9.888239,7.690063,9.659395,8.541027,10.080228,10.118915,TETRAP00000004,TTHERM_00000010
1,TETRA00S0000002,8.583234,11.779033,5.605695,4.952352,5.570084,5.137474,5.778136,5.346029,4.975534,...,5.653863,8.175701,6.951447,5.889877,6.553791,5.770251,7.812986,6.93978,TETRAP00000019,TTHERM_00000020
2,TETRA00S0000003,6.025072,7.702754,4.742815,4.80348,4.818365,5.19589,4.86651,4.611808,4.897123,...,5.568907,9.521188,8.563302,5.364381,8.466416,5.508847,8.719552,8.750582,TETRAP00000043,TTHERM_00000030
3,TETRA00S0000004,7.111901,7.54114,6.695119,6.580019,6.78257,7.864984,8.118854,7.031177,8.234402,...,7.831403,7.179601,6.730384,7.635805,7.364327,7.835547,6.767574,6.911135,TETRAP00000052,TTHERM_00000040
4,TETRA00S0000005,13.19868,12.724825,13.392404,13.127016,12.632725,12.658993,12.69613,12.845982,13.065571,...,12.643596,12.461448,12.707265,13.058492,12.865136,13.112395,12.890727,12.919449,TETRAP00000063,TTHERM_000000045


In [69]:
aligned_qc_rma = aligned_qc_rma.rename(columns={c: c.split('.')[0] for c in aligned_qc_rma.columns})

In [70]:
aligned_qc_rma.head()

Unnamed: 0.1,Unnamed: 0,C0_GSM285570,C0_GSM285586,C0_GSM656230,C10_GSM285578,C10_GSM285591,C12_GSM285579,C12_GSM285592,C12_GSM656237,C14_GSM285580,...,S3_GSM285562,S6_GSM285543,S6_GSM285556,S6_GSM285563,S9_GSM285544,S9_GSM285564,S9_GSM647653,S9_GSM647654,PROBE_ID,TTHERM_ID
0,TETRA00S0000001,10.806513,11.138391,9.882465,5.94222,6.139343,6.230995,6.432716,6.445105,7.540559,...,8.114607,10.645298,9.888239,7.690063,9.659395,8.541027,10.080228,10.118915,TETRAP00000004,TTHERM_00000010
1,TETRA00S0000002,8.583234,11.779033,5.605695,4.952352,5.570084,5.137474,5.778136,5.346029,4.975534,...,5.653863,8.175701,6.951447,5.889877,6.553791,5.770251,7.812986,6.93978,TETRAP00000019,TTHERM_00000020
2,TETRA00S0000003,6.025072,7.702754,4.742815,4.80348,4.818365,5.19589,4.86651,4.611808,4.897123,...,5.568907,9.521188,8.563302,5.364381,8.466416,5.508847,8.719552,8.750582,TETRAP00000043,TTHERM_00000030
3,TETRA00S0000004,7.111901,7.54114,6.695119,6.580019,6.78257,7.864984,8.118854,7.031177,8.234402,...,7.831403,7.179601,6.730384,7.635805,7.364327,7.835547,6.767574,6.911135,TETRAP00000052,TTHERM_00000040
4,TETRA00S0000005,13.19868,12.724825,13.392404,13.127016,12.632725,12.658993,12.69613,12.845982,13.065571,...,12.643596,12.461448,12.707265,13.058492,12.865136,13.112395,12.890727,12.919449,TETRAP00000063,TTHERM_000000045


In [72]:
aligned_qc_rma.loc[aligned_qc_rma.duplicated(subset=['PROBE_ID'])]

Unnamed: 0.1,Unnamed: 0,C0_GSM285570,C0_GSM285586,C0_GSM656230,C10_GSM285578,C10_GSM285591,C12_GSM285579,C12_GSM285592,C12_GSM656237,C14_GSM285580,...,S3_GSM285562,S6_GSM285543,S6_GSM285556,S6_GSM285563,S9_GSM285544,S9_GSM285564,S9_GSM647653,S9_GSM647654,PROBE_ID,TTHERM_ID


All probes are unique!

In [75]:
list(aligned_qc_rma.columns[1:-2])

['C0_GSM285570',
 'C0_GSM285586',
 'C0_GSM656230',
 'C10_GSM285578',
 'C10_GSM285591',
 'C12_GSM285579',
 'C12_GSM285592',
 'C12_GSM656237',
 'C14_GSM285580',
 'C14_GSM285593',
 'C14_GSM656238',
 'C16_GSM285582',
 'C16_GSM285595',
 'C16_GSM656239',
 'C18_GSM285583',
 'C18_GSM285596',
 'C18_GSM656240',
 'C4_GSM285574',
 'C4_GSM285588',
 'C4_GSM656234',
 'C6_GSM285575',
 'C6_GSM656232',
 'C8_GSM285576',
 'C8_GSM285590',
 'C8_GSM656236',
 'Lh_GSM283691',
 'Lh_GSM284360',
 'Lh_GSM284364',
 'Ll_GSM283687',
 'Ll_GSM284355',
 'Lm_GSM283690',
 'Lm_GSM284357',
 'Lm_GSM284363',
 'S0_GSM285363',
 'S0_GSM285554',
 'S0_GSM285561',
 'S0_GSM647651',
 'S0_GSM647652',
 'S15_GSM285559',
 'S15_GSM285566',
 'S24_GSM285547',
 'S24_GSM285560',
 'S3_GSM285542',
 'S3_GSM285555',
 'S3_GSM285562',
 'S6_GSM285543',
 'S6_GSM285556',
 'S6_GSM285563',
 'S9_GSM285544',
 'S9_GSM285564',
 'S9_GSM647653',
 'S9_GSM647654']

In [78]:
with open('../../raw_data/microarray_accessions_all.json', 'r') as f:
    # Including single REP measurement for C-15m (GSM656231) even though there are no replicates for it
    # in order to replicate the 2011 analysis
    # Format: keys are the physiological phase; values are the geo accessions for each microarray
    all_geo = json.load(f)

In [79]:
all_geo

{'Ll': ['GSM283687', 'GSM284355', 'GSM284362'],
 'Lm': ['GSM283690', 'GSM284357', 'GSM284363'],
 'Lh': ['GSM283691', 'GSM284360', 'GSM284364'],
 'S0': ['GSM285363',
  'GSM285554',
  'GSM285561',
  'GSM647244',
  'GSM647651',
  'GSM647652'],
 'S3': ['GSM285542', 'GSM285555', 'GSM285562'],
 'S6': ['GSM285543', 'GSM285556', 'GSM285563'],
 'S9': ['GSM285544', 'GSM285557', 'GSM285564', 'GSM647653', 'GSM647654'],
 'S12': ['GSM285545', 'GSM285558', 'GSM285565'],
 'S15': ['GSM285546', 'GSM285559', 'GSM285566'],
 'S24': ['GSM285547', 'GSM285560', 'GSM285567', 'GSM647245'],
 'C0': ['GSM285570', 'GSM285586', 'GSM656230'],
 'C15m': ['GSM656231'],
 'C2': ['GSM285572', 'GSM285587', 'GSM656233'],
 'C4': ['GSM285574', 'GSM285588', 'GSM656234'],
 'C6': ['GSM285575', 'GSM285589', 'GSM656232'],
 'C8': ['GSM285576', 'GSM285590', 'GSM656236'],
 'C10': ['GSM285578', 'GSM285591', 'GSM656235'],
 'C12': ['GSM285579', 'GSM285592', 'GSM656237'],
 'C14': ['GSM285580', 'GSM285593', 'GSM656238'],
 'C16': ['GSM28558

In [80]:
col_names = []

for key in all_geo:
    for val in all_geo[key]:
        col_names.append(f'{key}_{val}')

In [84]:
ordered_columns = [c for c in col_names if c in aligned_qc_rma.columns[1:-2]]

In [85]:
tidy_aligned_qc_rma_df = aligned_qc_rma[['TTHERM_ID'] + list(ordered_columns)]
tidy_aligned_qc_rma_df.head()

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,S0_GSM285363,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
0,TTHERM_00000010,4.97725,4.700894,6.147231,4.561489,4.650328,5.771366,5.444103,5.476506,9.486278,...,6.445105,7.540559,7.647457,7.481482,7.190172,7.777746,7.115077,7.64726,7.570432,7.016953
1,TTHERM_00000020,4.610397,4.599924,7.004953,5.177869,4.731092,8.479832,4.476522,4.876978,8.636355,...,5.346029,4.975534,5.92653,5.350573,5.258042,7.16272,5.248086,5.091658,5.584016,5.0267
2,TTHERM_00000030,4.573324,4.485747,5.022181,4.549989,4.836871,5.895068,4.488806,4.708944,8.796035,...,4.611808,4.897123,4.623331,4.891631,4.752432,4.783482,4.469749,4.816814,4.920257,4.621033
3,TTHERM_00000040,7.842499,7.68388,7.931735,7.651553,7.512039,7.17733,7.45281,7.310263,7.279969,...,7.031177,8.234402,8.125958,7.519027,7.181691,7.574957,6.85736,7.564313,7.447906,7.126748
4,TTHERM_000000045,11.407056,12.160098,11.588117,12.389438,11.62799,11.869775,12.417156,12.242265,12.659424,...,12.845982,13.065571,12.924649,12.558395,12.919496,12.813026,12.521966,13.101503,12.877656,12.391555


## Step 4: Get geometric means for genes that are probes multiple times as save dataframes for different phase groupings

In [86]:
tidy_aligned_qc_rma_df.loc[tidy_aligned_qc_rma_df.duplicated(subset=['TTHERM_ID'])]

Unnamed: 0,TTHERM_ID,Ll_GSM283687,Ll_GSM284355,Lm_GSM283690,Lm_GSM284357,Lm_GSM284363,Lh_GSM283691,Lh_GSM284360,Lh_GSM284364,S0_GSM285363,...,C12_GSM656237,C14_GSM285580,C14_GSM285593,C14_GSM656238,C16_GSM285582,C16_GSM285595,C16_GSM656239,C18_GSM285583,C18_GSM285596,C18_GSM656240
25,TTHERM_000001243,11.010224,10.615034,11.571942,10.914245,11.028227,11.567426,10.286103,11.119518,12.999794,...,11.477410,11.944433,12.444798,11.999280,11.858337,12.189705,10.653678,11.797972,11.989084,11.543787
46,TTHERM_00001480,4.604156,4.649048,4.947534,4.984898,4.697049,4.839143,4.547329,4.462486,5.059207,...,5.781916,4.987976,4.947308,4.910165,5.222816,5.204991,5.380697,5.563786,5.433355,5.709609
49,TTHERM_000001490,5.661733,5.468077,5.022218,5.299499,5.211810,6.119225,4.987920,5.570107,5.545534,...,4.905782,5.336420,5.178425,5.456848,5.556015,5.517316,5.786967,5.525727,5.446647,6.025336
51,TTHERM_000001490,4.978872,5.039225,4.769650,5.056447,4.887250,5.158080,4.515884,5.035027,4.676271,...,4.706774,4.651592,4.819671,4.842227,4.935135,4.539516,4.889064,4.987904,4.839688,4.722618
52,TTHERM_000001490,6.377411,6.484921,5.047821,6.103175,5.827730,7.181782,5.303707,5.387099,6.408652,...,6.140583,5.660803,5.030208,6.690558,5.949094,5.073415,7.150277,5.380938,5.490533,6.985870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27994,TTHERM_01165210,4.961442,5.467384,5.209268,5.901941,5.173997,5.284308,4.789263,4.409054,4.725652,...,4.721218,4.964579,4.390811,5.480340,4.532945,4.756447,4.816476,5.480938,4.516903,4.900749
27995,TTHERM_01165210,4.788406,5.623849,5.280782,5.334104,5.044578,5.601404,4.837998,4.607034,4.633221,...,4.794042,5.029986,4.630788,5.712666,4.481300,4.583400,4.985547,5.341226,4.625252,5.065370
28004,TTHERM_00648600,4.528655,4.859361,4.636289,4.614241,5.140115,5.000056,4.771473,4.466387,4.644313,...,4.472405,4.602740,4.685350,5.139992,4.614345,4.399709,4.683272,4.586994,4.795928,4.556415
28018,TTHERM_01165210,5.123335,5.929288,4.897653,5.371273,4.481371,5.999508,4.846173,4.493824,4.758171,...,4.981491,4.997924,4.595175,5.898437,4.471635,4.951884,4.490945,5.189611,4.437325,4.817895


In [87]:
aggregated_tidy_aligned_qc_rma_df = tidy_aligned_qc_rma_df.groupby('TTHERM_ID').aggregate(st.mstats.gmean).reset_index()

In [88]:
len(aggregated_tidy_aligned_qc_rma_df)

20038

In [91]:
growth = ['Ll', 'Lm', 'Lh']
starvation = ['S0', 'S3', 'S6', 'S9', 'S12', 'S15', 'S24']
conjugation = ['C0', 'C15m', 'C2', 'C4', 'C6', 'C8', 'C10', 'C12', 'C14', 'C16', 'C18']
vegetative = grow + starve

grow_cols = [c for c in aggregated_tidy_aligned_qc_rma_df.columns if c.split('_')[0] in growth]
starve_cols = [c for c in aggregated_tidy_aligned_qc_rma_df.columns if c.split('_')[0] in starvation]
sex_cols = [c for c in aggregated_tidy_aligned_qc_rma_df.columns if c.split('_')[0] in conjugation]
veg_cols = [c for c in aggregated_tidy_aligned_qc_rma_df.columns if c.split('_')[0] in vegetative]

In [92]:
aggregated_tidy_aligned_qc_rma_df_grow = aggregated_tidy_aligned_qc_rma_df[['TTHERM_ID'] + grow_cols]
aggregated_tidy_aligned_qc_rma_df_starve = aggregated_tidy_aligned_qc_rma_df[['TTHERM_ID'] + starve_cols]
aggregated_tidy_aligned_qc_rma_df_sex = aggregated_tidy_aligned_qc_rma_df[['TTHERM_ID'] + sex_cols]
aggregated_tidy_aligned_qc_rma_df_veg = aggregated_tidy_aligned_qc_rma_df[['TTHERM_ID'] + veg_cols]

In [97]:
aggregated_tidy_aligned_qc_rma_df.to_csv('./agg_tidy_2021aligned_qc_rma_expression_full.csv', index=False)
aggregated_tidy_aligned_qc_rma_df_grow.to_csv('./agg_tidy_2021aligned_qc_rma_expression_grow.csv', index=False)
aggregated_tidy_aligned_qc_rma_df_starve.to_csv('./agg_tidy_2021aligned_qc_rma_expression_starve.csv', index=False)
aggregated_tidy_aligned_qc_rma_df_veg.to_csv('./agg_tidy_2021aligned_qc_rma_expression_veg.csv', index=False)
aggregated_tidy_aligned_qc_rma_df_sex.to_csv('./agg_tidy_2021aligned_qc_rma_expression_sex.csv', index=False)

In [95]:
%load_ext watermark

In [96]:
%watermark --iversions

scipy    : 1.7.3
numpy    : 1.21.2
json     : 2.0.9
requests : 2.27.1
bs4      : 4.10.0
watermark: 2.3.0
pandas   : 1.3.5
re       : 2.2.1

