In [1]:
import subprocess
import re
import json
import watermark
import requests
import bs4

import glob

import scipy.stats as st
import numpy as np

from functools import reduce

## Step 0: install necessary tools

Install [hisat 2.2.1](https://daehwankimlab.github.io/hisat2/download/) in addition to conda environment

## Step 1: make probe fasta

In [2]:
import pandas as pd

Load the probe dataset

The .ndf file describes the design of the Roche Nimblegen microarray chips that were used in this study. This file is renamed from the GPL6759.ndf file corresponding to the raw data repository GEO accession GSE11300, which can be downloaded from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file

In [3]:
probe_df = pd.read_csv('../../new_raw_data/GSE11300/GPL6759.ndf.gz', compression='gzip', sep='\t')

Take a look

In [4]:
probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


Note: the SEQ_ID is what the chip designers thought the genes were (later converted to TTHERM_ format); the PROBE_ID is unique for each PROBE_SEQUENCE. Consequently, for all of the quality control and filtering, we want to work with the PROBE_IDs, which we can then map to our current understanding of the genome by their sequence.

The seq ids are not unique because multiple probes can target a single gene. Keep in mind that our understanding of what genes exist has dramatically changed since these microarrays were designed.

In [5]:
len(probe_df)

392778

This length is indicative of a 1:2 design for the chip. See http://mtweb.cs.ucl.ac.uk/mus/mus/binnaz/CNV/NimbleGene/DATA/OID8421-2/Documentation/NimbleGen_data_formats.pdf for more info

In [6]:
probe_df.loc[probe_df['SEQ_ID'] == 'TETRA00S0000001']

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
6400,5314_0222_0018,BLOCK1,rank_selected,rank:04;score:424;uniq:15;count:37;freq:00;rul...,TETRA00S0000001,CATACAATCTTCTAATTACTTGCACTAGAGATTTTCAACGTATTGG...,0,64160384,64160384,18,222,experimental,TETRAP00000010,791,5314,222,18
74576,5314_0008_0196,BLOCK1,rank_selected,rank:05;score:387;uniq:15;count:37;freq:00;rul...,TETRA00S0000001,ATAGATTAAAGCTCTGATGACTCTCCTTCGCCTTAAATATAATCAT...,0,64160388,64160388,196,8,experimental,TETRAP00000015,1230,5314,8,196
132820,5314_0527_0347,BLOCK1,rank_selected,rank:07;score:359;uniq:05;count:37;freq:00;rul...,TETRA00S0000001,TTTTTCAATGAAGTGAAGATGCTTAGAACATTGAACCACAAGCTAA...,0,64160379,64160379,347,527,experimental,TETRAP00000005,517,5314,527,347
137513,5314_0697_0359,BLOCK1,rank_selected,rank:02;score:468;uniq:14;count:37;freq:00;rul...,TETRA00S0000001,CTTAATGATAAGATAACCTACTAAATGATAATTGATGACGAGACGA...,0,64160376,64160376,359,697,experimental,TETRAP00000002,211,5314,697,359
145827,5314_0429_0381,BLOCK1,rank_selected,rank:01;score:482;uniq:15;count:37;freq:00;rul...,TETRA00S0000001,CTTTTATAGGCTGGTTGCAGGAAAGACATATCAATAATCCTTGACA...,0,64160386,64160386,381,429,experimental,TETRAP00000012,954,5314,429,381
151844,5314_0175_0397,BLOCK1,rank_selected,rank:03;score:434;uniq:13;count:37;freq:00;rul...,TETRA00S0000001,TTAACATTGTAAGAAGTCTGCGTGATAATAGAGCAAATATGCTAAG...,0,64160381,64160381,397,175,experimental,TETRAP00000007,667,5314,175,397
178349,5314_0194_0466,BLOCK1,rank_selected,rank:12;score:327;uniq:12;count:37;freq:00;rul...,TETRA00S0000001,ACAAAGAGTTTAACATTGTAAGAAGTCTGCGTGATAATAGAGCAAA...,0,64160380,64160380,466,194,experimental,TETRAP00000006,658,5314,194,466
178366,5314_0228_0466,BLOCK1,rank_selected,rank:11;score:333;uniq:12;count:37;freq:00;rul...,TETRA00S0000001,GTATAACAACTCATACAATCTTCTAATTACTTGCACTAGAGATTTT...,0,64160382,64160382,466,228,experimental,TETRAP00000008,780,5314,228,466
191342,5314_0130_0500,BLOCK1,rank_selected,rank:06;score:382;uniq:20;count:34;freq:00;rul...,TETRA00S0000001,GAGTTTACACGGTTCAGGACACCTCAGGCTGCATGAAAATAAAATA...,0,64160375,64160375,500,130,experimental,TETRAP00000001,126,5314,130,500
209910,5314_0011_0549,BLOCK1,rank_selected,rank:08;score:355;uniq:05;count:37;freq:00;rul...,TETRA00S0000001,AAGTAAATGCTTGATAATAACCCTGATAAGAGACCTTCTGCAGATG...,0,64160387,64160387,549,11,experimental,TETRAP00000014,1084,5314,11,549


In [7]:
len(probe_df['SEQ_ID'].values) == len(probe_df['SEQ_ID'].unique())

False

Can find negative controls if need be

In [8]:
probe_df['DESIGN_NOTE'].unique()

array(['rank_selected', nan, '-', 'upper left fiducial', '0 cycles',
       '04 cycles', '08 cycles', '12 cycles', '16 cycles', '20 cycles',
       '24 cycles', '28 cycles', '32 cycles', '36 cycles', '40 cycles',
       '44 cycles', '48 cycles', '52 cycles', '56 cycles', '60 cycles',
       '64 cycles', '68 cycles', '72 cycles', '76 cycles', '80 cycles',
       '84 cycles', '88 cycles', '92 cycles', '96 cycles', '100 cycles',
       '104 cycles', 'synthesis control', 'upper center fiducial',
       'upper right fiducial', 'REPLICATE1', 'EMPTY',
       'upper right chip_id', '+', 'uniformity control',
       'vertical design_id', 'left center fiducial',
       'center cross fiducial', 'right center fiducial',
       'lower center fiducial', 'REPLICATE2', 'lower left fiducial',
       'lower right fiducial', 'lower left chip_id',
       'horizontal design_id'], dtype=object)

In [9]:
probe_df.loc[probe_df['DESIGN_NOTE'] == '-']

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
158,5314_0491_0001,NGS_CONTROLS,-,T/A,ARRAY_QC_A,ACGTCCCCCTCTGGaTGTTCATACGGTATG,10004,1100011,62205230,1,491,control:reseq_qc:synthesis,XENOSYNTH0093,12,5314,491,1
523,5314_0492_0002,NGS_CONTROLS,-,T/C,ARRAY_QC_A,ACGTCCCCCTCTGGcTGTTCATACGGTATG,10005,1100011,62205231,2,492,control:reseq_qc:synthesis,XENOSYNTH0094,12,5314,492,2
818,5314_0491_0003,NGS_CONTROLS,-,T/G,ARRAY_QC_A,ACGTCCCCCTCTGGgTGTTCATACGGTATG,10006,1100011,62205232,3,491,control:reseq_qc:synthesis,XENOSYNTH0095,12,5314,491,3
1190,5314_0492_0004,NGS_CONTROLS,-,T/T,ARRAY_QC_A,ACGTCCCCCTCTGGtTGTTCATACGGTATG,10007,1100011,62205233,4,492,control:reseq_qc:synthesis,XENOSYNTH0096,12,5314,492,4
6079,5314_0341_0017,NGS_CONTROLS,-,G/A,ARRAY_QC_C,GCGCGGCGTTGGACaTCTGACTAATACATCAA,10004,1100089,62205854,17,341,control:reseq_qc:synthesis,XENOSYNTH0717,90,5314,341,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389799,5314_0434_1016,NGS_CONTROLS,-,G/T,ARRAY_QC_C,ATGTGCGCGGCGTTtGACGTCTGACTAAT,10007,1100093,62205889,1016,434,control:reseq_qc:synthesis,XENOSYNTH0752,94,5314,434,1016
390294,5314_0661_1017,NGS_CONTROLS,-,A/A,LABEL_QC_B,ATTAGGCCCTTCGCaCGCAGCGGCGTGCG,10004,1100155,62206382,1017,661,control:reseq_qc:label,XENOLABEL0445,56,5314,661,1017
390675,5314_0662_1018,NGS_CONTROLS,-,A/C,LABEL_QC_B,ATTAGGCCCTTCGCcCGCAGCGGCGTGCG,10005,1100155,62206383,1018,662,control:reseq_qc:label,XENOLABEL0446,56,5314,662,1018
391056,5314_0661_1019,NGS_CONTROLS,-,A/G,LABEL_QC_B,ATTAGGCCCTTCGCgCGCAGCGGCGTGCG,10006,1100155,62206384,1019,661,control:reseq_qc:label,XENOLABEL0447,56,5314,661,1019


Example of many probes to a single id

In [10]:
probe_df.loc[probe_df['SEQ_ID']=='TETRA00S0021925']

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
12641,5314_0474_0034,BLOCK1,rank_selected,rank:14;score:346;uniq:13;count:37;freq:00;rul...,TETRA00S0021925,TAATAATTTAATAGCGGATAGTCGATAATGTCAAACAGCATTTAAA...,0,64456191,64456191,34,474,experimental,TETRAP00318579,850,5314,474,34
67117,5314_0450_0176,BLOCK1,rank_selected,rank:08;score:378;uniq:05;count:37;freq:00;rul...,TETRA00S0021925,TCAAATATGTACCCCATTACATATCAAACAATGATGAGTTAAAACC...,0,64456194,64456194,176,450,experimental,TETRAP00318582,1777,5314,450,176
82746,5314_0219_0217,BLOCK1,rank_selected,rank:12;score:353;uniq:00;count:37;freq:00;rul...,TETRA00S0021925,CTGTCAGCTCAATCTTCTTACTTTCTGATGGTCAGGACAATAATTC...,0,64456200,64456200,217,219,experimental,TETRAP00318588,5102,5314,219,217
128974,5314_0515_0337,BLOCK1,rank_selected,rank:05;score:423;uniq:06;count:37;freq:00;rul...,TETRA00S0021925,AATTTAGCAATATGAAATCAATAACTAGGCCAAGATATATGCACAA...,0,64456196,64456196,337,515,experimental,TETRAP00318584,2706,5314,515,337
149412,5314_0688_0390,BLOCK1,rank_selected,rank:02;score:487;uniq:09;count:37;freq:00;rul...,TETRA00S0021925,TCGTTTTAGAAATATAGATAGGGTGGAGTTACCAATAGATTGATTA...,0,64456190,64456190,390,688,experimental,TETRAP00318578,742,5314,688,390
203654,5314_0520_0532,BLOCK1,rank_selected,rank:13;score:349;uniq:00;count:37;freq:00;rul...,TETRA00S0021925,TTGATTAAAATGGCAATGATGCTTTCAATCAAAGTGATGATTTAGC...,0,64456192,64456192,532,520,experimental,TETRAP00318580,1133,5314,520,532
208163,5314_0358_0544,BLOCK1,rank_selected,rank:04;score:433;uniq:09;count:37;freq:00;rul...,TETRA00S0021925,GCAATTCATAACATAATTAGATTTAATTCTATCCATAGCCAACATC...,0,64456193,64456193,544,358,experimental,TETRAP00318581,1487,5314,358,544
208452,5314_0167_0545,BLOCK1,rank_selected,rank:03;score:466;uniq:10;count:37;freq:00;rul...,TETRA00S0021925,GAATAGCATAATAATGGATGGCATTATCATAGAGAGTATTGACAAG...,0,64456198,64456198,545,167,experimental,TETRAP00318586,3792,5314,167,545
221084,5314_0088_0578,BLOCK1,rank_selected,rank:11;score:353;uniq:08;count:37;freq:00;rul...,TETRA00S0021925,TGAATATATAGATGGTTAAAACAATATAATTTACGACTCGAATGAG...,0,64456197,64456197,578,88,experimental,TETRAP00318585,3168,5314,88,578


The PROBE_IDs are also not unique

In [11]:
len(probe_df['PROBE_ID'].values) == len(probe_df['PROBE_ID'].unique())

False

In [12]:
probe_df[probe_df.duplicated(subset=['PROBE_ID'])]

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
284,5314_0005_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000197,62205062,2,5,fiducial,CPK6,0,5314,5,2
285,5314_0007_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000198,62205063,2,7,fiducial,CPK6,0,5314,7,2
286,5314_0009_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000199,62205064,2,9,fiducial,CPK6,0,5314,9,2
287,5314_0011_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000200,62205065,2,11,fiducial,CPK6,0,5314,11,2
288,5314_0013_0002,NGS_CONTROLS,upper left fiducial,bright,FIDUCIAL_UPPER_LEFT,TGAGTTGTTTGATAGGATTATTCATAGAGGTCATTACAGCGAGAGG...,0,2000201,62205066,2,13,fiducial,CPK6,0,5314,13,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392507,5314_0118_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060094,63771045,3,31,encoded number,empty,15,5314,118,1024
392508,5314_0120_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060100,63771051,3,33,encoded number,empty,16,5314,120,1024
392509,5314_0122_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060106,63771057,3,35,encoded number,empty,17,5314,122,1024
392510,5314_0124_1024,H_CODE,horizontal design_id,dark,DESIGN_ID1,N,0,1060112,63771063,3,37,encoded number,empty,18,5314,124,1024


In [13]:
probe_df['PROBE_CLASS'].unique()

array(['experimental', nan, 'control:reseq_qc:synthesis', 'fiducial',
       'linker', 'synthesis', 'control:sample_tracking:A',
       'control:empty', 'encoded number', 'control:reseq_qc:label',
       'uniformity', 'control', 'control:sample_tracking:B'], dtype=object)

In [14]:
probe_df[probe_df.duplicated(subset=['PROBE_ID'])]['PROBE_CLASS'].unique()

array(['fiducial', 'linker', 'synthesis', 'control:empty',
       'encoded number', 'uniformity', 'control',
       'control:sample_tracking:B'], dtype=object)

These are all controls of various sorts, etc. and I can exclude them.

In [15]:
experimental_probe_df = probe_df.loc[probe_df['PROBE_CLASS']=='experimental']

In [16]:
experimental_probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


In [17]:
len(probe_df)

392778

In [18]:
len(experimental_probe_df)

384999

In [19]:
experimental_probe_df['MISMATCH'].unique()

array([0])

Extract the probe ids and sequences to build a fasta file

In [20]:
probe_ids = experimental_probe_df['PROBE_ID'].values
probe_seqs = experimental_probe_df['PROBE_SEQUENCE'].values

Build the fasta file

In [21]:
with open('./2007-02-28_microarray_experimental_probes.fna', 'w') as f:
    for i, p in zip(probe_ids, probe_seqs):
        f.write(f">{i}\n")
        f.write(f"{p}\n\n")

## Step 2: use hisat 2 to align probes to newest genome

Note: running this on Mac OS.

Bash command to index the 2021 _T. thermophila_ genome CDS fasta

In [22]:
index_genome_command = "hisat2-build -f ../../new_raw_data/Tthermophila_MAC_CDS_2021.fasta ttherm_2021"

In [23]:
index_genome_command.split()

['/Users/michaelbertagna/Documents/MyDocuments/git/hisat2-2.2.1',
 'hisat2-build',
 '-f',
 '../../new_raw_data/Tthermophila_MAC_CDS_2021.fasta',
 'ttherm_2021']

In [24]:
import os
print(os.environ['PATH'])

/Users/michaelbertagna/anaconda3/bin:/Users/michaelbertagna/anaconda3/condabin:/usr/bin:/bin:/usr/sbin:/sbin


In [25]:
r = subprocess.run(args=index_genome_command.split(), capture_output=True)

PermissionError: [Errno 13] Permission denied: '/Users/michaelbertagna/Documents/MyDocuments/git/hisat2-2.2.1'

In [None]:
print(r.stdout.decode('utf-8'))

In [None]:
print(r.stderr.decode('utf-8'))

Bash command to align the probe sequences to the CDS regions

In [None]:
align_to_genome_command = """
hisat2 -f -x ttherm_2021 --no-hd
-U ./2007-02-28_microarray_experimental_probes.fna 
-S microarray_probe_alignment.sam"""

In [None]:
align_to_genome_command.split()

In [None]:
r2 = subprocess.run(args=align_to_genome_command.split(), capture_output=True)

In [None]:
print(r2.stdout.decode('utf-8'))

In [None]:
print(r2.stderr.decode('utf-8'))

In [26]:
with open('./microarray_probe_alignment.sam', 'r') as f:
    lines = f.readlines()
    single_alignments = [line for line in lines if l.split()[-1] == 'NH:i:1']

Sanity check that there are fewer single alignments than total alignments

In [27]:
len(lines)

405651

In [28]:
lines[0]

'TETRAP00318583\t0\tTTHERM_00709600\t807\t60\t60M\t*\t0\t0\tAGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGCAAATATTTTAAGCC\tIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII\tAS:i:0\tXN:i:0\tXM:i:0\tXO:i:0\tXG:i:0\tNM:i:0\tMD:Z:60\tYT:Z:UU\tNH:i:1\n'

In [29]:
test = []
for line in lines:
    if l.split()[2] == 'TTHERM_000486279':
        test.append(l.split()[-1])

In [30]:
test

['NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1',
 'NH:i:1']

In [31]:
re.search(r'NH:i:1$', test[0])

<re.Match object; span=(0, 6), match='NH:i:1'>

In [32]:
len(single_alignments)

297488

In [33]:
test = []
for line in single_alignments:
    if line.split()[2] == 'TTHERM_000486279':
        test.append((line.split()[0], line.split()[2]))

In [34]:
print(test)

[('TETRAP00177701', 'TTHERM_000486279'), ('TETRAP00177699', 'TTHERM_000486279'), ('TETRAP00177705', 'TTHERM_000486279'), ('TETRAP00177703', 'TTHERM_000486279'), ('TETRAP00177702', 'TTHERM_000486279'), ('TETRAP00177704', 'TTHERM_000486279'), ('TETRAP00177697', 'TTHERM_000486279'), ('TETRAP00177700', 'TTHERM_000486279')]


In [35]:
single_alignments[0].split()

['TETRAP00318583',
 '0',
 'TTHERM_00709600',
 '807',
 '60',
 '60M',
 '*',
 '0',
 '0',
 'AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGCAAATATTTTAAGCC',
 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
 'AS:i:0',
 'XN:i:0',
 'XM:i:0',
 'XO:i:0',
 'XG:i:0',
 'NM:i:0',
 'MD:Z:60',
 'YT:Z:UU',
 'NH:i:1']

Take a look at the formatting

In [36]:
single_alignments[0].split()

['TETRAP00318583',
 '0',
 'TTHERM_00709600',
 '807',
 '60',
 '60M',
 '*',
 '0',
 '0',
 'AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGCAAATATTTTAAGCC',
 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII',
 'AS:i:0',
 'XN:i:0',
 'XM:i:0',
 'XO:i:0',
 'XG:i:0',
 'NM:i:0',
 'MD:Z:60',
 'YT:Z:UU',
 'NH:i:1']

Build probe_id to ttherm_id dictionary

In [37]:
single_aligned_probes = [line.split()[0] for line in single_alignments]

In [38]:
align_dict = {}
for alignment in single_alignments:
    split_alignment = alignment.split()
    align_dict[split_alignment[0]] = split_alignment[2]

In [39]:
align_dict['TETRAP00177701']

'TTHERM_000486279'

Build seq_id to probe_id dictionary

In [40]:
seq_probe_dict = {s: p for p, s in zip(experimental_probe_df['PROBE_ID'].values, experimental_probe_df['SEQ_ID'].values)}

In [41]:
probe_seq_dict = {p: s for p, s in zip(experimental_probe_df['PROBE_ID'].values, experimental_probe_df['SEQ_ID'].values)}

In [42]:
probe_seq_dict['TETRAP00177701']

'TETRA00S0012296'

In [43]:
seq_probe_dict['TETRA00S0021925']

'TETRAP00318587'

Build a seq to gene dict

In [44]:
for probe, seq in probe_seq_dict.items():
    if probe in align_dict.keys():
        if probe in [t[0] for t in test]:
            print(align_dict[probe])

TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279
TTHERM_000486279


In [45]:
seq_gene_dict = {}
gene_seq_dict = {}
for probe, seq in probe_seq_dict.items():
#     print(probe, seq)
    if probe in align_dict.keys():
        genes = seq_gene_dict.get(seq, [])
        gene = align_dict[probe]
        genes.append(gene)
        seq_gene_dict[seq] = genes
        
#         seq_gene_dict[seq] = align_dict[probe]
        
        seqs = gene_seq_dict.get(align_dict[probe], [])
        seqs.append(seq)
        gene_seq_dict[align_dict[probe]] = seqs

In [46]:
list(seq_gene_dict.items())[:5]

[('TETRA00S0021925',
  ['TTHERM_00709600',
   'TTHERM_00709600',
   'TTHERM_000709619',
   'TTHERM_00709600',
   'TTHERM_00709600',
   'TTHERM_00709600',
   'TTHERM_000709599',
   'TTHERM_000709599',
   'TTHERM_000709619',
   'TTHERM_000709619']),
 ('TETRA00S0012676',
  ['TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480',
   'TTHERM_00529480']),
 ('TETRA00S0000062',
  ['TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620',
   'TTHERM_00002620']),
 ('TETRA00S0006635',
  ['TTHERM_01013320',
   'TTHERM_01013320',
   'TTHERM_01013320',
   'TTHERM_01013320',
   

In [47]:
list(gene_seq_dict.items())[:5]

[('TTHERM_00709600',
  ['TETRA00S0021925',
   'TETRA00S0021925',
   'TETRA00S0021925',
   'TETRA00S0021925',
   'TETRA00S0021925']),
 ('TTHERM_00529480',
  ['TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676',
   'TETRA00S0012676']),
 ('TTHERM_00002620',
  ['TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062',
   'TETRA00S0000062']),
 ('TTHERM_01013320',
  ['TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETRA00S0006635',
   'TETR

Check that all probes are accounted for

In [48]:
len(seq_gene_dict)

23997

In [49]:
probe_count = 0
for v in gene_seq_dict.values():
    probe_count += len(v)
probe_count

297488

In [50]:
gene_seq_dict['TTHERM_000486279']

['TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296']

In [51]:
gene_seq_dict['TTHERM_00486270']

['TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296',
 'TETRA00S0012296']

In [52]:
seq_gene_dict['TETRA00S0012296']

['TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_00486270',
 'TTHERM_00486270',
 'TTHERM_000486279',
 'TTHERM_00486270',
 'TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_00486270',
 'TTHERM_00486270',
 'TTHERM_000486279',
 'TTHERM_000486279',
 'TTHERM_00486270']

In [53]:
gene_seq_dict['TTHERM_00321680']

['TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026',
 'TETRA00S0011026']

In [54]:
seq_gene_dict['TETRA00S0011026']

['TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680',
 'TTHERM_00321680']

There are cases where the measurement, as identified by the sequence (which is what we have in the GEO dataset), actually hits multiple genes in the current genome. We can't use these at all because we don't know which gene they're supposed to be hitting. Hence, we can use the probe IDs directly (align_dict) with the newly QC'ed microarray data

### Step 3: Download raw data

#### To truly correct against the genome, will need to regenerate .xys files using corrected probe assignments and the raw data, perform robust multiarray averaging (RMA) normalization, and then do filtering, etc.

Each microarray chip is based on the same design, so the .ndf file can give the XY coordinates for each unique probe.

In [55]:
probe_XY_df = experimental_probe_df[['PROBE_ID', 'PROBE_CLASS', 'X', 'Y']]
probe_XY_df.head()

Unnamed: 0,PROBE_ID,PROBE_CLASS,X,Y
0,TETRAP00318583,experimental,1,1
1,TETRAP00183246,experimental,23,1
2,TETRAP00036232,experimental,25,1
3,TETRAP00000895,experimental,27,1
4,TETRAP00096103,experimental,29,1


In [56]:
len(probe_XY_df)

384999

Take only the probes that align uniquely to the 2021 genome

In [57]:
probe_XY_df['aligned'] = [p in align_dict.keys() for p in probe_XY_df['PROBE_ID']]
probe_XY_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  probe_XY_df['aligned'] = [p in align_dict.keys() for p in probe_XY_df['PROBE_ID']]


Unnamed: 0,PROBE_ID,PROBE_CLASS,X,Y,aligned
0,TETRAP00318583,experimental,1,1,True
1,TETRAP00183246,experimental,23,1,True
2,TETRAP00036232,experimental,25,1,False
3,TETRAP00000895,experimental,27,1,True
4,TETRAP00096103,experimental,29,1,True


In [58]:
corrected_probe_XY_df = probe_XY_df.loc[probe_XY_df['aligned'] == True]
len(corrected_probe_XY_df)

297488

Note: this downloading is not done by the notebook in the top level directory!

Download the raw data in the formats that are available to me from GEO. Ron Pearlman submitted the .pair files. Wei Miao submitted some pre-processed tables that are very annoying for most of the chips, and then .pair files for the extra S0 and S24 chips he added in 2011. Yifan Liu submitted .pair files. Ron's data: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26650; Wei's data 1: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE11300: Wei's data 2: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26384; Yifan's data: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26385

Based on the size of the microarray, this is a standard 1:2 design, which means the the COUNT field in the xys file should be set to 1 for experimental probes and NA for control features. (http://mtweb.cs.ucl.ac.uk/mus/mus/binnaz/CNV/NimbleGene/DATA/OID8421-2/Documentation/NimbleGen_data_formats.pdf)


In [59]:
gse_convert_dict = {
    'L1-L': 'GSM283687',
    'L1-M': 'GSM283690',
    'L1-H': 'GSM283691',
    'L2-L': 'GSM284355',
    'L2-M': 'GSM284357',
    'L2-H': 'GSM284360',
    'L3-L': 'GSM284362',
    'L3-M': 'GSM284363',
    'L3-H': 'GSM284364',
    'S1-0': 'GSM285363',
    'S1-3': 'GSM285542',
    'S1-6': 'GSM285543',
    'S1-9': 'GSM285544',
    'S1-12': 'GSM285545',
    'S1-15': 'GSM285546',
    'S1-24': 'GSM285547',
    'S2-0': 'GSM285554',
    'S2-3': 'GSM285555',
    'S2-6': 'GSM285556',
    'S2-9': 'GSM285557',
    'S2-12': 'GSM285558',
    'S2-15': 'GSM285559',
    'S2-24': 'GSM285560',
    'S3-0': 'GSM285561',
    'S3-3': 'GSM285562',
    'S3-6': 'GSM285563',
    'S3-9': 'GSM285564',
    'S3-12': 'GSM285565',
    'S3-15': 'GSM285566',
    'S3-24': 'GSM285567',
    'C1-0': 'GSM285570',
    'C1-2': 'GSM285572',
    'C1-4': 'GSM285574',
    'C1-6': 'GSM285575',
    'C1-8': 'GSM285576',
    'C1-10': 'GSM285578',
    'C1-12': 'GSM285579',
    'C1-14': 'GSM285580',
    'C1-16': 'GSM285582',
    'C1-18': 'GSM285583',
    'C2-0': 'GSM285586',
    'C2-2': 'GSM285587',
    'C2-4': 'GSM285588',
    'C2-6': 'GSM285589',
    'C2-8': 'GSM285590',
    'C2-10': 'GSM285591',
    'C2-12': 'GSM285592',
    'C2-14': 'GSM285593',
    'C2-16': 'GSM285595',
    'C2-18': 'GSM285596',
}

In [60]:
raw_microarray_data_files = glob.glob('../../new_raw_data/expression/*.gz')
raw_microarray_data_files

['../../new_raw_data/expression/GSM656231_4257502_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM647245.pair.gz',
 '../../new_raw_data/expression/GSE11300_Raw_C1_Samples.txt.gz',
 '../../new_raw_data/expression/GSE11300_Raw_L1_Samples.txt.gz',
 '../../new_raw_data/expression/GSM656232_4257702_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM647244.pair.gz',
 '../../new_raw_data/expression/GSM656237_4258302_532_pair.txt.gz',
 '../../new_raw_data/expression/GSE11300_Raw_L3_Samples.txt.gz',
 '../../new_raw_data/expression/GSM656239_4261302_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656234_4257802_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656240_4261102_532_pair.txt.gz',
 '../../new_raw_data/expression/GSE11300_Raw_C2_Samples.txt.gz',
 '../../new_raw_data/expression/GSM647653_13401502_532_pair.txt.gz',
 '../../new_raw_data/expression/GSE11300_Raw_L2_Samples.txt.gz',
 '../../new_raw_data/expression/GSM647654_13401702_532_pair.txt.gz',
 '../../new_raw_data/expr

Separate out the partially pre-processed files

In [61]:
for f in raw_microarray_data_files:
    if 'Samples' in f:
        print(f)

../../new_raw_data/expression/GSE11300_Raw_C1_Samples.txt.gz
../../new_raw_data/expression/GSE11300_Raw_L1_Samples.txt.gz
../../new_raw_data/expression/GSE11300_Raw_L3_Samples.txt.gz
../../new_raw_data/expression/GSE11300_Raw_C2_Samples.txt.gz
../../new_raw_data/expression/GSE11300_Raw_L2_Samples.txt.gz
../../new_raw_data/expression/GSE11300_Raw_S2_Samples.txt.gz
../../new_raw_data/expression/GSE11300_Raw_S3_Samples.txt.gz
../../new_raw_data/expression/GSE11300_Raw_S1_Samples.txt.gz


In [62]:
def pair_to_xys(probe_XY_df, path_to_pair, xys_filename):
    
    """
    Function to convert .pair format into .xys format
    """
    
    pair = pd.read_csv(path_to_pair, comment='#', sep='\t')
    probe_signal_df = pair[['PROBE_ID', 'PM']]
    probe_signal_df = probe_signal_df.rename(columns={'PM': 'SIGNAL'})
    
    m = probe_XY_df.merge(probe_signal_df, on='PROBE_ID')
    
    counts = [1 if p == 'experimental' else 'NA' for p in m['PROBE_CLASS'].values]
    
    m['COUNT'] = counts
    
    m = m.dropna()
    
    xys = m[['X', 'Y', 'SIGNAL', 'COUNT']]
    
    xys.to_csv(xys_filename, sep='\t', index=False)
    
    with open(xys_filename, 'r+') as f:
        
        content = f.read()
        f.seek(0, 0)
        f.write('# designname=2021_tetrahymena_expr_corrected date=2021_11_07' + '\n' + content)

In [63]:
def samples_to_xys(probe_XY_df, path_to_samples, xys_file_prefix):
    
    """
    Function to convert partially pre-processed files into .xys format.
    """
    
    sample_df = pd.read_csv(path_to_samples, sep='\t')
    
    samples = sample_df.columns.to_list()[3:]
    
    xys_dfs = []
    for s in samples:
        
        raw = sample_df[['probe_ID', s]]
        raw = raw.rename(columns={'probe_ID': 'PROBE_ID', s: 'SIGNAL'})
        m = probe_XY_df.merge(raw, on='PROBE_ID')
        
        counts = [1 if p == 'experimental' else 'NA' for p in m['PROBE_CLASS'].values]
        m['COUNT'] = counts
        
        m = m.dropna()
        
        m_xys = m[['X', 'Y', 'SIGNAL', 'COUNT']]
        
        xys_dfs.append(m_xys)
        
    for df, s in zip(xys_dfs, samples):
        
        gsm = gse_convert_dict[s]
        phase = inverse_all_geo[gsm]
        identifier = f'{phase}_{gsm}'
        
        out_filename = f'../microarray_QC/{identifier}.xys'
        print(out_filename)
        df.to_csv(out_filename, sep='\t', index=False)
        
        with open(out_filename, 'r+') as f:
            content = f.read()
            f.seek(0, 0)
            f.write('# designname=2021_tetrahymena_expr_corrected date=2021_11_07' + '\n' + content)

In [64]:
with open('../../new_raw_data/microarray_accessions_all.json', 'r') as f:
    # Including single REP measurement for C-15m (GSM656231) even though there are no replicates for it
    # in order to replicate the 2011 analysis
    # Format: keys are the physiological phase; values are the geo accessions for each microarray
    all_geo = json.load(f)

In [65]:
inverse_all_geo = {}

for k, v in all_geo.items():
    for code in v:
        inverse_all_geo[code] = k
        
inverse_all_geo

{'GSM283687': 'Ll',
 'GSM284355': 'Ll',
 'GSM284362': 'Ll',
 'GSM283690': 'Lm',
 'GSM284357': 'Lm',
 'GSM284363': 'Lm',
 'GSM283691': 'Lh',
 'GSM284360': 'Lh',
 'GSM284364': 'Lh',
 'GSM285363': 'S0',
 'GSM285554': 'S0',
 'GSM285561': 'S0',
 'GSM647244': 'S0',
 'GSM647651': 'S0',
 'GSM647652': 'S0',
 'GSM285542': 'S3',
 'GSM285555': 'S3',
 'GSM285562': 'S3',
 'GSM285543': 'S6',
 'GSM285556': 'S6',
 'GSM285563': 'S6',
 'GSM285544': 'S9',
 'GSM285557': 'S9',
 'GSM285564': 'S9',
 'GSM647653': 'S9',
 'GSM647654': 'S9',
 'GSM285545': 'S12',
 'GSM285558': 'S12',
 'GSM285565': 'S12',
 'GSM285546': 'S15',
 'GSM285559': 'S15',
 'GSM285566': 'S15',
 'GSM285547': 'S24',
 'GSM285560': 'S24',
 'GSM285567': 'S24',
 'GSM647245': 'S24',
 'GSM285570': 'C0',
 'GSM285586': 'C0',
 'GSM656230': 'C0',
 'GSM656231': 'C15m',
 'GSM285572': 'C2',
 'GSM285587': 'C2',
 'GSM656233': 'C2',
 'GSM285574': 'C4',
 'GSM285588': 'C4',
 'GSM656234': 'C4',
 'GSM285575': 'C6',
 'GSM285589': 'C6',
 'GSM656232': 'C6',
 'GSM285

In [66]:
for f in raw_microarray_data_files:
    parts = f.split('/')
    code = parts[-1].split('_')[0].split('.')[0]
    
#     if 'Samples' in f:
#         code = gse_convert_dict[code]
    
#     phase = inverse_all_geo[code]
    
#     identifier = f'{phase}_{code}'
#     print(identifier)
    
#     print(code)

    if 'Samples' in f:
        samples_to_xys(corrected_probe_XY_df, f, code)
        
    else:
        phase = inverse_all_geo[code]
        identifier = f'{phase}_{code}'
        
        out_xys_path = f'../microarray_QC/{identifier}.xys'
        print(out_xys_path)
        pair_to_xys(corrected_probe_XY_df, f, out_xys_path)

../microarray_QC/C15m_GSM656231.xys
../microarray_QC/S24_GSM647245.xys
../microarray_QC/C0_GSM285570.xys
../microarray_QC/C2_GSM285572.xys
../microarray_QC/C4_GSM285574.xys
../microarray_QC/C6_GSM285575.xys
../microarray_QC/C8_GSM285576.xys
../microarray_QC/C10_GSM285578.xys
../microarray_QC/C12_GSM285579.xys
../microarray_QC/C14_GSM285580.xys
../microarray_QC/C16_GSM285582.xys
../microarray_QC/C18_GSM285583.xys
../microarray_QC/Ll_GSM283687.xys
../microarray_QC/Lm_GSM283690.xys
../microarray_QC/Lh_GSM283691.xys
../microarray_QC/C6_GSM656232.xys
../microarray_QC/S0_GSM647244.xys
../microarray_QC/C12_GSM656237.xys
../microarray_QC/Ll_GSM284362.xys
../microarray_QC/Lm_GSM284363.xys
../microarray_QC/Lh_GSM284364.xys
../microarray_QC/C16_GSM656239.xys
../microarray_QC/C4_GSM656234.xys
../microarray_QC/C18_GSM656240.xys
../microarray_QC/C0_GSM285586.xys
../microarray_QC/C2_GSM285587.xys
../microarray_QC/C4_GSM285588.xys
../microarray_QC/C6_GSM285589.xys
../microarray_QC/C8_GSM285590.xys
../

# CORRECTED TTHERM_ID MAPPING

CREATE A DF WITH SINGLELY ALIGNED PROBES AND THEIR CORRESPONDING SEQ_ID

In [67]:
valid_probe_ids = corrected_probe_XY_df['PROBE_ID'].values
valid_probe_ids

array(['TETRAP00318583', 'TETRAP00183246', 'TETRAP00000895', ...,
       'TETRAP00050545', 'TETRAP00273866', 'TETRAP00263235'], dtype=object)

In [68]:
p_s_df = experimental_probe_df.loc[:,['PROBE_ID', 'SEQ_ID']]
p_s_df.head()

Unnamed: 0,PROBE_ID,SEQ_ID
0,TETRAP00318583,TETRA00S0021925
1,TETRAP00183246,TETRA00S0012676
2,TETRAP00036232,TETRA00S0002513
3,TETRAP00000895,TETRA00S0000062
4,TETRAP00096103,TETRA00S0006635


In [69]:
p_s_df['aligned'] = p_s_df['PROBE_ID'].isin(align_dict.keys())
p_s_df.head()

Unnamed: 0,PROBE_ID,SEQ_ID,aligned
0,TETRAP00318583,TETRA00S0021925,True
1,TETRAP00183246,TETRA00S0012676,True
2,TETRAP00036232,TETRA00S0002513,False
3,TETRAP00000895,TETRA00S0000062,True
4,TETRAP00096103,TETRA00S0006635,True


In [70]:
corrected_p_s_df = p_s_df.loc[p_s_df['aligned'] == True]
corrected_p_s_df.head()

Unnamed: 0,PROBE_ID,SEQ_ID,aligned
0,TETRAP00318583,TETRA00S0021925,True
1,TETRAP00183246,TETRA00S0012676,True
3,TETRAP00000895,TETRA00S0000062,True
4,TETRAP00096103,TETRA00S0006635,True
5,TETRAP00268899,TETRA00S0018517,True


BUILD A SEQ_ID TO LIST OF PROBE_IDs DICT

In [71]:
epd_probe_id = corrected_p_s_df['PROBE_ID'].values
epd_seq_id = corrected_p_s_df['SEQ_ID'].values

In [72]:
len(epd_probe_id) == len(epd_seq_id)

True

In [73]:
seq_probe_list_dict = {}
for idx in range(len(epd_probe_id)):
    s_id = epd_seq_id[idx]
    p_id = epd_probe_id[idx]
    if s_id not in seq_probe_list_dict:
        seq_probe_list_dict[s_id] = []
    seq_probe_list_dict[s_id].append(p_id)

In [74]:
seq_probe_list_dict

{'TETRA00S0021925': ['TETRAP00318583',
  'TETRAP00318582',
  'TETRAP00318588',
  'TETRAP00318584',
  'TETRAP00318581',
  'TETRAP00318585',
  'TETRAP00318576',
  'TETRAP00318575',
  'TETRAP00318589',
  'TETRAP00318587'],
 'TETRA00S0012676': ['TETRAP00183246',
  'TETRAP00183248',
  'TETRAP00183244',
  'TETRAP00183240',
  'TETRAP00183250',
  'TETRAP00183242',
  'TETRAP00183249',
  'TETRAP00183239',
  'TETRAP00183251',
  'TETRAP00183252',
  'TETRAP00183243',
  'TETRAP00183241',
  'TETRAP00183238'],
 'TETRA00S0000062': ['TETRAP00000895',
  'TETRAP00000896',
  'TETRAP00000900',
  'TETRAP00000893',
  'TETRAP00000904',
  'TETRAP00000897',
  'TETRAP00000902',
  'TETRAP00000894',
  'TETRAP00000898',
  'TETRAP00000891',
  'TETRAP00000892',
  'TETRAP00000899',
  'TETRAP00000903',
  'TETRAP00000890'],
 'TETRA00S0006635': ['TETRAP00096103',
  'TETRAP00096098',
  'TETRAP00096100',
  'TETRAP00096091',
  'TETRAP00096099',
  'TETRAP00096092',
  'TETRAP00096097',
  'TETRAP00096102',
  'TETRAP00096090',
 

BUILD A TTHERM_ID TO LIST OF PROBE_IDs DICT

In [75]:
def getValueKeyList(target_dict):
    value_key_list_dict = {}

    for key, value in target_dict.items():
        if value not in value_key_list_dict:
            value_key_list_dict[value] = []
        value_key_list_dict[value].append(key)

    return value_key_list_dict

In [76]:
align_list_dict = getValueKeyList(align_dict)

In [77]:
align_list_dict

{'TTHERM_00709600': ['TETRAP00318583',
  'TETRAP00318582',
  'TETRAP00318584',
  'TETRAP00318581',
  'TETRAP00318585'],
 'TTHERM_00529480': ['TETRAP00183246',
  'TETRAP00183248',
  'TETRAP00183244',
  'TETRAP00183240',
  'TETRAP00183250',
  'TETRAP00183242',
  'TETRAP00183249',
  'TETRAP00183239',
  'TETRAP00183251',
  'TETRAP00183252',
  'TETRAP00183243',
  'TETRAP00183241',
  'TETRAP00183238'],
 'TTHERM_00002620': ['TETRAP00000895',
  'TETRAP00000896',
  'TETRAP00000900',
  'TETRAP00000893',
  'TETRAP00000904',
  'TETRAP00000897',
  'TETRAP00000902',
  'TETRAP00000894',
  'TETRAP00000898',
  'TETRAP00000891',
  'TETRAP00000892',
  'TETRAP00000899',
  'TETRAP00000903',
  'TETRAP00000890'],
 'TTHERM_01013320': ['TETRAP00096103',
  'TETRAP00096098',
  'TETRAP00096100',
  'TETRAP00096091',
  'TETRAP00096099',
  'TETRAP00096092',
  'TETRAP00096097',
  'TETRAP00096102',
  'TETRAP00096090',
  'TETRAP00096101',
  'TETRAP00096095',
  'TETRAP00096104',
  'TETRAP00096093',
  'TETRAP00096094'],


In [78]:
print(len(align_list_dict))
print(len(seq_probe_list_dict))

23614
23997


SERIALIZE THE TWO DICTIONARIES

In [79]:
import pickle

with open('align_list_dict.pkl', 'wb') as file:
    pickle.dump(align_list_dict, file)
    
with open('seq_probe_list_dict.pkl', 'wb') as file:
    pickle.dump(seq_probe_list_dict, file)

### Step 5: Go to R and do the microarray QC and RMA normalization

This is in ../microarray_QC/microarray_QC.Rmd

### Step 5.5: Run tetra_ttherm_mapping.py