In [1]:
import json

import glob

import pandas as pd

import numpy as np

Load the probe dataset

The .ndf file describes the design of the Roche Nimblegen microarray chips that were used in this study. This file is renamed from the GPL6759.ndf file corresponding to the raw data repository GEO accession GSE11300, which can be downloaded from https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE11300&format=file

In [2]:
# Read the gzipped, tab-separated chip design (.ndf) table.
ndf_path = '../../new_raw_data/GSE11300/GPL6759.ndf.gz'
probe_df = pd.read_csv(ndf_path, sep='\t', compression='gzip')

Take a look

In [3]:
# Preview the first rows of the design table.
probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TETRA00S0021925,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TETRA00S0012676,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0025_0001,BLOCK1,rank_selected,rank:08;score:115;uniq:01;count:31;freq:00;rul...,TETRA00S0002513,TATCATATAAAAATGTGTCTTGTTTATAAGACATTAATTGCTTAAT...,0,64193974,64193974,1,25,experimental,TETRAP00036232,262,5314,25,1
3,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TETRA00S0000062,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
4,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TETRA00S0006635,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1


Note: the SEQ_ID is what the chip designers thought the genes were (later converted to TTHERM_ format); the PROBE_ID is unique for each PROBE_SEQUENCE. Consequently, for all of the quality control and filtering, we want to work with the PROBE_IDs, which we can then map to our current understanding of the genome by their sequence.

The seq ids are not unique because multiple probes can target a single gene. Keep in mind that our understanding of what genes exist has dramatically changed since these microarrays were designed.

In [4]:
# Number of rows in the design table.
len(probe_df)

392778

This length is indicative of a 1:2 design for the chip. See http://mtweb.cs.ucl.ac.uk/mus/mus/binnaz/CNV/NimbleGene/DATA/OID8421-2/Documentation/NimbleGen_data_formats.pdf for more info

In [5]:
# Show every probe designed against one example SEQ_ID.
probe_df[probe_df['SEQ_ID'].eq('TETRA00S0000001')]

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
6400,5314_0222_0018,BLOCK1,rank_selected,rank:04;score:424;uniq:15;count:37;freq:00;rul...,TETRA00S0000001,CATACAATCTTCTAATTACTTGCACTAGAGATTTTCAACGTATTGG...,0,64160384,64160384,18,222,experimental,TETRAP00000010,791,5314,222,18
74576,5314_0008_0196,BLOCK1,rank_selected,rank:05;score:387;uniq:15;count:37;freq:00;rul...,TETRA00S0000001,ATAGATTAAAGCTCTGATGACTCTCCTTCGCCTTAAATATAATCAT...,0,64160388,64160388,196,8,experimental,TETRAP00000015,1230,5314,8,196
132820,5314_0527_0347,BLOCK1,rank_selected,rank:07;score:359;uniq:05;count:37;freq:00;rul...,TETRA00S0000001,TTTTTCAATGAAGTGAAGATGCTTAGAACATTGAACCACAAGCTAA...,0,64160379,64160379,347,527,experimental,TETRAP00000005,517,5314,527,347
137513,5314_0697_0359,BLOCK1,rank_selected,rank:02;score:468;uniq:14;count:37;freq:00;rul...,TETRA00S0000001,CTTAATGATAAGATAACCTACTAAATGATAATTGATGACGAGACGA...,0,64160376,64160376,359,697,experimental,TETRAP00000002,211,5314,697,359
145827,5314_0429_0381,BLOCK1,rank_selected,rank:01;score:482;uniq:15;count:37;freq:00;rul...,TETRA00S0000001,CTTTTATAGGCTGGTTGCAGGAAAGACATATCAATAATCCTTGACA...,0,64160386,64160386,381,429,experimental,TETRAP00000012,954,5314,429,381
151844,5314_0175_0397,BLOCK1,rank_selected,rank:03;score:434;uniq:13;count:37;freq:00;rul...,TETRA00S0000001,TTAACATTGTAAGAAGTCTGCGTGATAATAGAGCAAATATGCTAAG...,0,64160381,64160381,397,175,experimental,TETRAP00000007,667,5314,175,397
178349,5314_0194_0466,BLOCK1,rank_selected,rank:12;score:327;uniq:12;count:37;freq:00;rul...,TETRA00S0000001,ACAAAGAGTTTAACATTGTAAGAAGTCTGCGTGATAATAGAGCAAA...,0,64160380,64160380,466,194,experimental,TETRAP00000006,658,5314,194,466
178366,5314_0228_0466,BLOCK1,rank_selected,rank:11;score:333;uniq:12;count:37;freq:00;rul...,TETRA00S0000001,GTATAACAACTCATACAATCTTCTAATTACTTGCACTAGAGATTTT...,0,64160382,64160382,466,228,experimental,TETRAP00000008,780,5314,228,466
191342,5314_0130_0500,BLOCK1,rank_selected,rank:06;score:382;uniq:20;count:34;freq:00;rul...,TETRA00S0000001,GAGTTTACACGGTTCAGGACACCTCAGGCTGCATGAAAATAAAATA...,0,64160375,64160375,500,130,experimental,TETRAP00000001,126,5314,130,500
209910,5314_0011_0549,BLOCK1,rank_selected,rank:08;score:355;uniq:05;count:37;freq:00;rul...,TETRA00S0000001,AAGTAAATGCTTGATAATAACCCTGATAAGAGACCTTCTGCAGATG...,0,64160387,64160387,549,11,experimental,TETRAP00000014,1084,5314,11,549


In [6]:
# Load the filtered design table; pandas infers gzip compression from the .gz suffix.
filtered_ndf_path = '../../new_raw_data/GSE11300/filtered_GPL6759.ndf.gz'
filtered_probe_df = pd.read_csv(filtered_ndf_path, sep='\t')
filtered_probe_df.head()

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y
0,5314_0001_0001,BLOCK1,rank_selected,rank:09;score:376;uniq:03;count:37;freq:00;rul...,TTHERM_00709600,AGATATGTATGACTCTATACATAGAAATGTGAATGATATAGTTAGC...,0,64456195,64456195,1,1,experimental,TETRAP00318583,2262,5314,1,1
1,5314_0023_0001,BLOCK1,rank_selected,rank:09;score:402;uniq:26;count:37;freq:00;rul...,TTHERM_00529480,AAGGGCGCATTAGTGCCAAATGGGACACGTAGAAAGGTAGAGGGAT...,0,64330340,64330340,1,23,experimental,TETRAP00183246,3282,5314,23,1
2,5314_0027_0001,BLOCK1,rank_selected,rank:11;score:341;uniq:23;count:37;freq:00;rul...,TTHERM_00002620,AGATTTGATGTAATATAAGTTATCCGAAACATGGCAGTAGAATGCA...,0,64161203,64161203,1,27,experimental,TETRAP00000895,825,5314,27,1
3,5314_0029_0001,BLOCK1,rank_selected,rank:01;score:560;uniq:21;count:37;freq:00;rul...,TTHERM_01013320,TAAATAACTAAGAGCATAAGGCTGGTGCAAGTAACTCAATGCAACC...,0,64249458,64249458,1,29,experimental,TETRAP00096103,3290,5314,29,1
4,5314_0031_0001,BLOCK1,rank_selected,rank:04;score:459;uniq:13;count:37;freq:00;rul...,TTHERM_00455220,CTTTTTCAATCCTACTGCAATCTTGAAGCCCTGTATAATAGATAAT...,0,64409997,64409997,1,31,experimental,TETRAP00268899,754,5314,31,1


In [7]:
# Look for a feature at array coordinate (X=45, Y=1) in the filtered design.
at_x45 = filtered_probe_df['X'] == 45
at_y1 = filtered_probe_df['Y'] == 1
filtered_probe_df.loc[at_x45 & at_y1]

Unnamed: 0,PROBE_DESIGN_ID,CONTAINER,DESIGN_NOTE,SELECTION_CRITERIA,SEQ_ID,PROBE_SEQUENCE,MISMATCH,MATCH_INDEX,FEATURE_ID,ROW_NUM,COL_NUM,PROBE_CLASS,PROBE_ID,POSITION,DESIGN_ID,X,Y


In [8]:
# max_seq_id_num = max([int(num[8:]) for num in (probe_df.loc[(probe_df['PROBE_CLASS'] == 'experimental')]['SEQ_ID'])])
# max_seq_id_num

In [9]:
# Number of distinct SEQ_IDs among experimental probes.
# Rows and column are selected in a single .loc call (no chained indexing) and
# pandas counts the distinct values directly, instead of building a Python
# list of every value just to hand it to np.unique.
probe_df.loc[probe_df['PROBE_CLASS'] == 'experimental', 'SEQ_ID'].nunique(dropna=False)

28064

In [10]:
# Distinct-value count for each design column, restricted to experimental probes.
# The experimental subset is selected once up front rather than being
# re-filtered and copied into a Python list for every column.
experimental_probe_df = probe_df.loc[probe_df['PROBE_CLASS'] == 'experimental']
{col: experimental_probe_df[col].nunique(dropna=False) for col in filtered_probe_df.columns}

{'PROBE_DESIGN_ID': 384999,
 'CONTAINER': 1,
 'DESIGN_NOTE': 1,
 'SELECTION_CRITERIA': 264886,
 'SEQ_ID': 28064,
 'PROBE_SEQUENCE': 372676,
 'MISMATCH': 1,
 'MATCH_INDEX': 384999,
 'FEATURE_ID': 384999,
 'ROW_NUM': 1024,
 'COL_NUM': 768,
 'PROBE_CLASS': 1,
 'PROBE_ID': 384999,
 'POSITION': 8432,
 'DESIGN_ID': 1,
 'X': 768,
 'Y': 1024}

In [11]:
# Smallest POSITION value among experimental probes.
# Vectorised cast + min replaces the per-element int() loop and the chained
# .loc[...][...] lookup; the outer int() keeps the displayed result a plain int.
int(probe_df.loc[probe_df['PROBE_CLASS'] == 'experimental', 'POSITION'].astype(int).min())

1

In [12]:
# Last SEQ_ID (in sorted order) among experimental probes.
# np.unique returns its result sorted, so the final element is the largest;
# compute the unique array once and index from the end instead of running
# np.unique twice over the whole column.
experimental_seq_ids = probe_df.loc[probe_df['PROBE_CLASS'] == 'experimental', 'SEQ_ID']
np.unique(experimental_seq_ids.tolist())[-1]

'TETRA00S0028875'

In [13]:
# Largest numeric suffix among experimental PROBE_IDs, where the suffix is
# everything after the first six characters of the ID.
# Uses the vectorised .str accessor instead of a Python-level loop and avoids
# the chained .loc[...][...] lookup.
experimental_probe_ids = probe_df.loc[probe_df['PROBE_CLASS'] == 'experimental', 'PROBE_ID']
int(experimental_probe_ids.str[6:].astype(int).max())

415371

In [14]:
# max_seq_id_num = max([int(num[8:]) for num in (probe_df.loc[(probe_df['PROBE_CLASS'] == 'experimental')]['SEQ_ID'])])
# seq_id_num = max_seq_id_num

# for index, row in filtered_probe_df.iterrows():
#     if row['CONTAINER'] == 'RANDOM':
#         seq_id_num += 1
#         filtered_probe_df.at[index, 'SEQ_ID'] = ('TETRA00S00' + str(seq_id_num))
#         filtered_probe_df.at[index, 'CONTAINER'] = 'BLOCK1'
#         filtered_probe_df.at[index, 'DESIGN_NOTE'] = 'rank_selected'
#         filtered_probe_df.at[index, 'PROBE_CLASS'] = 'experimental'
#         filtered_probe_df.at[index, 'POSITION'] = '1'

In [15]:
# filtered_probe_df[filtered_probe_df['SEQ_ID'].str[8:].astype(int) > max_seq_id_num]

In [16]:
# (rows, columns) of the filtered design table.
filtered_probe_df.shape

(313654, 17)

In [17]:
# Number of distinct SEQ_IDs remaining in the filtered design table.
filtered_probe_df['SEQ_ID'].nunique(dropna=False)

23660

Note: the download described below is not performed by the notebook in the top-level directory!

Download the raw data in the formats that are available to me from GEO. Ron Pearlman submitted the .pair files. Wei Miao submitted some pre-processed tables that are very annoying for most of the chips, and then .pair files for the extra S0 and S24 chips he added in 2011. Yifan Liu submitted .pair files. Ron's data: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26650; Wei's data 1: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE11300; Wei's data 2: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26384; Yifan's data: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE26385

Based on the size of the microarray, this is a standard 1:2 design, which means the COUNT field in the xys file should be set to 1 for experimental probes and NA for control features. (http://mtweb.cs.ucl.ac.uk/mus/mus/binnaz/CNV/NimbleGene/DATA/OID8421-2/Documentation/NimbleGen_data_formats.pdf)


In [18]:
# Maps a sample description label (e.g. 'S1-3') to its GEO sample accession (GSM ID).
gse_convert_dict = {
    'L1-L': 'GSM283687',
    'L1-M': 'GSM283690',
    'L1-H': 'GSM283691',
    'L2-L': 'GSM284355',
    'L2-M': 'GSM284357',
    'L2-H': 'GSM284360',
    'L3-L': 'GSM284362',
    'L3-M': 'GSM284363',
    'L3-H': 'GSM284364',
    'S1-0': 'GSM285363',
    'S1-3': 'GSM285542',
    'S1-6': 'GSM285543',
    'S1-9': 'GSM285544',
    'S1-12': 'GSM285545',
    'S1-15': 'GSM285546',
    'S1-24': 'GSM285547',
    'S2-0': 'GSM285554',
    'S2-3': 'GSM285555',
    'S2-6': 'GSM285556',
    'S2-9': 'GSM285557',
    'S2-12': 'GSM285558',
    'S2-15': 'GSM285559',
    'S2-24': 'GSM285560',
    'S3-0': 'GSM285561',
    'S3-3': 'GSM285562',
    'S3-6': 'GSM285563',
    'S3-9': 'GSM285564',
    'S3-12': 'GSM285565',
    'S3-15': 'GSM285566',
    'S3-24': 'GSM285567',
    'C1-0': 'GSM285570',
    'C1-2': 'GSM285572',
    'C1-4': 'GSM285574',
    'C1-6': 'GSM285575',
    'C1-8': 'GSM285576',
    'C1-10': 'GSM285578',
    'C1-12': 'GSM285579',
    'C1-14': 'GSM285580',
    'C1-16': 'GSM285582',
    'C1-18': 'GSM285583',
    'C2-0': 'GSM285586',
    'C2-2': 'GSM285587',
    'C2-4': 'GSM285588',
    'C2-6': 'GSM285589',
    'C2-8': 'GSM285590',
    'C2-10': 'GSM285591',
    'C2-12': 'GSM285592',
    'C2-14': 'GSM285593',
    'C2-16': 'GSM285595',
    'C2-18': 'GSM285596',
}

In [19]:
# TODO: confirm that this mapping is correct.
# Assumption: the L-1, L-2 and L-3 sample descriptions correspond to
# L-l, L-m and L-h respectively.
# Translates the numeric L labels into the L/M/H labels used as keys of
# gse_convert_dict.
l_convert_dict = {
'L1-1': 'L1-L',
'L1-2': 'L1-M',
'L1-3': 'L1-H',
'L2-1': 'L2-L',
'L2-2': 'L2-M',
'L2-3': 'L2-H',
'L3-1': 'L3-L',
'L3-2': 'L3-M',
'L3-3': 'L3-H',
}

In [20]:
# NOTE: despite the "inv_" prefix this is a plain copy of gse_convert_dict
# (sample description -> GSM ID), not an inverted mapping. All_pair_to_xys
# looks up GSM IDs by sample description through this name, so the
# orientation must stay as it is.
inv_gse_convert_dict = dict(gse_convert_dict.items())
inv_gse_convert_dict

{'L1-L': 'GSM283687',
 'L1-M': 'GSM283690',
 'L1-H': 'GSM283691',
 'L2-L': 'GSM284355',
 'L2-M': 'GSM284357',
 'L2-H': 'GSM284360',
 'L3-L': 'GSM284362',
 'L3-M': 'GSM284363',
 'L3-H': 'GSM284364',
 'S1-0': 'GSM285363',
 'S1-3': 'GSM285542',
 'S1-6': 'GSM285543',
 'S1-9': 'GSM285544',
 'S1-12': 'GSM285545',
 'S1-15': 'GSM285546',
 'S1-24': 'GSM285547',
 'S2-0': 'GSM285554',
 'S2-3': 'GSM285555',
 'S2-6': 'GSM285556',
 'S2-9': 'GSM285557',
 'S2-12': 'GSM285558',
 'S2-15': 'GSM285559',
 'S2-24': 'GSM285560',
 'S3-0': 'GSM285561',
 'S3-3': 'GSM285562',
 'S3-6': 'GSM285563',
 'S3-9': 'GSM285564',
 'S3-12': 'GSM285565',
 'S3-15': 'GSM285566',
 'S3-24': 'GSM285567',
 'C1-0': 'GSM285570',
 'C1-2': 'GSM285572',
 'C1-4': 'GSM285574',
 'C1-6': 'GSM285575',
 'C1-8': 'GSM285576',
 'C1-10': 'GSM285578',
 'C1-12': 'GSM285579',
 'C1-14': 'GSM285580',
 'C1-16': 'GSM285582',
 'C1-18': 'GSM285583',
 'C2-0': 'GSM285586',
 'C2-2': 'GSM285587',
 'C2-4': 'GSM285588',
 'C2-6': 'GSM285589',
 'C2-8': 'GSM28559

In [21]:
# Sanity check that no entries were lost. inv_gse_convert_dict is built with
# the same keys as gse_convert_dict, so this is expected to be True.
len(inv_gse_convert_dict) == len(gse_convert_dict)

True

In [22]:
# Format of the JSON: keys are the physiological phase; values are the GEO
# accessions for each microarray of that phase.
# The single REP measurement for C-15m (GSM656231) is included even though
# there are no replicates for it, in order to replicate the 2011 analysis.
with open('../../new_raw_data/microarray_accessions_all.json') as accessions_file:
    all_geo = json.load(accessions_file)

In [23]:
# Invert all_geo: map each GEO accession back to its physiological phase.
# If an accession were listed under several phases, the last one wins.
inverse_all_geo = {
    accession: phase
    for phase, accessions in all_geo.items()
    for accession in accessions
}

inverse_all_geo

{'GSM283687': 'Ll',
 'GSM284355': 'Ll',
 'GSM284362': 'Ll',
 'GSM283690': 'Lm',
 'GSM284357': 'Lm',
 'GSM284363': 'Lm',
 'GSM283691': 'Lh',
 'GSM284360': 'Lh',
 'GSM284364': 'Lh',
 'GSM285363': 'S0',
 'GSM285554': 'S0',
 'GSM285561': 'S0',
 'GSM647244': 'S0',
 'GSM647651': 'S0',
 'GSM647652': 'S0',
 'GSM285542': 'S3',
 'GSM285555': 'S3',
 'GSM285562': 'S3',
 'GSM285543': 'S6',
 'GSM285556': 'S6',
 'GSM285563': 'S6',
 'GSM285544': 'S9',
 'GSM285557': 'S9',
 'GSM285564': 'S9',
 'GSM647653': 'S9',
 'GSM647654': 'S9',
 'GSM285545': 'S12',
 'GSM285558': 'S12',
 'GSM285565': 'S12',
 'GSM285546': 'S15',
 'GSM285559': 'S15',
 'GSM285566': 'S15',
 'GSM285547': 'S24',
 'GSM285560': 'S24',
 'GSM285567': 'S24',
 'GSM647245': 'S24',
 'GSM285570': 'C0',
 'GSM285586': 'C0',
 'GSM656230': 'C0',
 'GSM656231': 'C15m',
 'GSM285572': 'C2',
 'GSM285587': 'C2',
 'GSM656233': 'C2',
 'GSM285574': 'C4',
 'GSM285588': 'C4',
 'GSM656234': 'C4',
 'GSM285575': 'C6',
 'GSM285589': 'C6',
 'GSM656232': 'C6',
 'GSM285

In [24]:
# Collect the raw expression files, pattern by pattern, in this order:
# gzipped single-sample .pair text files, multi-sample All_pair tables,
# then any remaining *pair.gz files.
expression_dir = '../../new_raw_data/expression/'
raw_microarray_data_files = []
for pattern in ('*pair.txt.gz', 'All_pair*.txt', '*pair.gz'):
    raw_microarray_data_files.extend(glob.glob(expression_dir + pattern))
raw_microarray_data_files

['../../new_raw_data/expression/GSM656231_4257502_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656232_4257702_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656237_4258302_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656239_4261302_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656234_4257802_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656240_4261102_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM647653_13401502_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM647654_13401702_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656236_4257902_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656238_4259002_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656235_4258102_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656230_4257602_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM647652_13399602_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM647651_13398502_532_pair.txt.gz',
 '../../new_raw_data/expression/GSM656233_42

In [25]:
# Build a CHIP_ID -> SAMPLE_DESCRIPTION lookup from every sample key file.
raw_data_key_files = glob.glob('../../new_raw_data/expression/All_pair_sample_keys/*')
cid_desc_dict = {}
total_num_rows = 0

for key_path in raw_data_key_files:
    raw_data_key_df = pd.read_csv(key_path, sep='\t', comment='#')
    total_num_rows += len(raw_data_key_df)
    chip_ids = raw_data_key_df['CHIP_ID'].values
    descriptions = raw_data_key_df['SAMPLE_DESCRIPTION'].values
    cid_desc_dict.update(dict(zip(chip_ids, descriptions)))

# True only if no CHIP_ID occurred more than once across the key files.
len(cid_desc_dict) == total_num_rows

True

In [26]:
# Inspect the chip id -> sample description lookup.
cid_desc_dict

{124893: 'C1-12',
 124906: 'C1-4',
 1713302: 'C1-0',
 1714502: 'C1-2',
 1715102: 'C1-8',
 1715202: 'C1-6',
 1715902: 'C1-10',
 1717502: 'C1-14',
 1718802: 'C1-16',
 1719102: 'C1-18',
 1719202: 'C2-0',
 1724202: 'C2-2',
 1724302: 'C2-4',
 1725202: 'C2-6',
 1725502: 'C2-8',
 1725702: 'C2-12',
 1725802: 'C2-10',
 1727002: 'C2-14',
 1728902: 'C2-16',
 1732102: 'C2-18',
 108636: 'S2-0',
 116404: 'S2-9',
 124652: 'S2-3',
 124656: 'S2-6',
 124888: 'S3-0',
 124889: 'S3-15',
 124890: 'S3-24',
 124894: 'S3-3',
 124895: 'S3-6',
 124896: 'S3-9',
 124903: 'S3-12',
 124905: 'S2-24',
 124907: 'S2-12',
 1980802: 'S2-15',
 2242902: 'L3-2',
 2243002: 'L3-1',
 2243302: 'L3-3',
 2243602: 'S1-0',
 2243702: 'S1-6',
 2244402: 'S1-9',
 2244502: 'S1-12',
 2244602: 'S1-3',
 2254002: 'S1-15',
 2261602: 'S1-24',
 1473402: 'L1-2',
 1473702: 'L1-1',
 1491802: 'L1-3',
 1704502: 'L2-1',
 1704602: 'L2-3',
 1704902: 'L2-2'}

In [27]:
def pair_to_xys(filtered_probe_df, path_to_pair, xys_filename):
    """
    Convert a single-sample .pair file into an .xys file.

    The PM column of the .pair file is renamed to SIGNAL and joined to the
    probe design table on PROBE_ID. The X, Y, SIGNAL and COUNT columns of the
    joined table are written tab-separated to ``xys_filename``, preceded by a
    one-line ``# designname=... date=...`` header.

    Parameters
    ----------
    filtered_probe_df : pandas.DataFrame
        Probe design table; must provide PROBE_ID, X and Y columns.
    path_to_pair : str
        Path to the tab-separated .pair file (lines starting with '#' are
        skipped); must provide PROBE_ID and PM columns.
    xys_filename : str
        Destination path; an existing file is overwritten.

    Returns
    -------
    None
    """
    import os

    pair = pd.read_csv(path_to_pair, comment='#', sep='\t')
    probe_signal_df = pair[['PROBE_ID', 'PM']].rename(columns={'PM': 'SIGNAL'})

    # Inner join: only probes present in both tables are kept.
    m = filtered_probe_df.merge(probe_signal_df, on='PROBE_ID') # FIXME random probes named differently

    # Every retained feature gets COUNT = 1 (scalar assignment broadcasts).
    m['COUNT'] = 1

    xys = m[['X', 'Y', 'SIGNAL', 'COUNT']]

    # Write header and table in one pass instead of writing the table,
    # reading the whole file back and rewriting it with the header prepended.
    # newline='' lets pandas control line endings; the header uses os.linesep
    # so it matches the line terminator pandas uses for the rows.
    with open(xys_filename, 'w', newline='') as out:
        out.write('# designname=2021_tetrahymena_expr_corrected date=2021_11_07' + os.linesep)
        xys.to_csv(out, sep='\t', index=False)

In [28]:
def All_pair_to_xys(filtered_probe_df, f):
    """
    Split a multi-sample "All_pair" table into one .xys file per sample.

    The table at ``f`` is joined to the probe design table on PROBE_ID. Every
    column of the joined table whose underscore-separated name ends in '532'
    is treated as one sample's signal column; the leading token of the column
    name is the integer chip id. For each such column the output file
    ``../microarray_QC/{phase}_{gsm_id}.xys`` is written with columns
    X, Y, SIGNAL, COUNT beneath a one-line ``# designname=... date=...``
    header, and the output path is printed.

    Relies on these module-level lookups:
    ``cid_desc_dict`` (chip id -> sample description),
    ``l_convert_dict`` (fallback translation of sample descriptions that are
    not keys of ``inv_gse_convert_dict``),
    ``inv_gse_convert_dict`` (sample description -> GSM id) and
    ``inverse_all_geo`` (GSM id -> phase).

    Parameters
    ----------
    filtered_probe_df : pandas.DataFrame
        Probe design table; must provide PROBE_ID, X and Y columns.
    f : str
        Path to the tab-separated All_pair file (lines starting with '#' are
        skipped); must provide a PROBE_ID column.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If a chip id, sample description or GSM id is missing from the
        lookup dictionaries.
    """
    import os

    All_pair_df = pd.read_csv(f, comment='#', sep='\t')

    # Inner join: only probes present in both tables are kept.
    merged_df = filtered_probe_df.merge(All_pair_df, on='PROBE_ID')

    # Every retained feature gets COUNT = 1 (scalar assignment broadcasts).
    merged_df['COUNT'] = 1

    for col in list(merged_df.columns):

        split_col = col.split('_')

        # Only columns named '<chip id>_..._532' hold sample signals.
        if split_col[-1] != '532':
            continue

        sample_desc = cid_desc_dict[int(split_col[0])]

        # Descriptions unknown to inv_gse_convert_dict are translated first.
        if sample_desc not in inv_gse_convert_dict:
            sample_desc = l_convert_dict[sample_desc]

        gsm_id = inv_gse_convert_dict[sample_desc]
        phase = inverse_all_geo[gsm_id]
        identifier = f'{phase}_{gsm_id}'
        out_filename = f'../microarray_QC/{identifier}.xys'

        print(out_filename)

        # rename() returns a new frame, so no in-place mutation of a slice.
        xys = merged_df[['X', 'Y', col, 'COUNT']].rename(columns={col: 'SIGNAL'})

        # Write header and table in one pass. The handle has its own name so
        # the input-path parameter ``f`` is no longer shadowed. newline=''
        # lets pandas control line endings; the header uses os.linesep to match.
        with open(out_filename, 'w', newline='') as out:
            out.write('# designname=2021_tetrahymena_expr_corrected date=2021_11_07' + os.linesep)
            xys.to_csv(out, sep='\t', index=False)
            

In [29]:
# Convert every raw expression file to .xys, dispatching on the file type.
for pair_path in raw_microarray_data_files:

    if 'All' not in pair_path:
        # Single-sample .pair file: the GSM accession is the leading token of
        # the file name (before the first '_' and any '.').
        file_name = pair_path.split('/')[-1]
        gsm_code = file_name.split('_')[0].split('.')[0]

        phase = inverse_all_geo[gsm_code]
        identifier = f'{phase}_{gsm_code}'

        out_xys_path = f'../microarray_QC/{identifier}.xys'
        print(out_xys_path)
        probe_id_list = pair_to_xys(filtered_probe_df, pair_path, out_xys_path)

    else:
        # Multi-sample table: one .xys file is produced per sample column.
        probe_id_list = All_pair_to_xys(filtered_probe_df, pair_path)


../microarray_QC/C15m_GSM656231.xys
../microarray_QC/C6_GSM656232.xys
../microarray_QC/C12_GSM656237.xys
../microarray_QC/C16_GSM656239.xys
../microarray_QC/C4_GSM656234.xys
../microarray_QC/C18_GSM656240.xys
../microarray_QC/S9_GSM647653.xys
../microarray_QC/S9_GSM647654.xys
../microarray_QC/C8_GSM656236.xys
../microarray_QC/C14_GSM656238.xys
../microarray_QC/C10_GSM656235.xys
../microarray_QC/C0_GSM656230.xys
../microarray_QC/S0_GSM647652.xys
../microarray_QC/S0_GSM647651.xys
../microarray_QC/C2_GSM656233.xys
../microarray_QC/S0_GSM285554.xys
../microarray_QC/S9_GSM285557.xys
../microarray_QC/S3_GSM285555.xys
../microarray_QC/S6_GSM285556.xys
../microarray_QC/S0_GSM285561.xys
../microarray_QC/S15_GSM285566.xys
../microarray_QC/S24_GSM285567.xys
../microarray_QC/S3_GSM285562.xys
../microarray_QC/S6_GSM285563.xys
../microarray_QC/S9_GSM285564.xys
../microarray_QC/S12_GSM285565.xys
../microarray_QC/S24_GSM285560.xys
../microarray_QC/S12_GSM285558.xys
../microarray_QC/S15_GSM285559.xys
.

In [30]:
# max_probe_id_num = max([int(num[6:]) for num in (probe_df.loc[(probe_df['PROBE_CLASS'] == 'experimental')]['PROBE_ID'])])
# probe_id_num = max_probe_id_num

# for index, row in filtered_probe_df.iterrows():
#     if row['PROBE_ID'][:5] != 'TETRA':
#         probe_id_num += 1
#         filtered_probe_df.at[index, 'PROBE_ID'] = ('TETRAP00' + str(probe_id_num))

In [31]:
# filtered_probe_df.to_csv('../../new_raw_data/GSE11300/filtered_GPL6759.ndf.gz', sep='\t', index=False)

In [32]:
# new_df = (pd.read_csv('../../new_raw_data/GSE11300/filtered_GPL6759.ndf.gz', sep='\t'))
# new_df.loc[new_df['SEQ_ID'].str[8:].astype(int) > max_seq_id_num]