In [1]:
import pandas as pd
import numpy as np
import glob
import os
from scipy.stats import zscore, percentileofscore

In [2]:
file_paths = sorted(glob.glob('./rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_*'))
file_paths

['./rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_000min_A',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_000min_B',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_030min_A',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_030min_B',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_060min_A',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_060min_B',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_090min_A',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_090min_B',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_120min_A',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_120min_B',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_150min_A',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_150min_B',
 './rna_seq_count_data/kallisto_quant/kallisto_quant_p_trimmed_180min_A',
 './rna_seq_count_data/kallisto_quant/

In [3]:
# Load each sample's kallisto abundance table, keyed by "<timepoint>_<replicate>"
# (the last two underscore-separated tokens of the directory name, e.g. "000min_A").
dict_samtools_tpms = {}

for fp in file_paths:
    dir_stem, _ = os.path.splitext(os.path.basename(fp))
    key = '_'.join(dir_stem.split('_')[-2:])
    print(key)
    abundance_path = os.path.join(fp, 'abundance.tsv')
    dict_samtools_tpms[key] = pd.read_csv(abundance_path, delimiter='\t')

# Distinct values found among the (rows, columns) shapes of the loaded tables.
print(np.unique([table.shape for table in dict_samtools_tpms.values()]))

000min_A
000min_B
030min_A
030min_B
060min_A
060min_B
090min_A
090min_B
120min_A
120min_B
150min_A
150min_B
180min_A
180min_B
210min_A
210min_B
240min_A
240min_B
[    5 26258]


In [4]:
dict_samtools_tpms['000min_A'].head()

Unnamed: 0,target_id,length,eff_length,est_counts,tpm
0,TTHERM_01528530,4731,4542.35,76.0,0.176458
1,TTHERM_01528510,822,633.345,45704.0,761.064
2,TTHERM_01528500,660,471.395,932.0,20.8516
3,TTHERM_001528499,642,453.418,1.0,0.02326
4,TTHERM_0015284992,864,675.345,174.011,2.71743


In [5]:
ttherm_id = 'TTHERM_01528510'

dict_samtools_tpms['000min_A'].loc[dict_samtools_tpms['000min_A']['target_id'] == ttherm_id]

Unnamed: 0,target_id,length,eff_length,est_counts,tpm
1,TTHERM_01528510,822,633.345,45704.0,761.064


In [6]:
dict_samtools_tpms['000min_A'].shape

(26258, 5)

In [7]:
# Summary statistics of the 000min_A TPM column: first over every transcript,
# then restricted to transcripts whose TPM is non-zero.
sample_000a = dict_samtools_tpms['000min_A']
tpm_all = sample_000a['tpm']

df_tpm_wozeros = sample_000a[tpm_all != 0.0]
tpm_nonzero = df_tpm_wozeros['tpm']

# Number of transcripts with a non-zero TPM in this sample.
num_nonzero = df_tpm_wozeros.shape[0]

mean, std = tpm_all.mean(), tpm_all.std()
mean_wozeros, std_wozeros = tpm_nonzero.mean(), tpm_nonzero.std()

str_summary_wzeros = f'''WITH ZEROS...
MEAN: {mean}
MEDIAN: {tpm_all.median()}
STDEV: {std}
CV: {(std/mean)*100}
MAX: {tpm_all.max()}
MIN: {tpm_all.min()}
# GENES EXPRESSED: {num_nonzero}
'''

str_summary_wozeros = f'''WITHOUT ZEROS...
MEAN: {mean_wozeros}
MEDIAN: {tpm_nonzero.median()}
STDEV: {std_wozeros}
CV: {(std_wozeros/mean_wozeros)*100}
MAX: {tpm_nonzero.max()}
MIN: {tpm_nonzero.min()}
# GENES EXPRESSED: {num_nonzero}
'''

print(str_summary_wzeros + '\n' + str_summary_wozeros)


WITH ZEROS...
MEAN: 38.083634511677026
MEDIAN: 0.965159
STDEV: 305.1788603308457
CV: 801.3385913502377
MAX: 13508.5
MIN: 0.0
# GENES EXPRESSED: 23397

WITHOUT ZEROS...
MEAN: 42.74052549504703
MEDIAN: 1.38843
STDEV: 322.99238134689284
CV: 755.7052179536788
MAX: 13508.5
MIN: 4.09199e-09
# GENES EXPRESSED: 23397



In [8]:
columns = sorted(list(dict_samtools_tpms.keys()))
columns

['000min_A',
 '000min_B',
 '030min_A',
 '030min_B',
 '060min_A',
 '060min_B',
 '090min_A',
 '090min_B',
 '120min_A',
 '120min_B',
 '150min_A',
 '150min_B',
 '180min_A',
 '180min_B',
 '210min_A',
 '210min_B',
 '240min_A',
 '240min_B']

In [9]:
# Build one wide table: a TTHERM_ID column plus one TPM column per sample.
# Each sample table is sorted by target_id and its TPM values are copied over
# positionally, so the first sample's sorted IDs serve as the reference order.
curr_df = dict_samtools_tpms['000min_A'].sort_values(by='target_id')
dict_df_all_tpm = {'TTHERM_ID': curr_df['target_id'].values}
print(curr_df.head(3))

for col in columns:
    curr_df = dict_samtools_tpms[col].sort_values(by='target_id')
    print(curr_df.head(3))

    # Positional stitching is only valid if this sample lists exactly the same
    # IDs in the same sorted order; fail loudly instead of mixing up genes.
    if not np.array_equal(curr_df['target_id'].values, dict_df_all_tpm['TTHERM_ID']):
        raise ValueError(
            f"Sample {col!r} does not list the same target_id values as '000min_A'; "
            "TPM columns cannot be aligned by position."
        )

    dict_df_all_tpm[col] = curr_df['tpm'].values

df_all_tpm = pd.DataFrame(dict_df_all_tpm)

df_all_tpm.head()

              target_id  length  eff_length  est_counts       tpm
25657  TTHERM_000000023     450     263.370         0.0  0.000000
25655  TTHERM_000000031     360     177.202         0.0  0.000000
25653  TTHERM_000000042    2505    2316.340        30.0  0.136592
              target_id  length  eff_length  est_counts       tpm
25657  TTHERM_000000023     450     263.370         0.0  0.000000
25655  TTHERM_000000031     360     177.202         0.0  0.000000
25653  TTHERM_000000042    2505    2316.340        30.0  0.136592
              target_id  length  eff_length  est_counts       tpm
25657  TTHERM_000000023     450     265.294         0.0  0.000000
25655  TTHERM_000000031     360     179.258         0.0  0.000000
25653  TTHERM_000000042    2505    2318.780        17.0  0.070409
              target_id  length  eff_length  est_counts      tpm
25657  TTHERM_000000023     450     259.932         0.0  0.00000
25655  TTHERM_000000031     360     174.632         0.0  0.00000
25653  TTHERM

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
0,TTHERM_000000023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TTHERM_000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TTHERM_000000042,0.136592,0.070409,0.14783,0.164264,0.178087,0.214322,0.448846,0.55575,1.06863,0.77572,1.97505,0.796338,2.04667,1.17789,0.663651,1.12222,0.229403,0.683663
3,TTHERM_000000045,15.143,11.0441,14.0834,12.4635,10.7785,9.54018,8.98899,9.7859,9.66753,8.32047,8.61215,7.21779,8.66085,9.36559,10.4298,10.4721,10.1336,10.1815
4,TTHERM_000000090,0.066294,0.075172,0.07949,0.085412,0.0,0.013989,0.017948,0.064938,0.035283,0.044208,0.055672,0.141305,0.046786,0.111566,0.020646,0.049867,0.057828,0.061902


In [10]:
all_cols = list(df_all_tpm.columns)
all_cols

['TTHERM_ID',
 '000min_A',
 '000min_B',
 '030min_A',
 '030min_B',
 '060min_A',
 '060min_B',
 '090min_A',
 '090min_B',
 '120min_A',
 '120min_B',
 '150min_A',
 '150min_B',
 '180min_A',
 '180min_B',
 '210min_A',
 '210min_B',
 '240min_A',
 '240min_B']

In [11]:
# Genes whose TPM is exactly zero in every sample column
# (column 0 holds the transcript ID, so it is skipped).
is_all_zero = df_all_tpm.iloc[:, 1:].eq(0.0).all(axis=1)
rows_with_all_zeros = df_all_tpm.loc[is_all_zero]

print(len(rows_with_all_zeros))

rows_with_all_zeros.head()

869


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
0,TTHERM_000000023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,TTHERM_000013669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
122,TTHERM_000016169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
138,TTHERM_000019719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
144,TTHERM_000024119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Genes with a non-zero TPM in exactly one sample, i.e. zero in all samples but one.
# Column 0 is the transcript ID; every remaining column is one sample, so the
# threshold is derived from the frame instead of being hard-coded.
sample_tpms = df_all_tpm.iloc[:, 1:]
n_samples = sample_tpms.shape[1]
rows_with_one_nonzero = df_all_tpm[(sample_tpms == 0.0).sum(axis=1) == n_samples - 1]

print(len(rows_with_one_nonzero))

rows_with_one_nonzero.head()

405


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
1,TTHERM_000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,TTHERM_000001459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,TTHERM_000002618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.041273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,TTHERM_000002749,0.0,0.0,0.0,0.0,0.0,0.156516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
153,TTHERM_000024479,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.072192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# For every gene, the number of sample columns whose TPM is exactly zero.
# Computed once: it does not depend on the loop variable below.
zeros_per_gene = (df_all_tpm.iloc[:, 1:] == 0.0).sum(axis=1)

# Histogram: number of zero-TPM samples -> number of genes with that many zeros.
# NOTE: the upper bound of 20 is kept from the original cell; with the 18 sample
# columns of this table only the counts 0..18 can actually occur.
num_zeros_counts = {}

for i in range(20):
    num_zeros_counts[i] = int((zeros_per_gene == i).sum())

for n_zero, count in num_zeros_counts.items():
    print(f"Number of rows with {n_zero} zeros: {count}")

Number of rows with 0 zeros: 20734
Number of rows with 1 zeros: 587
Number of rows with 2 zeros: 369
Number of rows with 3 zeros: 323
Number of rows with 4 zeros: 265
Number of rows with 5 zeros: 244
Number of rows with 6 zeros: 217
Number of rows with 7 zeros: 209
Number of rows with 8 zeros: 187
Number of rows with 9 zeros: 218
Number of rows with 10 zeros: 223
Number of rows with 11 zeros: 173
Number of rows with 12 zeros: 195
Number of rows with 13 zeros: 197
Number of rows with 14 zeros: 227
Number of rows with 15 zeros: 287
Number of rows with 16 zeros: 329
Number of rows with 17 zeros: 405
Number of rows with 18 zeros: 869
Number of rows with 19 zeros: 0


In [14]:
# Remove the genes that are zero in every sample.
# Dropping by index label removes exactly those rows; the previous
# isin(...)/dropna() round trip would also have discarded any row that
# happened to contain a genuine NaN.
# The original row labels are kept on purpose (no reset_index): the next
# filtering step matches rows by index label.
df_no_all_zeros = df_all_tpm.drop(index=rows_with_all_zeros.index)
print(df_all_tpm.shape[0]-df_no_all_zeros.shape[0])
df_no_all_zeros.head()

869


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
1,TTHERM_000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.052389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TTHERM_000000042,0.136592,0.070409,0.14783,0.164264,0.178087,0.214322,0.448846,0.55575,1.06863,0.77572,1.97505,0.796338,2.04667,1.17789,0.663651,1.12222,0.229403,0.683663
3,TTHERM_000000045,15.143,11.0441,14.0834,12.4635,10.7785,9.54018,8.98899,9.7859,9.66753,8.32047,8.61215,7.21779,8.66085,9.36559,10.4298,10.4721,10.1336,10.1815
4,TTHERM_000000090,0.066294,0.075172,0.07949,0.085412,0.0,0.013989,0.017948,0.064938,0.035283,0.044208,0.055672,0.141305,0.046786,0.111566,0.020646,0.049867,0.057828,0.061902
5,TTHERM_00000010,2.89001,0.777601,3.45926,1.85791,0.646916,0.661484,0.218135,0.551436,0.279023,0.349789,0.327631,0.449172,0.557599,0.512233,0.42456,0.562777,0.560902,0.610772


In [15]:
# Remove the genes that have a non-zero TPM in only a single sample.
# Dropping by index label removes exactly those rows (both frames still carry
# the row labels of df_all_tpm); the previous isin(...)/dropna() round trip
# would also have discarded any row containing a genuine NaN.
df_no_all_zeros_or_one_nonzero = df_no_all_zeros.drop(index=rows_with_one_nonzero.index)
print(df_no_all_zeros.shape[0]-df_no_all_zeros_or_one_nonzero.shape[0])
df_no_all_zeros_or_one_nonzero.head()

405


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
2,TTHERM_000000042,0.136592,0.070409,0.14783,0.164264,0.178087,0.214322,0.448846,0.55575,1.06863,0.77572,1.97505,0.796338,2.04667,1.17789,0.663651,1.12222,0.229403,0.683663
3,TTHERM_000000045,15.143,11.0441,14.0834,12.4635,10.7785,9.54018,8.98899,9.7859,9.66753,8.32047,8.61215,7.21779,8.66085,9.36559,10.4298,10.4721,10.1336,10.1815
4,TTHERM_000000090,0.066294,0.075172,0.07949,0.085412,0.0,0.013989,0.017948,0.064938,0.035283,0.044208,0.055672,0.141305,0.046786,0.111566,0.020646,0.049867,0.057828,0.061902
5,TTHERM_00000010,2.89001,0.777601,3.45926,1.85791,0.646916,0.661484,0.218135,0.551436,0.279023,0.349789,0.327631,0.449172,0.557599,0.512233,0.42456,0.562777,0.560902,0.610772
6,TTHERM_00000020,0.071436,0.019483,0.116082,0.029454,0.071675,0.078132,0.100596,0.00698,0.137113,0.050942,0.120326,0.033927,0.033484,0.075352,0.097243,0.056909,0.273291,0.086634


In [16]:
df_cols = df_no_all_zeros_or_one_nonzero.columns[1:]
df_cols

Index(['000min_A', '000min_B', '030min_A', '030min_B', '060min_A', '060min_B',
       '090min_A', '090min_B', '120min_A', '120min_B', '150min_A', '150min_B',
       '180min_A', '180min_B', '210min_A', '210min_B', '240min_A', '240min_B'],
      dtype='object')

In [17]:
# Replicate-A sample columns, identified by their "_A" suffix (e.g. "000min_A").
# Matching the suffix instead of any "A" anywhere in the name avoids picking up
# unrelated columns that merely contain a capital A.
df_cols_a = [col for col in df_cols if col.endswith('_A')]
df_cols_a

['000min_A',
 '030min_A',
 '060min_A',
 '090min_A',
 '120min_A',
 '150min_A',
 '180min_A',
 '210min_A',
 '240min_A']

In [18]:
# For every time point, compare replicate A with replicate B and flag each gene:
#   <timepoint>_dup_disagree_A : A is non-zero while B is zero
#   <timepoint>_dup_disagree_B : B is non-zero while A is zero
#   <timepoint>_both_dup_zeros : A and B are both zero
# disagree_diff_list collects |A - B| for every disagreeing (gene, time point),
# ordered by time point and then by row order of the frame.
# Implemented with whole-column boolean masks instead of a Python-level row loop.
disagree_diff_list = []

for col_a in df_cols_a:

    col_b = col_a.replace('A', 'B')
    timepoint = col_a.replace("_A", "")

    tpm_a = df_no_all_zeros_or_one_nonzero[col_a]
    tpm_b = df_no_all_zeros_or_one_nonzero[col_b]
    a_is_zero = tpm_a == 0
    b_is_zero = tpm_b == 0

    only_a_nonzero = ~a_is_zero & b_is_zero
    only_b_nonzero = a_is_zero & ~b_is_zero
    both_zero = a_is_zero & b_is_zero

    # Absolute replicate difference wherever exactly one replicate is zero.
    abs_diff = (tpm_a - tpm_b).abs()
    disagree_diff_list.extend(abs_diff[only_a_nonzero | only_b_nonzero].tolist())

    # Store the flags as 0/1 integers so they can be summed later.
    df_no_all_zeros_or_one_nonzero[f'{timepoint}_dup_disagree_A'] = only_a_nonzero.astype(int)
    df_no_all_zeros_or_one_nonzero[f'{timepoint}_dup_disagree_B'] = only_b_nonzero.astype(int)
    df_no_all_zeros_or_one_nonzero[f'{timepoint}_both_dup_zeros'] = both_zero.astype(int)

In [19]:
disagree_diff_list

[0.110607,
 0.0425801,
 0.1448,
 0.0957452,
 0.0223926,
 0.163308,
 0.00486561,
 0.05904,
 0.0528052,
 0.0216261,
 0.00221789,
 0.0170343,
 0.103766,
 0.0749004,
 0.153668,
 0.0844313,
 0.0228084,
 0.0512065,
 0.0243883,
 0.0497158,
 0.0478173,
 0.0249193,
 0.00529191,
 0.0401753,
 0.112412,
 0.00729328,
 0.675323,
 0.118896,
 0.056172,
 0.131921,
 0.0145199,
 0.00141339,
 0.136847,
 0.105861,
 0.0481412,
 0.00865279,
 0.013412,
 0.0836241,
 0.033147,
 0.0716048,
 0.0160119,
 0.0237152,
 0.00889407,
 0.0183003,
 0.0141688,
 0.0970513,
 0.0392923,
 0.146851,
 0.00737649,
 0.00936228,
 0.0911867,
 0.0603212,
 0.00363879,
 0.00311425,
 0.163815,
 0.0346438,
 0.0911121,
 0.0799976,
 0.0110797,
 0.00733747,
 0.0651841,
 0.177833,
 0.148513,
 0.358513,
 0.0141068,
 0.0162558,
 0.0222371,
 0.0200489,
 0.00353866,
 0.029882,
 0.0165898,
 0.0578758,
 0.0614672,
 0.00497589,
 0.00329748,
 0.0883347,
 0.0716138,
 0.20027,
 0.16591,
 0.0921232,
 0.0155454,
 0.0116439,
 0.00968301,
 0.00391382,
 0.

In [20]:
len(disagree_diff_list)

13380

In [21]:
max(disagree_diff_list)

92.3898

In [22]:
min(disagree_diff_list)

4.06716e-10

In [23]:
np.mean(disagree_diff_list)

0.11080852237790931

In [24]:
np.median(disagree_diff_list)

0.0293283

In [25]:
sorted(disagree_diff_list, reverse=True)

[92.3898,
 83.2114,
 68.4363,
 52.3875,
 48.1487,
 19.4785,
 17.9159,
 16.5444,
 15.2821,
 11.7998,
 11.3433,
 10.8746,
 10.317,
 10.1115,
 8.68874,
 8.40294,
 7.9348,
 6.94269,
 6.45446,
 4.89072,
 4.88826,
 4.53282,
 4.11434,
 3.91781,
 3.08975,
 2.98242,
 2.37309,
 2.32203,
 2.06189,
 2.03023,
 2.02614,
 1.97705,
 1.91622,
 1.8718,
 1.84599,
 1.83473,
 1.76975,
 1.7431,
 1.67741,
 1.59263,
 1.59213,
 1.52267,
 1.48701,
 1.46795,
 1.46525,
 1.43541,
 1.3962,
 1.37882,
 1.34995,
 1.34553,
 1.32743,
 1.32698,
 1.28787,
 1.28457,
 1.27607,
 1.26199,
 1.2353,
 1.21095,
 1.18402,
 1.17734,
 1.15988,
 1.14335,
 1.134,
 1.11946,
 1.10136,
 1.03956,
 1.02472,
 1.02204,
 1.02162,
 1.02053,
 1.00691,
 1.00544,
 1.00299,
 1.00246,
 1.00075,
 0.995304,
 0.99285,
 0.979089,
 0.966338,
 0.963332,
 0.954042,
 0.949877,
 0.945716,
 0.931469,
 0.930476,
 0.926538,
 0.925511,
 0.92067,
 0.916474,
 0.906473,
 0.895078,
 0.889293,
 0.88275,
 0.876268,
 0.873221,
 0.865005,
 0.865003,
 0.860327,
 0.82808

In [26]:
df_no_all_zeros_or_one_nonzero.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,150min_both_dup_zeros,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros
2,TTHERM_000000042,0.136592,0.070409,0.14783,0.164264,0.178087,0.214322,0.448846,0.55575,1.06863,...,0,0,0,0,0,0,0,0,0,0
3,TTHERM_000000045,15.143,11.0441,14.0834,12.4635,10.7785,9.54018,8.98899,9.7859,9.66753,...,0,0,0,0,0,0,0,0,0,0
4,TTHERM_000000090,0.066294,0.075172,0.07949,0.085412,0.0,0.013989,0.017948,0.064938,0.035283,...,0,0,0,0,0,0,0,0,0,0
5,TTHERM_00000010,2.89001,0.777601,3.45926,1.85791,0.646916,0.661484,0.218135,0.551436,0.279023,...,0,0,0,0,0,0,0,0,0,0
6,TTHERM_00000020,0.071436,0.019483,0.116082,0.029454,0.071675,0.078132,0.100596,0.00698,0.137113,...,0,0,0,0,0,0,0,0,0,0


In [27]:
dup_disagree_cols = [col for col in df_no_all_zeros_or_one_nonzero.columns if 'dup_disagree' in col]
dup_disagree_cols

['000min_dup_disagree_A',
 '000min_dup_disagree_B',
 '030min_dup_disagree_A',
 '030min_dup_disagree_B',
 '060min_dup_disagree_A',
 '060min_dup_disagree_B',
 '090min_dup_disagree_A',
 '090min_dup_disagree_B',
 '120min_dup_disagree_A',
 '120min_dup_disagree_B',
 '150min_dup_disagree_A',
 '150min_dup_disagree_B',
 '180min_dup_disagree_A',
 '180min_dup_disagree_B',
 '210min_dup_disagree_A',
 '210min_dup_disagree_B',
 '240min_dup_disagree_A',
 '240min_dup_disagree_B']

In [28]:
# Number of genes flagged (non-zero entries) in each replicate-disagreement column.
dict_dup_disagree = {
    flag_col: np.count_nonzero(df_no_all_zeros_or_one_nonzero[flag_col].values)
    for flag_col in dup_disagree_cols
}

dict_dup_disagree

{'000min_dup_disagree_A': 531,
 '000min_dup_disagree_B': 846,
 '030min_dup_disagree_A': 556,
 '030min_dup_disagree_B': 895,
 '060min_dup_disagree_A': 434,
 '060min_dup_disagree_B': 1123,
 '090min_dup_disagree_A': 453,
 '090min_dup_disagree_B': 1177,
 '120min_dup_disagree_A': 451,
 '120min_dup_disagree_B': 1059,
 '150min_dup_disagree_A': 362,
 '150min_dup_disagree_B': 1182,
 '180min_dup_disagree_A': 497,
 '180min_dup_disagree_B': 958,
 '210min_dup_disagree_A': 484,
 '210min_dup_disagree_B': 1022,
 '240min_dup_disagree_A': 596,
 '240min_dup_disagree_B': 754}

In [29]:
# Per-gene totals of the replicate-disagreement flags across all time points:
# overall, and split by which replicate column ("A" / "B") carried the flag.
n_genes = df_no_all_zeros_or_one_nonzero.shape[0]
global_dup_disagree = np.zeros(n_genes)
global_dup_disagree_a = np.zeros(n_genes)
global_dup_disagree_b = np.zeros(n_genes)

for totals in (global_dup_disagree, global_dup_disagree_a, global_dup_disagree_b):
    print(len(totals))

for col in dup_disagree_cols:
    col_flags = df_no_all_zeros_or_one_nonzero[col].values
    global_dup_disagree += col_flags
    if 'A' in col:
        global_dup_disagree_a += col_flags
    if 'B' in col:
        global_dup_disagree_b += col_flags

for totals in (global_dup_disagree, global_dup_disagree_a, global_dup_disagree_b):
    print(len(totals))


24984
24984
24984
24984
24984
24984


In [30]:
np.count_nonzero(global_dup_disagree)

4202

In [31]:
np.count_nonzero(global_dup_disagree_a)

2505

In [32]:
np.count_nonzero(global_dup_disagree_b)

3774

In [33]:
both_dup_zeros = [col for col in df_no_all_zeros_or_one_nonzero.columns if 'both_dup_zeros' in col]
both_dup_zeros

['000min_both_dup_zeros',
 '030min_both_dup_zeros',
 '060min_both_dup_zeros',
 '090min_both_dup_zeros',
 '120min_both_dup_zeros',
 '150min_both_dup_zeros',
 '180min_both_dup_zeros',
 '210min_both_dup_zeros',
 '240min_both_dup_zeros']

In [34]:
# row_sum: per gene, the total of all dup_disagree and both_dup_zeros flags.
zero_flag_cols = list(dup_disagree_cols) + list(both_dup_zeros)
df_no_all_zeros_or_one_nonzero['row_sum'] = df_no_all_zeros_or_one_nonzero[zero_flag_cols].sum(axis=1)
df_no_all_zeros_or_one_nonzero.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros,row_sum
2,TTHERM_000000042,0.136592,0.070409,0.14783,0.164264,0.178087,0.214322,0.448846,0.55575,1.06863,...,0,0,0,0,0,0,0,0,0,0
3,TTHERM_000000045,15.143,11.0441,14.0834,12.4635,10.7785,9.54018,8.98899,9.7859,9.66753,...,0,0,0,0,0,0,0,0,0,0
4,TTHERM_000000090,0.066294,0.075172,0.07949,0.085412,0.0,0.013989,0.017948,0.064938,0.035283,...,0,0,0,0,0,0,0,0,0,1
5,TTHERM_00000010,2.89001,0.777601,3.45926,1.85791,0.646916,0.661484,0.218135,0.551436,0.279023,...,0,0,0,0,0,0,0,0,0,0
6,TTHERM_00000020,0.071436,0.019483,0.116082,0.029454,0.071675,0.078132,0.100596,0.00698,0.137113,...,0,0,0,0,0,0,0,0,0,0


In [35]:
df_no_all_zeros_or_one_nonzero['row_sum'].max()

9

In [36]:
df_no_all_zeros_or_one_nonzero.loc[df_no_all_zeros_or_one_nonzero['row_sum'] == df_no_all_zeros_or_one_nonzero['row_sum'].max()]

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros,row_sum
50,TTHERM_000010989,0.000000,0.022393,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,...,0,0,1,0,0,1,0,0,1,9
80,TTHERM_00001222,0.000000,0.000000,0.000000,0.260620,0.000000,0.0,0.039079,0.000000,0.0,...,0,0,1,0,0,1,0,0,1,9
95,TTHERM_000013649,0.021626,0.000000,0.010350,0.000000,0.043422,0.0,0.000000,0.000000,0.0,...,1,0,0,0,0,1,0,1,0,9
131,TTHERM_000016328,0.000000,0.153668,0.000000,0.070051,0.000000,0.0,0.000000,0.033424,0.0,...,0,0,1,0,1,0,0,1,0,9
136,TTHERM_000019709,0.000000,0.051207,0.000000,0.116837,0.000000,0.0,0.000000,0.056047,0.0,...,0,0,1,0,0,1,0,1,0,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26192,TTHERM_01639962,0.000000,0.002816,0.000000,0.003190,0.000000,0.0,0.000000,0.000000,0.0,...,0,1,0,0,0,1,0,1,0,9
26197,TTHERM_01645010,0.000000,0.000000,0.000000,0.007963,0.000000,0.0,0.000000,0.003771,0.0,...,1,0,0,0,1,0,0,0,1,9
26212,TTHERM_01662070,0.000000,0.068040,0.000000,0.000000,0.000000,0.0,0.081811,0.000000,0.0,...,0,0,1,0,0,1,0,0,1,9
26224,TTHERM_01683260,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.017551,0.0,...,0,0,1,1,0,0,0,0,1,9


In [37]:
# Keep only genes whose row_sum is below the number of time points (one
# replicate-A column per time point), i.e. drop genes flagged at every time point.
# The threshold is the fixed number of time points rather than the observed
# maximum of row_sum, so the filter stays correct even if no gene reaches it.
df_no_all_zeros_or_one_nonzero_or_all_dup_disagree = df_no_all_zeros_or_one_nonzero.loc[df_no_all_zeros_or_one_nonzero['row_sum'] < len(df_cols_a)]

In [38]:
df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros,row_sum
2,TTHERM_000000042,0.136592,0.070409,0.14783,0.164264,0.178087,0.214322,0.448846,0.55575,1.06863,...,0,0,0,0,0,0,0,0,0,0
3,TTHERM_000000045,15.143,11.0441,14.0834,12.4635,10.7785,9.54018,8.98899,9.7859,9.66753,...,0,0,0,0,0,0,0,0,0,0
4,TTHERM_000000090,0.066294,0.075172,0.07949,0.085412,0.0,0.013989,0.017948,0.064938,0.035283,...,0,0,0,0,0,0,0,0,0,1
5,TTHERM_00000010,2.89001,0.777601,3.45926,1.85791,0.646916,0.661484,0.218135,0.551436,0.279023,...,0,0,0,0,0,0,0,0,0,0
6,TTHERM_00000020,0.071436,0.019483,0.116082,0.029454,0.071675,0.078132,0.100596,0.00698,0.137113,...,0,0,0,0,0,0,0,0,0,0


In [39]:
df_no_all_zeros_or_one_nonzero.shape

(24984, 47)

In [40]:
df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.shape

(24102, 47)

In [41]:
df_no_all_zeros_or_one_nonzero.shape[0]- df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.shape[0]

882

In [42]:
df_filtered = df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.loc[:, all_cols]
df_filtered.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
2,TTHERM_000000042,0.136592,0.070409,0.14783,0.164264,0.178087,0.214322,0.448846,0.55575,1.06863,0.77572,1.97505,0.796338,2.04667,1.17789,0.663651,1.12222,0.229403,0.683663
3,TTHERM_000000045,15.143,11.0441,14.0834,12.4635,10.7785,9.54018,8.98899,9.7859,9.66753,8.32047,8.61215,7.21779,8.66085,9.36559,10.4298,10.4721,10.1336,10.1815
4,TTHERM_000000090,0.066294,0.075172,0.07949,0.085412,0.0,0.013989,0.017948,0.064938,0.035283,0.044208,0.055672,0.141305,0.046786,0.111566,0.020646,0.049867,0.057828,0.061902
5,TTHERM_00000010,2.89001,0.777601,3.45926,1.85791,0.646916,0.661484,0.218135,0.551436,0.279023,0.349789,0.327631,0.449172,0.557599,0.512233,0.42456,0.562777,0.560902,0.610772
6,TTHERM_00000020,0.071436,0.019483,0.116082,0.029454,0.071675,0.078132,0.100596,0.00698,0.137113,0.050942,0.120326,0.033927,0.033484,0.075352,0.097243,0.056909,0.273291,0.086634


In [43]:
df_filtered.shape

(24102, 19)

In [44]:
# After filtering: collect |A - B| for every (gene, time point) where exactly one
# of the two replicates is zero, ordered by time point and then by row order.
# Uses whole-column boolean masks instead of a Python-level row loop.
disagree_diff_after_list = []

for col_a in df_cols_a:

    col_b = col_a.replace('A', 'B')

    tpm_a = df_no_all_zeros_or_one_nonzero_or_all_dup_disagree[col_a]
    tpm_b = df_no_all_zeros_or_one_nonzero_or_all_dup_disagree[col_b]

    # XOR of the two "is zero" masks selects rows where the replicates disagree.
    exactly_one_zero = (tpm_a == 0) ^ (tpm_b == 0)

    disagree_diff_after_list.extend((tpm_a - tpm_b).abs()[exactly_one_zero].tolist())

In [45]:
sorted(disagree_diff_after_list, reverse=True)

[92.3898,
 83.2114,
 68.4363,
 52.3875,
 48.1487,
 19.4785,
 17.9159,
 16.5444,
 15.2821,
 11.7998,
 11.3433,
 10.8746,
 10.317,
 10.1115,
 8.68874,
 8.40294,
 7.9348,
 6.94269,
 6.45446,
 4.89072,
 4.88826,
 4.53282,
 4.11434,
 3.91781,
 3.08975,
 2.37309,
 2.06189,
 2.03023,
 2.02614,
 1.91622,
 1.8718,
 1.84599,
 1.83473,
 1.76975,
 1.7431,
 1.67741,
 1.59213,
 1.52267,
 1.48701,
 1.46795,
 1.46525,
 1.43541,
 1.3962,
 1.37882,
 1.34995,
 1.34553,
 1.32743,
 1.32698,
 1.28787,
 1.28457,
 1.27607,
 1.26199,
 1.2353,
 1.18402,
 1.17734,
 1.15988,
 1.14335,
 1.134,
 1.11946,
 1.10136,
 1.03956,
 1.02472,
 1.02204,
 1.02162,
 1.00691,
 1.00544,
 1.00299,
 1.00246,
 0.995304,
 0.99285,
 0.979089,
 0.966338,
 0.963332,
 0.954042,
 0.949877,
 0.945716,
 0.931469,
 0.930476,
 0.926538,
 0.925511,
 0.92067,
 0.916474,
 0.906473,
 0.895078,
 0.889293,
 0.88275,
 0.876268,
 0.873221,
 0.865005,
 0.865003,
 0.860327,
 0.828086,
 0.826637,
 0.822131,
 0.814716,
 0.807545,
 0.805157,
 0.803984,
 

In [46]:
len(disagree_diff_after_list)

10323

In [47]:
percentileofscore(disagree_diff_after_list, 1e-2)

21.45694081177952

In [48]:
df_filtered.to_csv('./tpm_kallisto.csv', index=False)

In [49]:
# Print the largest TPM in each sample column, one per line
# (the ID column at position 0 is skipped).
print(*[df_filtered[sample].max() for sample in df_filtered.columns[1:]], sep='\n')

13508.5
24398.7
15270.1
22449.2
15305.5
24273.0
16113.2
23832.1
18082.2
24274.9
16028.1
29467.0
16944.2
20913.7
15298.7
15879.7
15489.9
15632.0


In [50]:
df_filtered_num_only = (df_filtered.loc[:, list(df_filtered.columns)[1:]])
df_filtered_num_only

Unnamed: 0,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
2,0.136592,0.070409,0.147830,0.164264,0.178087,0.214322,0.448846,0.555750,1.068630,0.775720,1.975050,0.796338,2.046670,1.177890,0.663651,1.122220,0.229403,0.683663
3,15.143000,11.044100,14.083400,12.463500,10.778500,9.540180,8.988990,9.785900,9.667530,8.320470,8.612150,7.217790,8.660850,9.365590,10.429800,10.472100,10.133600,10.181500
4,0.066294,0.075172,0.079490,0.085412,0.000000,0.013989,0.017948,0.064938,0.035283,0.044208,0.055672,0.141305,0.046786,0.111566,0.020646,0.049867,0.057828,0.061902
5,2.890010,0.777601,3.459260,1.857910,0.646916,0.661484,0.218135,0.551436,0.279023,0.349789,0.327631,0.449172,0.557599,0.512233,0.424560,0.562777,0.560902,0.610772
6,0.071436,0.019483,0.116082,0.029454,0.071675,0.078132,0.100596,0.006980,0.137113,0.050942,0.120326,0.033927,0.033484,0.075352,0.097243,0.056909,0.273291,0.086634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26252,1.649220,1.733200,1.156570,2.026130,0.592805,1.411080,0.500443,1.373140,0.691911,1.728850,1.297400,1.410490,0.691233,1.161130,0.724120,0.934368,1.349120,1.164000
26254,9.899850,30.725600,16.823300,19.057400,12.432700,19.602300,5.970300,15.960800,3.297930,13.565800,3.816370,9.908730,3.556830,9.570900,5.529760,10.919600,6.569480,12.297400
26255,0.000000,0.027115,0.000000,0.061795,0.000000,0.076194,0.032445,0.000000,0.031836,0.026548,0.000000,0.028299,0.085001,0.000000,0.037601,0.000000,0.026248,0.028029
26256,0.000000,0.017311,0.000000,0.000000,0.000000,0.000000,0.000000,0.037428,0.000000,0.016966,0.021362,0.036151,0.000000,0.000000,0.023826,0.000000,0.000000,0.000000


In [51]:
all_tpms = (df_filtered_num_only.values).flatten()
all_tpms

array([0.136592 , 0.0704086, 0.14783  , ..., 0.394873 , 0.648661 ,
       0.625925 ])

In [52]:
max(all_tpms)

29467.0

In [53]:
percentileofscore(all_tpms, 93)

94.9109801860611

In [54]:
percentileofscore(all_tpms, 20)

85.66047999704958

In [55]:
percentileofscore(all_tpms, 10)

78.41799666233324

In [56]:
percentileofscore(all_tpms, 1)

44.57698300740372

In [57]:
percentileofscore(all_tpms, 0.1)

18.409491144118977

In [58]:
percentileofscore(all_tpms, 1e-2)
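# Working hypothesis: if one replicate is 0 and the other is in the bottom 5% of all TPMs, the 0 is probably a true zero.
# If one replicate is 0 but the other is above the bottom 5%, something may have gone wrong (open question).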

6.698614222886068

In [59]:
end_idx = int(np.floor(len(all_tpms) * 0.05))
end_idx

21691

In [60]:
sorted(all_tpms)[end_idx]

0.00366672

In [61]:
sorted(all_tpms, reverse=True)

[29467.0,
 24398.7,
 24274.9,
 24273.0,
 23832.1,
 22449.2,
 20913.7,
 18082.2,
 16944.2,
 16113.2,
 16028.1,
 15879.7,
 15692.2,
 15632.0,
 15489.9,
 15305.5,
 15298.7,
 15288.8,
 15270.1,
 15127.4,
 14984.4,
 14817.5,
 14747.5,
 14520.3,
 14422.4,
 14114.0,
 14105.5,
 13508.5,
 13301.2,
 13168.9,
 11770.4,
 11730.9,
 11694.3,
 11692.5,
 11556.8,
 11456.1,
 11410.3,
 11348.7,
 11247.2,
 11192.1,
 11110.9,
 11105.2,
 11083.2,
 11033.1,
 11021.9,
 10896.2,
 10765.2,
 10764.6,
 10646.9,
 10405.0,
 10155.4,
 9556.92,
 9549.52,
 9418.3,
 9266.74,
 9194.75,
 9185.62,
 9116.16,
 9050.85,
 9026.97,
 8982.14,
 8940.11,
 8866.24,
 8862.08,
 8850.86,
 8783.79,
 8778.95,
 8746.17,
 8740.45,
 8727.48,
 8697.71,
 8572.47,
 8560.49,
 8555.18,
 8528.99,
 8329.96,
 8316.84,
 8250.25,
 8226.5,
 8061.34,
 8020.83,
 7966.56,
 7917.74,
 7897.68,
 7869.5,
 7837.29,
 7819.8,
 7815.31,
 7803.95,
 7765.94,
 7758.91,
 7751.53,
 7739.26,
 7621.01,
 7619.26,
 7607.12,
 7582.12,
 7573.61,
 7544.25,
 7501.16,
 743

In [62]:
# quantile of 90
# geometric mean

In [63]:
def normalizer(array, axis=0):
    """Log-transform expression values and z-score them.

    Each value ``x`` is mapped to ``log10(x + 1)`` and the transformed values
    are then standardized with ``scipy.stats.zscore`` along ``axis``.

    Parameters
    ----------
    array : array-like of numbers
        Expression values (e.g. the TPMs of one gene across samples).
    axis : int, optional
        Axis along which to standardize. The default (0) standardizes a 1-D
        input as a whole.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``array`` holding the z-scores.

    Notes
    -----
    If the values along ``axis`` are all identical, the standard deviation is
    zero and the resulting z-scores are NaN.
    """
    # One vectorized log10 over the whole array instead of one call per element.
    log_values = np.log10(np.asarray(array, dtype=float) + 1)
    return zscore(log_values, axis=axis)


def normalize_expression_per_gene(expression_df):
    """Normalize an expression table row by row (one row per gene).

    Every row of the numeric columns is transformed with ``normalizer``
    (``log10(x + 1)`` followed by a z-score across that row).

    Parameters
    ----------
    expression_df : pandas.DataFrame
        One row per gene and one numeric column per sample. An optional
        ``'TTHERM_ID'`` column holding gene identifiers is carried through
        unchanged; it may sit at any column position.

    Returns
    -------
    pandas.DataFrame
        Normalized values with a fresh 0..n-1 index. If the input had a
        ``'TTHERM_ID'`` column it is the first column of the result, followed
        by the sample columns in their original order.
    """
    id_col = 'TTHERM_ID'
    has_ids = id_col in expression_df.columns

    # Select the ID column by name rather than assuming it is the first column.
    data = expression_df.drop(columns=id_col) if has_ids else expression_df

    # Standardize all rows at once (axis=1 -> across the samples of each gene).
    normalized_values = normalizer(data.to_numpy(dtype=float), axis=1)
    norm_expression_df = pd.DataFrame(normalized_values, columns=data.columns)

    if has_ids:
        norm_expression_df.insert(0, id_col, expression_df[id_col].values)

    return norm_expression_df


In [64]:
normalized_tpm_df = normalize_expression_per_gene(df_filtered)
normalized_tpm_df.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
0,TTHERM_000000042,-1.088602,-1.277884,-1.057561,-1.012709,-0.975471,-0.879894,-0.322777,-0.098171,0.800782,0.319073,1.947231,0.355495,2.022283,0.96317,0.113394,0.881475,-0.840952,0.151119
1,TTHERM_000000045,2.316764,0.495335,1.894589,1.188106,0.356672,-0.334071,-0.668065,-0.190768,-0.259389,-1.098814,-0.907196,-1.881777,-0.87577,-0.437935,0.169799,0.19277,0.006527,0.033223
2,TTHERM_000000090,0.287673,0.548611,0.674745,0.846946,-1.732447,-1.295248,-1.172623,0.247634,-0.641185,-0.371032,-0.027405,2.427201,-0.293448,1.596277,-1.089308,-0.200951,0.036792,0.157768
3,TTHERM_00000010,2.265341,0.070595,2.648077,1.40128,-0.143402,-0.118722,-0.98858,-0.310775,-0.851889,-0.700971,-0.747358,-0.501872,-0.299665,-0.382501,-0.549876,-0.290364,-0.293728,-0.205591
4,TTHERM_00000020,-0.145475,-1.102558,0.64061,-0.915149,-0.141197,-0.025523,0.371564,-1.340159,1.000075,-0.517364,0.713692,-0.831659,-0.839911,-0.075231,0.312811,-0.408346,3.178098,0.125722


In [65]:
# Sanity check: after per-gene z-scoring, the normalized values of a row should
# have a mean of ~0 and a standard deviation of ~1. The row labelled 2 is used
# as an arbitrary example.
# The first column (TTHERM_ID) is skipped so only the sample columns are included.
row_to_check = normalized_tpm_df.loc[2, normalized_tpm_df.columns[1:]] 

row_mean = np.mean(row_to_check)
row_std = np.std(row_to_check)

print(row_mean)
print(row_std)

-1.2181613742414913e-16
1.0


In [66]:
normalized_tpm_df.shape

(24102, 19)

In [67]:
normalized_tpm_df.to_csv('./kallisto.csv', index=False)