In [1]:
import pandas as pd
import numpy as np
import glob
import os
from scipy.stats import zscore

In [2]:
file_paths = sorted(glob.glob('./rna_seq_count_data/samtools_idxstats/*'))

column_names = ["reference sequence name", "sequence length", "# mapped read-segments", "# unmapped read-segments"]

dict_samtools_tpms = {}

for fp in file_paths:
    base_name = os.path.splitext(os.path.basename(fp))[0]
    split = base_name.split('_')
    key = '_'.join(split[2:len(split)-1])
    print(key)
    dict_samtools_tpms[key] = (pd.read_csv(fp, header=None, names=column_names, delimiter='\t'))

print(np.unique([df.shape for df in dict_samtools_tpms.values()]))

000min_A
000min_B
030min_A
030min_B
060min_A
060min_B
090min_A
090min_B
120min_A
120min_B
150min_A
150min_B
180min_A
180min_B
210min_A
210min_B
240min_A
240min_B
[    4 26259]


In [3]:
dict_samtools_tpms['000min_A'].head()

Unnamed: 0,reference sequence name,sequence length,# mapped read-segments,# unmapped read-segments
0,TTHERM_01528530,4731,152,0
1,TTHERM_01528510,822,80928,6123
2,TTHERM_01528500,660,1670,138
3,TTHERM_001528499,642,0,0
4,TTHERM_0015284992,864,473,57


In [4]:
dict_samtools_tpms['000min_A'].shape

(26259, 4)

In [5]:
def computeRPK(row):
    if not row['sequence length'] > 0:
        print('ZERO sequence length:', row['reference sequence name'])
        print(row)
        return 'ZERO sequence length'
    return row['# mapped read-segments']/row['sequence length']

def computeTPM(row, normalization_factor):
    return row['rpk']/normalization_factor

def compute_tpm_column(df_dict):
    df_tpm = {}
    for key, df in df_dict.items():
        tpm_df = df
        tpm_df['rpk'] = tpm_df.apply(computeRPK, axis=1)
        tpm_df = tpm_df.loc[tpm_df['rpk']!='ZERO sequence length']

        sum_rpk = tpm_df['rpk'].sum()

        normalization_factor = sum_rpk/1_000_000

        tpm_df['tpm'] = tpm_df.apply(computeTPM, axis=1, args=(normalization_factor,))

        df_tpm[key] = tpm_df  

    return df_tpm


In [6]:
dict_samtools_tpms = compute_tpm_column(dict_samtools_tpms)

NONZERO sequence length: *
reference sequence name           *
sequence length                   0
# mapped read-segments            0
# unmapped read-segments    9166182
Name: 26258, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tpm_df['tpm'] = tpm_df.apply(computeTPM, axis=1, args=(normalization_factor,))


NONZERO sequence length: *
reference sequence name            *
sequence length                    0
# mapped read-segments             0
# unmapped read-segments    12312914
Name: 26258, dtype: object
NONZERO sequence length: *
reference sequence name           *
sequence length                   0
# mapped read-segments            0
# unmapped read-segments    9816882
Name: 26258, dtype: object
NONZERO sequence length: *
reference sequence name           *
sequence length                   0
# mapped read-segments            0
# unmapped read-segments    9324818
Name: 26258, dtype: object
NONZERO sequence length: *
reference sequence name           *
sequence length                   0
# mapped read-segments            0
# unmapped read-segments    8901980
Name: 26258, dtype: object
NONZERO sequence length: *
reference sequence name            *
sequence length                    0
# mapped read-segments             0
# unmapped read-segments    11163740
Name: 26258, dtype: object
NO

In [7]:
print(np.unique([df.shape for df in dict_samtools_tpms.values()]))

[    6 26258]


In [8]:
ttherm_id = 'TTHERM_01528510'

dict_samtools_tpms['000min_A'].loc[dict_samtools_tpms['000min_A']['reference sequence name'] == ttherm_id]

Unnamed: 0,reference sequence name,sequence length,# mapped read-segments,# unmapped read-segments,rpk,tpm
1,TTHERM_01528510,822,80928,6123,98.452555,899.291273


In [9]:
dict_samtools_tpms['000min_A'].shape

(26258, 6)

In [10]:
num_nonzero = (dict_samtools_tpms['000min_A'].loc[dict_samtools_tpms['000min_A']['tpm'] != 0.0].shape)[0]

mean = dict_samtools_tpms['000min_A']['tpm'].mean()

std = dict_samtools_tpms['000min_A']['tpm'].std()

str_summary_wzeros = f'''WITH ZEROS...
MEAN: {mean}
MEDIAN: {dict_samtools_tpms['000min_A']['tpm'].median()}
STDEV: {std}
CV: {(std/mean)*100}
MAX: {dict_samtools_tpms['000min_A']['tpm'].max()}
MIN: {dict_samtools_tpms['000min_A']['tpm'].min()}
# GENES EXPRESSED: {num_nonzero}
'''

df_tpm_wozeros = dict_samtools_tpms['000min_A'].loc[dict_samtools_tpms['000min_A']['tpm'] != 0.0]

mean_wozeros = df_tpm_wozeros['tpm'].mean()

std_wozeros = df_tpm_wozeros['tpm'].std()

str_summary_wozeros = f'''WITHOUT ZEROS...
MEAN: {mean_wozeros}
MEDIAN: {df_tpm_wozeros['tpm'].median()}
STDEV: {std_wozeros}
CV: {(std_wozeros/mean_wozeros)*100}
MAX: {df_tpm_wozeros['tpm'].max()}
MIN: {df_tpm_wozeros['tpm'].min()}
# GENES EXPRESSED: {num_nonzero}
'''

print(str_summary_wzeros, '\n', str_summary_wozeros, sep='')


WITH ZEROS...
MEAN: 38.08363165511461
MEDIAN: 1.3200862198528038
STDEV: 276.9536484350719
CV: 727.2248900608123
MAX: 12929.984835185876
MIN: 0.0
# GENES EXPRESSED: 23409

WITHOUT ZEROS...
MEAN: 42.71861249946599
MEDIAN: 1.9055599941945744
STDEV: 292.9862058718001
CV: 685.8514093252973
MAX: 12929.984835185876
MIN: 0.0008220176752118815
# GENES EXPRESSED: 23409



In [11]:
columns = sorted(list(dict_samtools_tpms.keys()))
columns

['000min_A',
 '000min_B',
 '030min_A',
 '030min_B',
 '060min_A',
 '060min_B',
 '090min_A',
 '090min_B',
 '120min_A',
 '120min_B',
 '150min_A',
 '150min_B',
 '180min_A',
 '180min_B',
 '210min_A',
 '210min_B',
 '240min_A',
 '240min_B']

In [12]:
curr_df = dict_samtools_tpms['000min_A'].sort_values(by='reference sequence name')
dict_df_all_tpm = {'TTHERM_ID': curr_df['reference sequence name'].values}
print(curr_df.head(3))

for col in columns:
    curr_df = dict_samtools_tpms[col].sort_values(by='reference sequence name')
    print(curr_df.head(3))
    dict_df_all_tpm[col] = (curr_df)['tpm'].values

df_all_tpm = pd.DataFrame(dict_df_all_tpm)

df_all_tpm.head()

      reference sequence name  sequence length  # mapped read-segments  \
25657        TTHERM_000000023              450                       0   
25655        TTHERM_000000031              360                       0   
25653        TTHERM_000000042             2505                      59   

       # unmapped read-segments       rpk       tpm  
25657                         0       0.0  0.000000  
25655                         0       0.0  0.000000  
25653                         1  0.023553  0.215138  
      reference sequence name  sequence length  # mapped read-segments  \
25657        TTHERM_000000023              450                       0   
25655        TTHERM_000000031              360                       0   
25653        TTHERM_000000042             2505                      59   

       # unmapped read-segments       rpk       tpm  
25657                         0       0.0  0.000000  
25655                         0       0.0  0.000000  
25653                       

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
0,TTHERM_000000023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TTHERM_000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,TTHERM_000000042,0.215138,0.11136,0.24239,0.260366,0.289015,0.365461,0.703124,0.899421,1.687416,1.270069,3.056255,1.323705,3.169158,1.892353,1.035637,1.775612,0.376082,1.098392
3,TTHERM_000000045,13.199006,10.483369,12.753884,11.539232,9.715775,9.070813,7.992147,9.959137,8.390814,7.727389,7.724427,6.98749,7.777138,8.39244,8.666536,9.449084,9.093164,9.332219
4,TTHERM_000000090,0.033215,0.109074,0.075763,0.072975,0.0,0.01018,0.024681,0.09499,0.048336,0.063742,0.049825,0.184793,0.041975,0.156502,0.027584,0.033908,0.069508,0.084971


In [13]:
all_cols = list(df_all_tpm.columns)
all_cols

['TTHERM_ID',
 '000min_A',
 '000min_B',
 '030min_A',
 '030min_B',
 '060min_A',
 '060min_B',
 '090min_A',
 '090min_B',
 '120min_A',
 '120min_B',
 '150min_A',
 '150min_B',
 '180min_A',
 '180min_B',
 '210min_A',
 '210min_B',
 '240min_A',
 '240min_B']

In [14]:
rows_with_all_zeros = df_all_tpm[(df_all_tpm.iloc[:, 1:] == 0.0).all(axis=1)]

print(len(rows_with_all_zeros))

rows_with_all_zeros.head()

923


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
0,TTHERM_000000023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,TTHERM_000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,TTHERM_000001459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,TTHERM_000002749,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99,TTHERM_000013669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
rows_with_one_nonzero = df_all_tpm[(df_all_tpm.iloc[:, 1:] == 0.0).sum(axis=1) == 17]

print(len(rows_with_one_nonzero))

rows_with_one_nonzero.head()

389


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
167,TTHERM_000028465,0.0,0.022595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
168,TTHERM_000028466,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032418,0.0,0.0,0.0,0.0,0.0,0.0,0.0
189,TTHERM_000030419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02076,0.0,0.0,0.0,0.0,0.0,0.0,0.0
222,TTHERM_000035707,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
233,TTHERM_000037459,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.025761,0.0


In [16]:
num_zeros_counts = {}

for i in range(20):
    num_zeros = (df_all_tpm.iloc[:, 1:] == 0.0).sum(axis=1)
    
    rows_with_num_zeros = df_all_tpm[num_zeros == i]
    
    num_zeros_counts[i] = len(rows_with_num_zeros)

for num_zeros, count in num_zeros_counts.items():
    print(f"Number of rows with {num_zeros} zeros: {count}")

Number of rows with 0 zeros: 20756
Number of rows with 1 zeros: 602
Number of rows with 2 zeros: 350
Number of rows with 3 zeros: 330
Number of rows with 4 zeros: 257
Number of rows with 5 zeros: 236
Number of rows with 6 zeros: 219
Number of rows with 7 zeros: 204
Number of rows with 8 zeros: 217
Number of rows with 9 zeros: 194
Number of rows with 10 zeros: 223
Number of rows with 11 zeros: 171
Number of rows with 12 zeros: 183
Number of rows with 13 zeros: 205
Number of rows with 14 zeros: 202
Number of rows with 15 zeros: 271
Number of rows with 16 zeros: 326
Number of rows with 17 zeros: 389
Number of rows with 18 zeros: 923
Number of rows with 19 zeros: 0


In [17]:
df_no_all_zeros = df_all_tpm[~df_all_tpm.isin(rows_with_all_zeros)].dropna()
df_no_all_zeros.reset_index()
print(df_all_tpm.shape[0]-df_no_all_zeros.shape[0])
df_no_all_zeros.head()

923


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
2,TTHERM_000000042,0.215138,0.11136,0.24239,0.260366,0.289015,0.365461,0.703124,0.899421,1.687416,1.270069,3.056255,1.323705,3.169158,1.892353,1.035637,1.775612,0.376082,1.098392
3,TTHERM_000000045,13.199006,10.483369,12.753884,11.539232,9.715775,9.070813,7.992147,9.959137,8.390814,7.727389,7.724427,6.98749,7.777138,8.39244,8.666536,9.449084,9.093164,9.332219
4,TTHERM_000000090,0.033215,0.109074,0.075763,0.072975,0.0,0.01018,0.024681,0.09499,0.048336,0.063742,0.049825,0.184793,0.041975,0.156502,0.027584,0.033908,0.069508,0.084971
5,TTHERM_00000010,3.70987,1.048381,4.493501,2.589711,0.730281,0.890175,0.271825,0.752919,0.371025,0.510555,0.457283,0.624434,0.679416,0.679,0.524731,0.709164,0.702554,0.808205
6,TTHERM_00000020,0.087777,0.021618,0.134072,0.036159,0.079059,0.105932,0.12841,0.005883,0.191602,0.05264,0.1543,0.057228,0.051996,0.105241,0.116176,0.078406,0.373931,0.126308


In [18]:
df_no_all_zeros_or_one_nonzero = df_no_all_zeros[~df_no_all_zeros.isin(rows_with_one_nonzero)].dropna()
df_no_all_zeros_or_one_nonzero.reset_index()
print(df_no_all_zeros.shape[0]-df_no_all_zeros_or_one_nonzero.shape[0])
df_no_all_zeros_or_one_nonzero.head()

389


Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
2,TTHERM_000000042,0.215138,0.11136,0.24239,0.260366,0.289015,0.365461,0.703124,0.899421,1.687416,1.270069,3.056255,1.323705,3.169158,1.892353,1.035637,1.775612,0.376082,1.098392
3,TTHERM_000000045,13.199006,10.483369,12.753884,11.539232,9.715775,9.070813,7.992147,9.959137,8.390814,7.727389,7.724427,6.98749,7.777138,8.39244,8.666536,9.449084,9.093164,9.332219
4,TTHERM_000000090,0.033215,0.109074,0.075763,0.072975,0.0,0.01018,0.024681,0.09499,0.048336,0.063742,0.049825,0.184793,0.041975,0.156502,0.027584,0.033908,0.069508,0.084971
5,TTHERM_00000010,3.70987,1.048381,4.493501,2.589711,0.730281,0.890175,0.271825,0.752919,0.371025,0.510555,0.457283,0.624434,0.679416,0.679,0.524731,0.709164,0.702554,0.808205
6,TTHERM_00000020,0.087777,0.021618,0.134072,0.036159,0.079059,0.105932,0.12841,0.005883,0.191602,0.05264,0.1543,0.057228,0.051996,0.105241,0.116176,0.078406,0.373931,0.126308


In [19]:
df_cols = df_no_all_zeros_or_one_nonzero.columns[1:]
df_cols

Index(['000min_A', '000min_B', '030min_A', '030min_B', '060min_A', '060min_B',
       '090min_A', '090min_B', '120min_A', '120min_B', '150min_A', '150min_B',
       '180min_A', '180min_B', '210min_A', '210min_B', '240min_A', '240min_B'],
      dtype='object')

In [20]:
df_cols_a = [col for col in df_cols if 'A' in col]
df_cols_a

['000min_A',
 '030min_A',
 '060min_A',
 '090min_A',
 '120min_A',
 '150min_A',
 '180min_A',
 '210min_A',
 '240min_A']

In [21]:
for col_a in df_cols_a:

    col_b = col_a.replace('A', 'B')
    dup_disagree_col_a = []
    dup_disagree_col_b = []
    both_dup_zeros = []

    for idx, row in df_no_all_zeros_or_one_nonzero.iterrows():
        if row[col_a] == 0 and row[col_b] == 0:
            both_dup_zeros.append(1)
        else:
            both_dup_zeros.append(0)

        if row[col_a] == 0 and row[col_b] != 0:
            dup_disagree_col_b.append(1)
            dup_disagree_col_a.append(0)
            continue
        if row[col_a] != 0 and row[col_b] == 0:
            dup_disagree_col_b.append(0)
            dup_disagree_col_a.append(1)
            continue
        dup_disagree_col_b.append(0)
        dup_disagree_col_a.append(0)

    df_no_all_zeros_or_one_nonzero[f'{col_a.replace("_A", "")}_dup_disagree_A'] = dup_disagree_col_a
    df_no_all_zeros_or_one_nonzero[f'{col_a.replace("_A", "")}_dup_disagree_B'] = dup_disagree_col_b
    df_no_all_zeros_or_one_nonzero[f'{col_a.replace("_A", "")}_both_dup_zeros'] = both_dup_zeros

In [22]:
df_no_all_zeros_or_one_nonzero.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,150min_both_dup_zeros,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros
2,TTHERM_000000042,0.215138,0.11136,0.24239,0.260366,0.289015,0.365461,0.703124,0.899421,1.687416,...,0,0,0,0,0,0,0,0,0,0
3,TTHERM_000000045,13.199006,10.483369,12.753884,11.539232,9.715775,9.070813,7.992147,9.959137,8.390814,...,0,0,0,0,0,0,0,0,0,0
4,TTHERM_000000090,0.033215,0.109074,0.075763,0.072975,0.0,0.01018,0.024681,0.09499,0.048336,...,0,0,0,0,0,0,0,0,0,0
5,TTHERM_00000010,3.70987,1.048381,4.493501,2.589711,0.730281,0.890175,0.271825,0.752919,0.371025,...,0,0,0,0,0,0,0,0,0,0
6,TTHERM_00000020,0.087777,0.021618,0.134072,0.036159,0.079059,0.105932,0.12841,0.005883,0.191602,...,0,0,0,0,0,0,0,0,0,0


In [23]:
dup_disagree_cols = [col for col in df_no_all_zeros_or_one_nonzero.columns if 'dup_disagree' in col]
dup_disagree_cols

['000min_dup_disagree_A',
 '000min_dup_disagree_B',
 '030min_dup_disagree_A',
 '030min_dup_disagree_B',
 '060min_dup_disagree_A',
 '060min_dup_disagree_B',
 '090min_dup_disagree_A',
 '090min_dup_disagree_B',
 '120min_dup_disagree_A',
 '120min_dup_disagree_B',
 '150min_dup_disagree_A',
 '150min_dup_disagree_B',
 '180min_dup_disagree_A',
 '180min_dup_disagree_B',
 '210min_dup_disagree_A',
 '210min_dup_disagree_B',
 '240min_dup_disagree_A',
 '240min_dup_disagree_B']

In [24]:
dict_dup_disagree ={}

for col in dup_disagree_cols:
    dict_dup_disagree[col] = np.count_nonzero(df_no_all_zeros_or_one_nonzero[col].values)


dict_dup_disagree

{'000min_dup_disagree_A': 517,
 '000min_dup_disagree_B': 820,
 '030min_dup_disagree_A': 527,
 '030min_dup_disagree_B': 881,
 '060min_dup_disagree_A': 448,
 '060min_dup_disagree_B': 1112,
 '090min_dup_disagree_A': 440,
 '090min_dup_disagree_B': 1153,
 '120min_dup_disagree_A': 432,
 '120min_dup_disagree_B': 1036,
 '150min_dup_disagree_A': 365,
 '150min_dup_disagree_B': 1166,
 '180min_dup_disagree_A': 498,
 '180min_dup_disagree_B': 946,
 '210min_dup_disagree_A': 471,
 '210min_dup_disagree_B': 1005,
 '240min_dup_disagree_A': 577,
 '240min_dup_disagree_B': 773}

In [25]:
global_dup_disagree = np.zeros(df_no_all_zeros_or_one_nonzero.shape[0])
global_dup_disagree_a = np.zeros(df_no_all_zeros_or_one_nonzero.shape[0])
global_dup_disagree_b = np.zeros(df_no_all_zeros_or_one_nonzero.shape[0])

print(len(global_dup_disagree))
print(len(global_dup_disagree_a))
print(len(global_dup_disagree_b))

for col in dup_disagree_cols:
    global_dup_disagree += df_no_all_zeros_or_one_nonzero[col].values
    if 'A' in col:
        global_dup_disagree_a += df_no_all_zeros_or_one_nonzero[col].values
    if 'B' in col:
        global_dup_disagree_b += df_no_all_zeros_or_one_nonzero[col].values

print(len(global_dup_disagree))
print(len(global_dup_disagree_a))
print(len(global_dup_disagree_b))


24946
24946
24946
24946
24946
24946


In [26]:
np.count_nonzero(global_dup_disagree)

4134

In [27]:
np.count_nonzero(global_dup_disagree_a)

2478

In [28]:
np.count_nonzero(global_dup_disagree_b)

3702

In [29]:
both_dup_zeros = [col for col in df_no_all_zeros_or_one_nonzero.columns if 'both_dup_zeros' in col]
both_dup_zeros

['000min_both_dup_zeros',
 '030min_both_dup_zeros',
 '060min_both_dup_zeros',
 '090min_both_dup_zeros',
 '120min_both_dup_zeros',
 '150min_both_dup_zeros',
 '180min_both_dup_zeros',
 '210min_both_dup_zeros',
 '240min_both_dup_zeros']

In [30]:
df_no_all_zeros_or_one_nonzero['row_sum'] = df_no_all_zeros_or_one_nonzero[dup_disagree_cols].sum(axis=1) + df_no_all_zeros_or_one_nonzero[both_dup_zeros].sum(axis=1)
df_no_all_zeros_or_one_nonzero.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros,row_sum
2,TTHERM_000000042,0.215138,0.11136,0.24239,0.260366,0.289015,0.365461,0.703124,0.899421,1.687416,...,0,0,0,0,0,0,0,0,0,0
3,TTHERM_000000045,13.199006,10.483369,12.753884,11.539232,9.715775,9.070813,7.992147,9.959137,8.390814,...,0,0,0,0,0,0,0,0,0,0
4,TTHERM_000000090,0.033215,0.109074,0.075763,0.072975,0.0,0.01018,0.024681,0.09499,0.048336,...,0,0,0,0,0,0,0,0,0,1
5,TTHERM_00000010,3.70987,1.048381,4.493501,2.589711,0.730281,0.890175,0.271825,0.752919,0.371025,...,0,0,0,0,0,0,0,0,0,0
6,TTHERM_00000020,0.087777,0.021618,0.134072,0.036159,0.079059,0.105932,0.12841,0.005883,0.191602,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df_no_all_zeros_or_one_nonzero['row_sum'].max()

9

In [32]:
df_no_all_zeros_or_one_nonzero.loc[df_no_all_zeros_or_one_nonzero['row_sum'] == df_no_all_zeros_or_one_nonzero['row_sum'].max()]

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros,row_sum
23,TTHERM_000002618,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.022365,0.000000,...,0,0,1,0,0,1,0,0,1,9
24,TTHERM_000002619,0.000000,0.000000,0.031004,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0,0,1,0,0,1,1,0,0,9
50,TTHERM_000010989,0.000000,0.029264,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,0,0,1,0,0,1,0,0,1,9
131,TTHERM_000016328,0.000000,0.126486,0.000000,0.060446,0.000000,0.0,0.000000,0.019670,0.000000,...,0,0,1,0,0,1,0,1,0,9
133,TTHERM_000016369,0.014031,0.000000,0.000000,0.030827,0.000000,0.0,0.000000,0.000000,0.015314,...,0,0,1,1,0,0,0,0,1,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26178,TTHERM_01615782,0.069199,0.000000,0.000000,0.038008,0.000000,0.0,0.000000,0.000000,0.000000,...,0,1,0,0,0,1,1,0,0,9
26197,TTHERM_01645010,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,...,1,0,0,0,1,0,0,0,1,9
26212,TTHERM_01662070,0.000000,0.056595,0.000000,0.000000,0.000000,0.0,0.000000,0.154023,0.000000,...,0,0,1,0,0,1,0,1,0,9
26229,TTHERM_01731490,0.000000,0.000000,0.018204,0.000000,0.014377,0.0,0.000000,0.000000,0.005081,...,1,0,0,1,0,0,0,0,1,9


In [33]:
df_no_all_zeros_or_one_nonzero_or_all_dup_disagree = df_no_all_zeros_or_one_nonzero.loc[df_no_all_zeros_or_one_nonzero['row_sum'] < df_no_all_zeros_or_one_nonzero['row_sum'].max()]

In [34]:
df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,...,180min_dup_disagree_A,180min_dup_disagree_B,180min_both_dup_zeros,210min_dup_disagree_A,210min_dup_disagree_B,210min_both_dup_zeros,240min_dup_disagree_A,240min_dup_disagree_B,240min_both_dup_zeros,row_sum
2,TTHERM_000000042,0.215138,0.11136,0.24239,0.260366,0.289015,0.365461,0.703124,0.899421,1.687416,...,0,0,0,0,0,0,0,0,0,0
3,TTHERM_000000045,13.199006,10.483369,12.753884,11.539232,9.715775,9.070813,7.992147,9.959137,8.390814,...,0,0,0,0,0,0,0,0,0,0
4,TTHERM_000000090,0.033215,0.109074,0.075763,0.072975,0.0,0.01018,0.024681,0.09499,0.048336,...,0,0,0,0,0,0,0,0,0,1
5,TTHERM_00000010,3.70987,1.048381,4.493501,2.589711,0.730281,0.890175,0.271825,0.752919,0.371025,...,0,0,0,0,0,0,0,0,0,0
6,TTHERM_00000020,0.087777,0.021618,0.134072,0.036159,0.079059,0.105932,0.12841,0.005883,0.191602,...,0,0,0,0,0,0,0,0,0,0


In [35]:
df_no_all_zeros_or_one_nonzero.shape

(24946, 47)

In [36]:
df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.shape

(24108, 47)

In [37]:
df_no_all_zeros_or_one_nonzero.shape[0]- df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.shape[0]

838

In [38]:
df_filtered = df_no_all_zeros_or_one_nonzero_or_all_dup_disagree.loc[:, all_cols]
df_filtered.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
2,TTHERM_000000042,0.215138,0.11136,0.24239,0.260366,0.289015,0.365461,0.703124,0.899421,1.687416,1.270069,3.056255,1.323705,3.169158,1.892353,1.035637,1.775612,0.376082,1.098392
3,TTHERM_000000045,13.199006,10.483369,12.753884,11.539232,9.715775,9.070813,7.992147,9.959137,8.390814,7.727389,7.724427,6.98749,7.777138,8.39244,8.666536,9.449084,9.093164,9.332219
4,TTHERM_000000090,0.033215,0.109074,0.075763,0.072975,0.0,0.01018,0.024681,0.09499,0.048336,0.063742,0.049825,0.184793,0.041975,0.156502,0.027584,0.033908,0.069508,0.084971
5,TTHERM_00000010,3.70987,1.048381,4.493501,2.589711,0.730281,0.890175,0.271825,0.752919,0.371025,0.510555,0.457283,0.624434,0.679416,0.679,0.524731,0.709164,0.702554,0.808205
6,TTHERM_00000020,0.087777,0.021618,0.134072,0.036159,0.079059,0.105932,0.12841,0.005883,0.191602,0.05264,0.1543,0.057228,0.051996,0.105241,0.116176,0.078406,0.373931,0.126308


In [39]:
df_filtered.shape

(24108, 19)

In [40]:
df_filtered.to_csv('./tpm_manual.csv', index=False)

In [41]:
def normalizer(array):
    log_a = [np.log10(tpm + 1) for tpm in array]
    normalized = zscore(log_a)
    return normalized

def normalize_expression_per_gene(expression_df):
    if 'TTHERM_ID' in expression_df.columns:
        ttids = expression_df['TTHERM_ID'].values
        data = expression_df[list(expression_df.columns)[1:]]
        
        norm_expression_df = pd.DataFrame(data.apply(normalizer, axis=1).tolist(), columns=data.columns)

        norm_expression_df['TTHERM_ID'] = ttids

        columns = norm_expression_df.columns.tolist()

        rearrangement = columns[-1:] + columns[:-1]

        norm_expression_df = norm_expression_df[rearrangement]
    else:
        norm_expression_df = pd.DataFrame(expression_df.apply(normalizer, axis=1).tolist(), columns=expression_df.columns)

    return norm_expression_df


In [42]:
normalized_tpm_df = normalize_expression_per_gene(df_filtered)
normalized_tpm_df.head()

Unnamed: 0,TTHERM_ID,000min_A,000min_B,030min_A,030min_B,060min_A,060min_B,090min_A,090min_B,120min_A,120min_B,150min_A,150min_B,180min_A,180min_B,210min_A,210min_B,240min_A,240min_B
0,TTHERM_000000042,-1.145278,-1.366622,-1.090288,-1.054669,-0.998943,-0.856095,-0.308216,-0.037751,0.822677,0.404229,1.843396,0.46213,1.911465,1.004889,0.133971,0.902739,-0.836885,0.209251
1,TTHERM_000000045,2.132869,0.755428,1.926188,1.326219,0.306498,-0.096312,-0.831452,0.45222,-0.549956,-1.02538,-1.027583,-1.600241,-0.988495,-0.548833,-0.362177,0.142958,-0.081926,0.069974
2,TTHERM_000000090,-0.696709,0.949364,0.240858,0.180576,-1.455876,-1.220545,-0.889407,0.652444,-0.359168,-0.020221,-0.326191,2.483759,-0.500574,1.922257,-0.823688,-0.681138,0.105379,0.438881
3,TTHERM_00000010,2.212097,0.133578,2.596304,1.534106,-0.287727,-0.067082,-1.056191,-0.255277,-0.868698,-0.626752,-0.716381,-0.445309,-0.362213,-0.362831,-0.603433,-0.31838,-0.328054,-0.177758
4,TTHERM_00000020,-0.207069,-1.126478,0.403628,-0.919401,-0.324971,0.035467,0.330289,-1.353909,1.128683,-0.688178,0.662676,-0.624456,-0.697145,0.026312,0.170559,-0.333835,3.214856,0.302974


In [43]:
# sanity check
row_to_check = normalized_tpm_df.loc[2, normalized_tpm_df.columns[1:]] 

row_mean = np.mean(row_to_check)
row_std = np.std(row_to_check)

print(row_mean)
print(row_std)

5.2427198385076835e-17
0.9999999999999999


In [44]:
normalized_tpm_df.shape

(24108, 19)

In [45]:
normalized_tpm_df.to_csv('./manual.csv', index=False)