In [138]:
import pandas as pd
import numpy as np
import scipy
from statsmodels.stats import multitest 
from statsmodels.sandbox.stats.multicomp import multipletests 

import warnings
warnings.filterwarnings('ignore')

In [139]:
df = pd.read_csv('gene_high_throughput_sequencing.csv')
df.loc()

<pandas.core.indexing._LocIndexer at 0x1d0abd1bf70>

In [140]:
normal_patient = df[df.Diagnosis.values == 'normal']
early_neoplasia_patient = df[df.Diagnosis.values == 'early neoplasia']
cancer_patient = df[df.Diagnosis.values == 'cancer']

In [141]:
normal_neoplasia_pvalue = scipy.stats.ttest_ind(normal_patient.iloc[:,2:],early_neoplasia_patient.iloc[:,2:],equal_var=0).pvalue
cancer_neoplasia_pvalue = scipy.stats.ttest_ind(cancer_patient.iloc[:,2:],early_neoplasia_patient.iloc[:,2:],equal_var=0).pvalue

In [142]:
(normal_neoplasia_pvalue < 0.05).sum()

1575

In [143]:
(cancer_neoplasia_pvalue < 0.05).sum()

3490

In [144]:
normal_neoplasia_pvalue

array([6.90766016e-01, 3.17853128e-05, 6.02726639e-02, ...,
       6.70394526e-01, 7.93924592e-01, 6.61031158e-01])

In [145]:
normal_neoplasia_mask = multitest.multipletests(normal_neoplasia_pvalue,alpha = 0.025,method = 'holm')[0]

In [146]:
cancer_neoplasia_mask = multitest.multipletests(cancer_neoplasia_pvalue,alpha = 0.025,method = 'holm')[0]

In [175]:
def F(c,t,mask):
    c_mean = c.mean()
    t_mean = t.mean()
    c_mask = c_mean[mask]
    t_mask = t_mean[mask]
    res = 0
    for i,j in zip(c_mask,t_mask):
        if i > j:
            if i/j > 1.5:
                res+=1
        else:
            if j/i > 1.5:
                res+=1
    return res
    

In [176]:
F(normal_patient,early_neoplasia_patient,normal_neoplasia_mask)

2

In [177]:
F(early_neoplasia_patient,cancer_patient,cancer_neoplasia_mask)

77

In [173]:
cols = df_normal.columns[2:].to_numpy()
get_fold_changes(early_neoplasia_patient, cancer_patient, cols, 1.5, 'holm', 0.025)

IndexError: boolean index did not match indexed array along dimension 0; dimension is 15748 but corresponding boolean dimension is 1

In [174]:
cols

array(['LOC643837', 'LOC100130417', 'SAMD11', ..., 'CYorf15B', 'KDM5D',
       'EIF1AY'], dtype=object)

In [163]:
df_normal = df[df.Diagnosis == 'normal']
df_early = df[df.Diagnosis == 'early neoplasia']
df_cancer = df[df.Diagnosis == 'cancer']
cols = df_normal.columns[2:].to_numpy()

def compare_df_ttest_columns(df_control, df_treatment, cols):
    p_vals = [scipy.stats.ttest_ind(df_control[col], df_treatment[col], equal_var = False).pvalue for col in cols]
    return np.array(p_vals)

cols = df_normal.columns[2:].to_numpy()
p_vals_normal_early = compare_df_ttest_columns(df_normal, df_early, cols)
p_vals_early_cancer = compare_df_ttest_columns(df_early, df_cancer, cols)

In [164]:
def get_fold_change(C,T):
    if T>C:
        return T/C
    else:
        return -C/T
def get_fold_changes(df_control, df_treatment, cols, p_vals, method, alpha):
    reject = multipletests(p_vals, alpha = alpha, method = method) [0]
    means_control = df_control.loc[:,cols[reject]].mean()
    means_treatment = df_treatment.loc[:,cols[reject]].mean()
    df_means = pd.DataFrame([means_control, means_treatment])
    fold_changes = df_means.apply(lambda cols: get_fold_change(cols[0], cols[1]))
    return fold_changes
def get_practic_fold_change_count(df_control, df_treatment, cols, 
                                  p_vals, method, alpha, threshold):
    fold_changes = get_fold_changes(df_control, df_treatment, cols, p_vals, method, alpha)
    return np.sum(np.abs(fold_changes)>threshold)    

In [165]:
alpha = 0.05 / 2
practic_fc_normal_early = get_practic_fold_change_count(df_normal, df_early, cols, 
                                                        p_vals_normal_early, 'holm', alpha, 1.5)
practic_fc_early_cancer = get_practic_fold_change_count(df_early, df_cancer, cols, 
                                                        p_vals_early_cancer, 'holm', alpha, 1.5)

print(practic_fc_normal_early,'2-1-normal_early-holm.txt')
print(practic_fc_early_cancer,'2-2-early_cancer-holm.txt')

2 2-1-normal_early-holm.txt
77 2-2-early_cancer-holm.txt


In [166]:
p_vals_normal_early

array([6.90766016e-01, 3.17853128e-05, 6.02726639e-02, ...,
       6.70394526e-01, 7.93924592e-01, 6.61031158e-01])

In [167]:
normal_neoplasia_pvalue

array([6.90766016e-01, 3.17853128e-05, 6.02726639e-02, ...,
       6.70394526e-01, 7.93924592e-01, 6.61031158e-01])

In [171]:
early_neoplasia_patient

Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
24,STT5424_Breast_001_EN,early neoplasia,2.516305,11.430887,18.50655,13.969049,20.957007,10.374992,8.41433,68.513944,...,4.488498,1.314098,1.314098,1.314098,3.307073,1.314098,25.059902,1.314098,1.314098,1.314098
25,STT5431_Breast_002_EN,early neoplasia,1.93727,9.686352,23.541357,15.295034,18.815807,11.128772,9.155183,67.951908,...,3.709591,1.93727,1.93727,1.93727,1.93727,1.93727,26.349727,1.93727,1.93727,1.93727
26,STT5442_Breast_004_EN,early neoplasia,1.405858,15.119783,17.985461,17.237294,21.824785,4.801919,5.796501,67.064975,...,7.02929,1.405858,1.405858,1.405858,1.405858,1.405858,29.254009,1.405858,1.405858,1.405858
27,STT5445_Breast_005_EN,early neoplasia,2.131757,8.789458,12.731187,6.39527,19.185811,14.922297,4.082003,52.028259,...,9.453726,2.131757,2.131757,2.131757,2.131757,2.131757,23.18949,2.131757,2.131757,2.131757
28,STT5511_Breast_011_EN,early neoplasia,2.421766,7.830416,18.283935,15.22932,29.23497,15.77954,8.098199,50.224123,...,5.976863,1.264726,1.264726,1.264726,1.264726,1.264726,19.84989,1.264726,1.264726,1.264726
29,STT5426_Breast_023_EN,early neoplasia,4.668232,14.949208,27.070696,19.5996,34.110743,13.633645,12.859463,73.66346,...,2.168022,1.132213,1.132213,1.132213,2.849339,1.132213,9.495298,1.132213,1.132213,1.132213
30,STT5440_Breast_003_EN,early neoplasia,3.386331,12.379176,28.35047,14.547256,22.989898,14.25773,6.695258,79.737458,...,6.040419,1.768454,1.768454,1.768454,1.768454,1.768454,17.889667,1.768454,1.768454,1.768454
31,STT5449_Breast_006_EN,early neoplasia,1.24744,4.260819,11.410269,8.96654,16.470617,5.143327,4.722732,43.897364,...,10.461652,1.24744,1.24744,1.24744,1.24744,1.24744,22.349725,1.24744,1.24744,1.24744
32,STT5465_Breast_007_EN,early neoplasia,1.591747,8.37245,13.349184,9.855131,20.197009,11.441405,8.766661,48.376201,...,9.506166,1.591747,1.591747,1.591747,1.591747,1.591747,18.264679,1.591747,1.591747,1.591747
33,STT5476_Breast_009_EN,early neoplasia,0.979074,3.706715,17.68668,12.727965,26.435003,14.598825,11.120145,59.830433,...,7.56278,0.979074,0.979074,0.979074,0.979074,0.979074,22.461877,0.979074,0.979074,0.979074


In [172]:
df_early

Unnamed: 0,Patient_id,Diagnosis,LOC643837,LOC100130417,SAMD11,NOC2L,KLHL17,PLEKHN1,C1orf170,HES4,...,CLIC2,RPS4Y1,ZFY,PRKY,USP9Y,DDX3Y,CD24,CYorf15B,KDM5D,EIF1AY
24,STT5424_Breast_001_EN,early neoplasia,2.516305,11.430887,18.50655,13.969049,20.957007,10.374992,8.41433,68.513944,...,4.488498,1.314098,1.314098,1.314098,3.307073,1.314098,25.059902,1.314098,1.314098,1.314098
25,STT5431_Breast_002_EN,early neoplasia,1.93727,9.686352,23.541357,15.295034,18.815807,11.128772,9.155183,67.951908,...,3.709591,1.93727,1.93727,1.93727,1.93727,1.93727,26.349727,1.93727,1.93727,1.93727
26,STT5442_Breast_004_EN,early neoplasia,1.405858,15.119783,17.985461,17.237294,21.824785,4.801919,5.796501,67.064975,...,7.02929,1.405858,1.405858,1.405858,1.405858,1.405858,29.254009,1.405858,1.405858,1.405858
27,STT5445_Breast_005_EN,early neoplasia,2.131757,8.789458,12.731187,6.39527,19.185811,14.922297,4.082003,52.028259,...,9.453726,2.131757,2.131757,2.131757,2.131757,2.131757,23.18949,2.131757,2.131757,2.131757
28,STT5511_Breast_011_EN,early neoplasia,2.421766,7.830416,18.283935,15.22932,29.23497,15.77954,8.098199,50.224123,...,5.976863,1.264726,1.264726,1.264726,1.264726,1.264726,19.84989,1.264726,1.264726,1.264726
29,STT5426_Breast_023_EN,early neoplasia,4.668232,14.949208,27.070696,19.5996,34.110743,13.633645,12.859463,73.66346,...,2.168022,1.132213,1.132213,1.132213,2.849339,1.132213,9.495298,1.132213,1.132213,1.132213
30,STT5440_Breast_003_EN,early neoplasia,3.386331,12.379176,28.35047,14.547256,22.989898,14.25773,6.695258,79.737458,...,6.040419,1.768454,1.768454,1.768454,1.768454,1.768454,17.889667,1.768454,1.768454,1.768454
31,STT5449_Breast_006_EN,early neoplasia,1.24744,4.260819,11.410269,8.96654,16.470617,5.143327,4.722732,43.897364,...,10.461652,1.24744,1.24744,1.24744,1.24744,1.24744,22.349725,1.24744,1.24744,1.24744
32,STT5465_Breast_007_EN,early neoplasia,1.591747,8.37245,13.349184,9.855131,20.197009,11.441405,8.766661,48.376201,...,9.506166,1.591747,1.591747,1.591747,1.591747,1.591747,18.264679,1.591747,1.591747,1.591747
33,STT5476_Breast_009_EN,early neoplasia,0.979074,3.706715,17.68668,12.727965,26.435003,14.598825,11.120145,59.830433,...,7.56278,0.979074,0.979074,0.979074,0.979074,0.979074,22.461877,0.979074,0.979074,0.979074


In [178]:
alpha = 0.05 / 2
practic_fc_normal_early = get_practic_fold_change_count(df_normal, df_early, cols, 
                                                        p_vals_normal_early, 'fdr_bh', alpha, 1.5)
practic_fc_early_cancer = get_practic_fold_change_count(df_early, df_cancer, cols, 
                                                        p_vals_early_cancer, 'fdr_bh', alpha, 1.5)

print(practic_fc_normal_early,'3-1-normal_early-fdr_bh.txt')
print(practic_fc_early_cancer,'3-2-early_cancer-fdr_bh.txt')

4 3-1-normal_early-fdr_bh.txt
524 3-2-early_cancer-fdr_bh.txt
