In [2]:
import pandas as pd
import numpy as np
import glob
import gzip
import shutil
import os
from tqdm import tqdm
import plotly.express as px
from scipy.stats import ttest_ind

from Bio import SeqIO
from Bio import Entrez

# set infinite display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [3]:
# import the TPM dataframe
TPM = pd.read_csv('../../results/TPM.tsv', sep='\t', index_col=0)
# preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Unnamed: 0_level_0,52aef68b-63de-46b1-bb88-fb11d875c803,98460c61-c4ef-43ff-b7ac-b5c1b7b0ae26,5810cc10-f53a-451a-8001-c68765e30565,2042bb7e-ff24-439d-b33c-70c15cc3d201,dba512d3-58c5-47ed-a3f5-ee3d81e0b0c9,4aa493c6-6177-4d64-add9-f0c0198e2530,53b68f98-1d15-42d5-b374-b2163590b3a8,1748692d-651e-4bb1-ab9e-8460d1c8b588,eb0750ce-bed8-41a0-b93c-d72d02c65303,cf4aa3e0-2256-42e3-a7cf-9ea08900b622
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENST00000456328.2,0.088142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147231
ENST00000450305.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENST00000488147.1,3.235923,2.721829,5.28927,13.728161,2.106743,6.399453,2.024906,1.171819,0.643958,4.371894
ENST00000619216.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENST00000473358.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# print shape of dataframe
print('Before removing non-performing transcripts: ', TPM.shape)
# find genes with 0 TPM in all samples
TPM = TPM.loc[(TPM != 0).any(axis=1)]
# print shape of dataframe
print('After removing non-performing transcripts: ', TPM.shape)

# # find genes with 0 TPM in at least 99% of samples
# TPM = TPM.loc[(TPM == 0).sum(axis=1) <= 0.99 * TPM.shape[1]]
# # print shape of dataframe
# print('After removing genes with 0 TPM in at least 99% of samples: ', TPM.shape)

Before removing non-performing transcripts:  (252045, 133)
After removing non-performing transcripts:  (233502, 133)


In [5]:
# transpose the dataframe
TPM = TPM.T

In [6]:
# preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Name,ENST00000456328.2,ENST00000488147.1,ENST00000473358.1,ENST00000469289.1,ENST00000417324.1,ENST00000461467.1,ENST00000606857.1,ENST00000642116.1,ENST00000466430.5,ENST00000477740.5
52aef68b-63de-46b1-bb88-fb11d875c803,0.088142,3.235923,0.0,0.0,0.0,0.0,0.0,0.0,0.200852,0.0
98460c61-c4ef-43ff-b7ac-b5c1b7b0ae26,0.0,2.721829,0.0,0.0,0.0,0.0,0.0,0.0,0.094394,0.0
5810cc10-f53a-451a-8001-c68765e30565,0.0,5.28927,0.0,0.0,0.0,0.0,0.0,0.0,0.197039,0.0
2042bb7e-ff24-439d-b33c-70c15cc3d201,0.0,13.728161,0.0,0.0,0.0,0.0,0.0,0.0,0.382214,0.0
dba512d3-58c5-47ed-a3f5-ee3d81e0b0c9,0.0,2.106743,0.0,0.0,0.0,0.0,0.0,0.0,0.073251,0.0


In [7]:
# turn the index into a column
TPM.reset_index(inplace=True)

In [8]:
# import the metadata
metadata = pd.read_csv('../../results/metadata.tsv', sep='\t', index_col=0)
# take tissue_type, gender, race, age_at_diagnosis/365, ajcc_pathologic_stage, primary_diagnosis, morphology, tissue_or_organ_of_origin, tumor_focality, disease_type, primary_site from the metadata and merge it with the TPM_pca_df into a new dataframe called TPM_pre_cluster
columns = ['read_group_id', 'tissue_type', 'gender', 'race', 'age_at_diagnosis', 'ajcc_pathologic_stage', 'primary_diagnosis', 'tissue_or_organ_of_origin', 'tumor_focality', 'disease_type', 'primary_site']
# Create a new dataframe with selected columns from metadata
selected_metadata = metadata[columns]
# preview the dataframe
selected_metadata.head()

Unnamed: 0_level_0,read_group_id,tissue_type,gender,race,age_at_diagnosis,ajcc_pathologic_stage,primary_diagnosis,tissue_or_organ_of_origin,tumor_focality,disease_type,primary_site
case_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
5705efcc-b48f-435c-8a28-9e0d407ecadd,71894d8b-5210-44dc-aadc-a199d3843dd2,Tumor,female,white,22142.0,Stage I,"Endometrioid adenocarcinoma, NOS",Corpus uteri,Unifocal,Adenomas and Adenocarcinomas,"Uterus, NOS"
5705efcc-b48f-435c-8a28-9e0d407ecadd,86774648-bb57-42c3-b835-9fb11b590d8b,Tumor,female,white,22142.0,Stage I,"Endometrioid adenocarcinoma, NOS",Corpus uteri,Unifocal,Adenomas and Adenocarcinomas,"Uterus, NOS"
763e0702-8379-4b5e-95d1-a84f412c51e7,c2980255-7c57-4b79-82a7-f77098ff164e,Tumor,female,white,22179.0,Stage II,"Renal cell carcinoma, NOS","Kidney, NOS",Unifocal,Adenomas and Adenocarcinomas,Kidney
763e0702-8379-4b5e-95d1-a84f412c51e7,8062c6e4-d501-4c91-ab02-f36f4e7fd387,Tumor,female,white,22179.0,Stage II,"Renal cell carcinoma, NOS","Kidney, NOS",Unifocal,Adenomas and Adenocarcinomas,Kidney
763e0702-8379-4b5e-95d1-a84f412c51e7,39c8b5e7-ac68-4009-ab82-e1ee495bdbd9,Normal,female,white,22179.0,Stage II,"Renal cell carcinoma, NOS","Kidney, NOS",Unifocal,Adenomas and Adenocarcinomas,Kidney


In [9]:
def query_maker(dataframe, TPM_dataframe):
    # print the column names, except for the first two columns
    print(dataframe.columns[1:])
    # prompt for a column name
    column = input('Enter a column name: ')
    # check if value is numerical or categorical
    # if numerical, display range of values and prompt for a threshold
    if dataframe[column].dtype == np.float64 or dataframe[column].dtype == np.int64:
        # print the range of values
        print('Range of values: ', dataframe[column].min(), ' - ', dataframe[column].max())
        # prompt for a threshold
        threshold = float(input('Enter a threshold: '))
        # subset the dataframe into two dataframes based on the threshold
        dataframe1 = dataframe.loc[dataframe[column] < threshold]
        dataframe2 = dataframe.loc[dataframe[column] >= threshold]
        # print the shape of the two dataframes
        print('Dataframe 1 shape: ', dataframe1.shape)
        print('Dataframe 2 shape: ', dataframe2.shape)
        # use the read_group_id column to subset the TPM dataframe
        TPM_dataframe1 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe1['read_group_id'])]
        TPM_dataframe2 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe2['read_group_id'])]
        # return the two dataframes
        return TPM_dataframe1, TPM_dataframe2
    # if categorical, display unique values and prompt for a value
    elif dataframe[column].dtype == np.object:
        # print the unique values and their counts
        print(dataframe[column].value_counts())
        # prompt for a value
        value = input('Enter a value: ')
        # subset the dataframe into two dataframes based on the value
        dataframe1 = dataframe.loc[dataframe[column] == value]
        dataframe2 = dataframe.loc[dataframe[column] != value]
        # print the shape of the two dataframes
        print('Dataframe 1 shape: ', dataframe1.shape)
        print('Dataframe 2 shape: ', dataframe2.shape)
        # use the read_group_id column to subset the TPM dataframe
        TPM_dataframe1 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe1['read_group_id'])]
        TPM_dataframe2 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe2['read_group_id'])]
        # return the two dataframes
        return TPM_dataframe1, TPM_dataframe2

def ttester(df1, df2):
    # each column of the incoming dataframe is the mean TPM of a transcript, and each row is a group of samples,
    # and we will run a t-test on each transcript to see if the means of the two groups are significantly different for that transcript
    
    # remove the first column from the dataframes
    df1 = df1.iloc[:, 1:]
    df2 = df2.iloc[:, 1:]
    
    # create a list to store individual DataFrames
    ttest_dfs = []
    
    # iterate through the columns of both dataframes using tqdm to show a progress bar, and only use the first 10000 columns
    for column in tqdm(df1.columns[:10000]):
        # run a t-test on the column
        t_statistic, p_value = ttest_ind(df1[column], df2[column])
        
        # create a DataFrame for the current transcript
        ttest_result = pd.DataFrame({
            'transcript_id': [column],
            't_statistic': [t_statistic],
            'p_value': [p_value]
        })
        
        # append the current DataFrame to the list
        ttest_dfs.append(ttest_result)
    
    # concatenate all the DataFrames in the list along the rows (axis=0)
    ttest_df = pd.concat(ttest_dfs, ignore_index=True)
    
    # return the ttest_df
    return ttest_df

def bonferroni_correction(ttest_df, alpha=0.05):
    # Adjust the significance level (alpha) based on the number of tests
    #significance_level = alpha / len(ttest_df)
    significance_level = alpha / 10000

    # Identify the statistically significant tests after Bonferroni correction
    ttest_df['significant'] = ttest_df['p_value'] < significance_level

    # sort the dataframe by adjusted p-value
    ttest_df.sort_values(by='p_value', inplace=True)

    return ttest_df

In [10]:
# define a master function that runs all the functions
def master(dataframe, TPM_dataframe):
    # run query_maker function on the dataframe
    df1, df2 = query_maker(dataframe, TPM_dataframe)
    
    # run ttester function on the dataframe
    ttest_df = ttester(df1, df2)

    # run bonferroni_correction function on the dataframe
    ttest_df = bonferroni_correction(ttest_df)
    
    # return the ttest_df
    return ttest_df

In [11]:
# run master function on the dataframe
ttest_df = master(selected_metadata, TPM)

Index(['tissue_type', 'gender', 'race', 'age_at_diagnosis',
       'ajcc_pathologic_stage', 'primary_diagnosis',
       'tissue_or_organ_of_origin', 'tumor_focality', 'disease_type',
       'primary_site'],
      dtype='object')


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif dataframe[column].dtype == np.object:


Tumor     2458
Normal     893
Name: tissue_type, dtype: int64
Dataframe 1 shape:  (2458, 11)
Dataframe 2 shape:  (897, 11)


100%|██████████| 10000/10000 [00:04<00:00, 2271.12it/s]


In [12]:
# preview the dataframe only showing the first 10 columns and rows
ttest_df.iloc[:10, :10]
# print shape of dataframe
#print('Before removing non-significant transcripts: ', ttest_df.shape)

Unnamed: 0,transcript_id,t_statistic,p_value,significant
7916,ENST00000616738.4,-8.822822,6.090021e-15,True
9289,ENST00000706843.1,-8.779437,7.764651e-15,True
9893,ENST00000361355.8,-8.549002,2.807879e-14,True
9217,ENST00000529608.1,-8.341897,8.847464e-14,True
9221,ENST00000494134.3,-7.758233,2.147234e-12,True
8600,ENST00000436742.5,-7.605216,4.892374e-12,True
9282,ENST00000706846.1,-7.49901,8.634883e-12,True
719,ENST00000503297.1,-7.35553,1.851619e-11,True
9211,ENST00000212355.9,-7.34552,1.952421e-11,True
8870,ENST00000624216.1,-7.316634,2.274794e-11,True


In [13]:
# count the number of significant transcripts
significant_transcripts = ttest_df['significant'].sum()
# print the number of significant transcripts
print(significant_transcripts)

123


In [15]:
# save the dataframe as a tsv file
ttest_df.to_csv('../../results/transcript_ttest_df.tsv', sep='\t', index=False)