In [1]:
import pandas as pd
import numpy as np
import glob
import gzip
import shutil
import os
from tqdm import tqdm
import plotly.express as px
from scipy.stats import ttest_ind

from Bio import SeqIO
from Bio import Entrez

from pyensembl import EnsemblRelease
from pyensembl import download_cache
from pyensembl import genome

# set infinite display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [2]:
# import the TPM dataframe
TPM = pd.read_csv('../../results/TPM.tsv', sep='\t', index_col=0)
# preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Unnamed: 0_level_0,16e72993-470f-4ac2-91fe-562c61615a59,0a3c7dd6-cc30-416d-91f7-d91b22bbbff4,a3a21562-3933-4e92-8ea4-70be74dc19fe,baefbbf5-b891-4dd7-8be3-f6f28f0b24f7,c1d7f3a1-350b-4e57-a02d-4313e4beabe4,1fe5c9cf-bf7a-4e11-a2de-7954b8909f35,a4632995-6ef4-46a6-90ae-ec73cb0ed176,859c8bc1-5a41-4fa2-abde-83988cb8a3fe,0c15c3c2-e396-471e-a292-2f220ed628b2,96f027e2-d58c-471a-ad7e-0ef205323e7b
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENST00000456328.2,0.068287,0.0,0.0,0.0,0.0,0.0,0.023287,0.011446,0.0,0.0
ENST00000450305.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENST00000488147.1,1.660365,2.053805,3.124912,1.658161,0.948201,1.510597,2.632654,3.017217,6.450806,1.573682
ENST00000619216.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENST00000473358.1,0.0,0.0,0.040398,0.0,0.0,0.0,0.0,0.0,0.0,0.037871


In [3]:
# transpose the dataframe
TPM = TPM.T

In [4]:
# preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()
#rename TPM index to read_group_id
TPM.index.names = ['read_group_id']
# turn the index into a column
TPM.reset_index(inplace=True)
#preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Name,read_group_id,ENST00000456328.2,ENST00000450305.2,ENST00000488147.1,ENST00000619216.1,ENST00000473358.1,ENST00000469289.1,ENST00000607096.1,ENST00000417324.1,ENST00000461467.1
0,16e72993-470f-4ac2-91fe-562c61615a59,0.068287,0.0,1.660365,0.0,0.0,0.0,0.0,0.0,0.0
1,0a3c7dd6-cc30-416d-91f7-d91b22bbbff4,0.0,0.0,2.053805,0.0,0.0,0.0,0.0,0.0,0.0
2,a3a21562-3933-4e92-8ea4-70be74dc19fe,0.0,0.0,3.124912,0.0,0.040398,0.0,0.0,0.0,0.0
3,baefbbf5-b891-4dd7-8be3-f6f28f0b24f7,0.0,0.0,1.658161,0.0,0.0,0.0,0.0,0.0,0.0
4,c1d7f3a1-350b-4e57-a02d-4313e4beabe4,0.0,0.0,0.948201,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Name,read_group_id,ENST00000456328.2,ENST00000450305.2,ENST00000488147.1,ENST00000619216.1,ENST00000473358.1,ENST00000469289.1,ENST00000607096.1,ENST00000417324.1,ENST00000461467.1
0,16e72993-470f-4ac2-91fe-562c61615a59,0.068287,0.0,1.660365,0.0,0.0,0.0,0.0,0.0,0.0
1,0a3c7dd6-cc30-416d-91f7-d91b22bbbff4,0.0,0.0,2.053805,0.0,0.0,0.0,0.0,0.0,0.0
2,a3a21562-3933-4e92-8ea4-70be74dc19fe,0.0,0.0,3.124912,0.0,0.040398,0.0,0.0,0.0,0.0
3,baefbbf5-b891-4dd7-8be3-f6f28f0b24f7,0.0,0.0,1.658161,0.0,0.0,0.0,0.0,0.0,0.0
4,c1d7f3a1-350b-4e57-a02d-4313e4beabe4,0.0,0.0,0.948201,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
#load metadata dataframe
metadata = pd.read_csv('../../results/metadata.tsv', sep='\t')
#preview the dataframe only showing the first 10 columns
metadata.iloc[:, :10].head()

Unnamed: 0,case_id,aliquot_id,read_group_id,has_blood_cancer,tissue_type,instrument_model,RIN,includes_spike_ins,library_preparation_kit_name,library_preparation_kit_vendor
0,5705efcc-b48f-435c-8a28-9e0d407ecadd,75ac0619-947a-427b-a53f-71e121a7ec8f,71894d8b-5210-44dc-aadc-a199d3843dd2,False,Tumor,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina
1,5705efcc-b48f-435c-8a28-9e0d407ecadd,948c4d53-3d91-48a6-bec4-0cc96020e572,86774648-bb57-42c3-b835-9fb11b590d8b,False,Tumor,,,,,
2,763e0702-8379-4b5e-95d1-a84f412c51e7,ce810e2e-4929-4bbc-95ff-6da493477391,c2980255-7c57-4b79-82a7-f77098ff164e,False,Tumor,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina
3,763e0702-8379-4b5e-95d1-a84f412c51e7,33c921ea-b743-4d32-9c56-875de6028c71,8062c6e4-d501-4c91-ab02-f36f4e7fd387,False,Tumor,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina
4,763e0702-8379-4b5e-95d1-a84f412c51e7,173c0d6a-bc67-4a72-b6d3-b2a411e24785,39c8b5e7-ac68-4009-ab82-e1ee495bdbd9,False,Normal,Illumina HiSeq 4000,,True,TruSeq Stranded Total RNA Library Prep Kit wit...,Illumina


In [7]:
# Convert 'read_group_id' to the same data type in both DataFrames
metadata['read_group_id'] = metadata['read_group_id'].astype(str)

# Check if 'read_group_id' is present in TPM dataframe
if 'read_group_id' in TPM.columns:
    TPM['read_group_id'] = TPM['read_group_id'].astype(str)
    # Merge TPM and metadata DataFrames on read_group_id
    TPM = pd.merge(TPM, metadata[['case_id', 'read_group_id']], on='read_group_id', how='left')
else:
    print("read_group_id column not found in TPM dataframe")


In [17]:
#move case_ID to 1st column 
TPM = TPM[['case_id'] + [col for col in TPM.columns if col != 'case_id']]
#remove duplicate case_id's from TPM dataframe
TPM.drop_duplicates(subset='case_id', keep='first', inplace=True)
#sort by case_id
TPM.sort_values(by=['case_id'], inplace=True)
#remove read_group_id column
TPM.drop(columns=['read_group_id'], inplace=True)
#preview first 10 columns of TPM dataframe
TPM.iloc[:, :10].head()


Unnamed: 0,case_id,ENST00000456328.2,ENST00000450305.2,ENST00000488147.1,ENST00000619216.1,ENST00000473358.1,ENST00000469289.1,ENST00000607096.1,ENST00000417324.1,ENST00000461467.1
157,020db2d3-bb73-46c7-89ea-4648e0d3f2cb,0.0,0.0,3.51429,0.0,0.0,0.0,0.0,0.0,0.0
311,0215c1e2-70aa-495b-a1f1-25bd989a9f12,0.0,0.0,2.866582,0.0,0.0,0.0,0.0,0.0,0.0
579,02208cc6-6221-4e84-bf66-2d32fb49a358,0.0,0.0,1.398049,0.0,0.0,0.0,0.0,0.0,0.0
576,022c3490-811c-4f82-ad9b-8004a3df7e5c,0.0,0.0,1.903676,0.0,0.0,0.0,0.0,0.0,0.0
584,026644c2-a548-4f0d-95bf-716c567f055c,0.0,0.0,3.724858,0.0,0.123148,0.053909,0.0,0.0,0.0


In [12]:
# take tissue_type, gender, race, age_at_diagnosis/365, ajcc_pathologic_stage, primary_diagnosis, morphology, tissue_or_organ_of_origin, tumor_focality, disease_type, primary_site from the metadata and merge it with the TPM_pca_df into a new dataframe called TPM_pre_cluster
columns = ['case_id', 'tissue_type', 'gender', 'race', 'age_at_diagnosis', 'ajcc_pathologic_stage', 'primary_diagnosis', 'tissue_or_organ_of_origin', 'tumor_focality', 'disease_type', 'primary_site']
# Create a new dataframe with selected columns from metadata
selected_metadata = metadata[columns]
#remove duplicate case_id's from selected metadata dataframe
selected_metadata.drop_duplicates(subset='case_id', keep='first', inplace=True)
# preview the dataframe
selected_metadata.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_metadata.drop_duplicates(subset='case_id', keep='first', inplace=True)


Unnamed: 0,case_id,tissue_type,gender,race,age_at_diagnosis,ajcc_pathologic_stage,primary_diagnosis,tissue_or_organ_of_origin,tumor_focality,disease_type,primary_site
0,5705efcc-b48f-435c-8a28-9e0d407ecadd,Tumor,female,white,22142.0,Stage I,"Endometrioid adenocarcinoma, NOS",Corpus uteri,Unifocal,Adenomas and Adenocarcinomas,"Uterus, NOS"
2,763e0702-8379-4b5e-95d1-a84f412c51e7,Tumor,female,white,22179.0,Stage II,"Renal cell carcinoma, NOS","Kidney, NOS",Unifocal,Adenomas and Adenocarcinomas,Kidney
5,8710ce04-6b7f-4e37-adc0-0df1f2798b30,Tumor,male,asian,20772.0,Stage IB,"Adenocarcinoma, NOS","Lung, NOS",Unifocal,Adenomas and Adenocarcinomas,Bronchus and lung
7,c1898677-7a92-48cf-a09f-71d91c1cf8dc,Tumor,female,white,21979.0,Stage IIB,"Infiltrating duct carcinoma, NOS",Head of pancreas,Unifocal,Ductal and Lobular Neoplasms,Pancreas
8,e1d68cfb-04e7-43cb-b8c5-b523cf917636,Tumor,male,white,23545.0,Stage IVA,"Squamous cell carcinoma, NOS","Larynx, NOS",Unifocal,Squamous Cell Neoplasms,Other and ill-defined sites


In [19]:
def query_maker(dataframe, TPM_dataframe):
    # print the column names, except for the first two columns
    print(dataframe.columns[0:])
    # prompt for a column name
    column = input('Enter a column name: ')
    # check if value is numerical or categorical
    # if numerical, display range of values and prompt for a threshold
    if dataframe[column].dtype == np.float64 or dataframe[column].dtype == np.int64:
        # print the range of values
        print('Range of values: ', dataframe[column].min(), ' - ', dataframe[column].max())
        # prompt for a threshold
        threshold = float(input('Enter a threshold: '))
        # subset the dataframe into two dataframes based on the threshold
        dataframe1 = dataframe.loc[dataframe[column] < threshold]
        dataframe2 = dataframe.loc[dataframe[column] >= threshold]
        # print the shape of the two dataframes
        print('Dataframe 1 shape: ', dataframe1.shape)
        print('Dataframe 2 shape: ', dataframe2.shape)
        # use the case_id column to subset the TPM dataframe
        TPM_dataframe1 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe1['case_id'])]
        TPM_dataframe2 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe2['case_id'])]
        # return the two dataframes
        return TPM_dataframe1, TPM_dataframe2
    # if categorical, display unique values and prompt for a value
    elif dataframe[column].dtype == np.object:
        print(dataframe[column].value_counts())
        #make an empty list to store selected values until user says stop
        selected_values = []
        #make a variable to store user input
        user_input = ''
        #make a while loop that will continue until user says stop
        while user_input != 'stop':
            #prompt user for input
            user_input = input('Enter a value or type stop: ')
            #add user input to selected values list
            selected_values.append(user_input)
        #remove stop from selected values list
        selected_values.remove('stop')
        #subset the dataframe into two dataframes based on the selected values
        dataframe1 = dataframe.loc[dataframe[column].isin(selected_values)]
        dataframe2 = dataframe.loc[~dataframe[column].isin(selected_values)]
        # print the shape of the two dataframes
        print('Dataframe 1 shape: ', dataframe1.shape)
        print('Dataframe 2 shape: ', dataframe2.shape)
        # use the case_id column to subset the TPM dataframe
        TPM_dataframe1 = TPM_dataframe[TPM_dataframe['case_id'].isin(dataframe1['case_id'])]
        TPM_dataframe2 = TPM_dataframe[TPM_dataframe['case_id'].isin(dataframe2['case_id'])]
        # return the two dataframes
        return TPM_dataframe1, TPM_dataframe2
    
        # # prompt for value
        # value = input('Enter a value: ')
        # # subset the dataframe into two dataframes based on the value
        # dataframe1 = dataframe.loc[dataframe[column] == value]
        # dataframe2 = dataframe.loc[dataframe[column] != value]
        # # print the shape of the two dataframes
        # print('Dataframe 1 shape: ', dataframe1.shape)
        # print('Dataframe 2 shape: ', dataframe2.shape)
        # # use the case_id column to subset the TPM dataframe
        # TPM_dataframe1 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe1['case_id'])]
        # TPM_dataframe2 = TPM_dataframe[TPM_dataframe['index'].isin(dataframe2['case_id'])]
        # # return the two dataframes
        # return TPM_dataframe1, TPM_dataframe2

def ttester(df1, df2):
    # each column of the incoming dataframe is the mean TPM of a transcript, and each row is a group of samples,
    # and we will run a t-test on each transcript to see if the means of the two groups are significantly different for that transcript
    
    # remove the first two columns from the dataframes
    df1 = df1.iloc[:, 2:]
    df2 = df2.iloc[:, 2:]
    
    # create a list to store individual DataFrames
    ttest_dfs = []
    
    # iterate through the columns of both dataframes using tqdm to show a progress bar, and only use the first 10000 columns
    for column in tqdm(df1.columns[:10000]):
        # run a t-test on the column
        t_statistic, p_value = ttest_ind(df1[column], df2[column])
        
        # create a DataFrame for the current transcript
        ttest_result = pd.DataFrame({
            'transcript_id': [column],
            't_statistic': [t_statistic],
            'p_value': [p_value]
        })
        
        # append the current DataFrame to the list
        ttest_dfs.append(ttest_result)
    
    # concatenate all the DataFrames in the list along the rows (axis=0)
    ttest_df = pd.concat(ttest_dfs, ignore_index=True)
    
    # return the ttest_df
    return ttest_df

def bonferroni_correction(ttest_df, alpha=0.05):
    # Adjust the significance level (alpha) based on the number of tests
    #significance_level = alpha / len(ttest_df)
    significance_level = alpha / 10000

    # Identify the statistically significant tests after Bonferroni correction
    ttest_df['significant'] = ttest_df['p_value'] < significance_level

    # sort the dataframe by adjusted p-value
    ttest_df.sort_values(by='p_value', inplace=True)

    return ttest_df

In [20]:
# define a master function that runs all the functions
def master(dataframe, TPM_dataframe):
    # run query_maker function on the dataframe
    df1, df2 = query_maker(dataframe, TPM_dataframe)
    
    # run ttester function on the dataframe
    ttest_df = ttester(df1, df2)

    # run bonferroni_correction function on the dataframe
    ttest_df = bonferroni_correction(ttest_df)
    
    # return the ttest_df
    return ttest_df

In [22]:
# run master function on the dataframe
ttest_df = master(selected_metadata, TPM)

Index(['case_id', 'tissue_type', 'gender', 'race', 'age_at_diagnosis',
       'ajcc_pathologic_stage', 'primary_diagnosis',
       'tissue_or_organ_of_origin', 'tumor_focality', 'disease_type',
       'primary_site'],
      dtype='object')


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  elif dataframe[column].dtype == np.object:


white                               754
asian                               219
other                               203
black or african american            20
not reported                         17
Unknown                               5
american indian or alaska native      1
Name: race, dtype: int64
Dataframe 1 shape:  (973, 11)
Dataframe 2 shape:  (1100, 11)


100%|██████████| 10000/10000 [00:27<00:00, 362.09it/s]


In [23]:
# preview the dataframe only showing the first 10 columns and rows
ttest_df.iloc[:10, :10]
# print shape of dataframe
#print('Before removing non-significant transcripts: ', ttest_df.shape)

Unnamed: 0,transcript_id,t_statistic,p_value,significant
8098,ENST00000371158.6,10.98848,3.082349e-26,True
8113,ENST00000444290.1,10.928725,5.467178999999999e-26,True
5497,ENST00000678287.1,-10.465457,4.302183e-24,True
9153,ENST00000646892.1,10.410178,7.176051e-24,True
2607,ENST00000494503.1,-9.99952,3.012823e-22,True
8095,ENST00000635214.1,9.933391,5.44176e-22,True
516,ENST00000338660.5,9.851587,1.126102e-21,True
8109,ENST00000465798.2,9.731532,3.247067e-21,True
8099,ENST00000642238.2,9.347301,8.998603e-20,True
4282,ENST00000486941.1,-9.2617,1.85964e-19,True


In [24]:
# count the number of significant transcripts
significant_transcripts = ttest_df['significant'].sum()
# print the number of significant transcripts
print(significant_transcripts)

566


In [25]:
# save the dataframe as a tsv file
ttest_df.to_csv('../../results/transcript_ttest_df.tsv', sep='\t', index=False)

In [29]:
os.environ['PYENSEMBL_CACHE_DIR'] = '../../data/'
#os.system('pyensembl install --release 110 --species human')
# Load the Ensembl release
data = EnsemblRelease(110)
# import the transcript_ttest_df.tsv file
transcript_ttest_df = pd.read_csv('../../results/transcript_ttest_df.tsv', sep='\t')
# create a list called transcript_id_list out of the transcript_ttest_df['transcript_id'] column
transcript_id_list = transcript_ttest_df['transcript_id'].tolist()
# remove the . and the number after the . from the transcript_id_list
transcript_id_list = [i.split('.')[0] for i in transcript_id_list]
# Create a dictionary to store the mapping of transcript IDs to gene IDs
transcript_to_gene_mapping = {}

#call relevant classes from pyensembl object
reference_name = data.reference_name
gtf_path = '../../data/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.db'

# Retrieve gene IDs from transcript IDs
for transcript_id in transcript_id_list:
    ref = genome.Genome(
        reference_name=reference_name,
        annotation_name='ensembl',
        gtf_path_or_url=gtf_path
    )
    try:
        gene_name = ref.gene_name_of_transcript_id(transcript_id)
        transcript_to_gene_mapping[transcript_id] = gene_name
    except ValueError:
        print(f"No results found for transcript ID: {transcript_id}")

No results found for transcript ID: ENST00000602605
No results found for transcript ID: ENST00000616327
No results found for transcript ID: ENST00000372247
No results found for transcript ID: ENST00000331941
No results found for transcript ID: ENST00000425828
No results found for transcript ID: ENST00000640628
No results found for transcript ID: ENST00000619352
No results found for transcript ID: ENST00000621530
No results found for transcript ID: ENST00000373440
No results found for transcript ID: ENST00000371956
No results found for transcript ID: ENST00000361632
No results found for transcript ID: ENST00000458109
No results found for transcript ID: ENST00000612017
No results found for transcript ID: ENST00000294613


OSError: [Errno 23] Too many open files in system

In [None]:
# Print the mapping
# for transcript_id, gene_id in transcript_to_gene_mapping.items():
#     print(f"Transcript ID: {transcript_id}, Gene ID: {gene_id}")
# use transcript_to_gene_mapping to add a new column to the dataframe and add the gene name
transcript_ttest_df['gene_name'] = transcript_ttest_df['transcript_id'].apply(lambda x: next((v for k, v in transcript_to_gene_mapping.items() if k in x), None))
#transcript_ttest_df.head()
len(transcript_ttest_df['gene_name'].unique())
# Group by gene_name and find the index of the maximum p_value in each group
idx = transcript_ttest_df.groupby('gene_name')['p_value'].idxmax()

# Use the index to extract the corresponding rows from the original DataFrame
gene_level_results = transcript_ttest_df.loc[idx, ['gene_name', 'p_value', 't_statistic']]

# Resetting the index if needed
gene_level_results.reset_index(drop=True, inplace=True)
# Adjust the significance level (alpha) based on the number of tests
significance_level = 0.05 / 10000

# Create a new column called 'significant' which is True if p_value is less than the significance level
gene_level_results['significant'] = gene_level_results['p_value'] < significance_level

gene_level_results = gene_level_results.sort_values(by='p_value')
gene_level_results.head()

In [27]:
#preview first 10 columns of gene_level_results dataframe
gene_level_results.iloc[:, :10].head()

NameError: name 'gene_level_results' is not defined