In [26]:
import pandas as pd
import numpy as np
import glob
import os
from tqdm import tqdm

from pyensembl import EnsemblRelease
from pyensembl import download_cache
from pyensembl import genome

# Lift pandas' display limits (rows, columns, width) so frames render untruncated.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

In [13]:
# Tell pyensembl to keep its downloaded annotation files under the project's
# data directory (the install log shows files landing in ../../data/pyensembl/).
os.environ.update({'PYENSEMBL_CACHE_DIR': '../../data/'})

In [15]:
# Download and install the Ensembl release 110 human annotation through the
# pyensembl command-line tool. os.system only *returns* the exit status, so a
# failed install would otherwise go unnoticed and surface later as confusing
# lookup errors; fail loudly here instead.
install_exit_status = os.system('pyensembl install --release 110 --species human')
if install_exit_status != 0:
    raise RuntimeError(
        f"'pyensembl install' failed with exit status {install_exit_status}"
    )
# Keep the status as the cell's displayed value (0 on success).
install_exit_status

2023-11-11 14:28:40,931 - pyensembl.shell - INFO - Running 'install' for EnsemblRelease(release=110, species='homo_sapiens')
2023-11-11 14:28:40,931 - pyensembl.download_cache - INFO - Fetching ../../data/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.gz from URL https://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz
2023-11-11 14:28:40,931 - datacache.download - INFO - Downloading https://ftp.ensembl.org/pub/release-110/gtf/homo_sapiens/Homo_sapiens.GRCh38.110.gtf.gz to ../../data/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.gz
2023-11-11 14:31:01,766 - pyensembl.download_cache - INFO - Fetching ../../data/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.cdna.all.fa.gz from URL https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
2023-11-11 14:31:01,770 - datacache.download - INFO - Downloading https://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa

0

In [18]:
# Handle on the Ensembl release 110 annotation (the release installed above).
data = EnsemblRelease(release=110)

In [20]:
# Read the tab-separated transcript-level t-test results into a DataFrame.
transcript_ttest_path = '../../results/transcript_ttest_df.tsv'
transcript_ttest_df = pd.read_table(transcript_ttest_path)

In [40]:
# Collect the transcript IDs without their version suffix: everything from the
# first '.' onwards is dropped (e.g. 'ENST00000616738.4' -> 'ENST00000616738').
transcript_id_list = [
    versioned_id.partition('.')[0]
    for versioned_id in transcript_ttest_df['transcript_id'].tolist()
]

In [41]:
# Map each version-less transcript ID to the name of the gene it belongs to.
transcript_to_gene_mapping = {}

# Inputs for the pyensembl Genome object: the reference name reported by the
# EnsemblRelease loaded earlier, and the annotation file inside the local
# pyensembl cache directory.
# NOTE(review): the path points at the '.gtf.db' file, presumably the database
# pyensembl built from the downloaded GTF during install -- confirm.
reference_name = data.reference_name
gtf_path = '../../data/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.db'

# Build the Genome once. None of its arguments depend on the transcript being
# looked up, so constructing it inside the loop only repeated identical setup
# work for every ID.
ref = genome.Genome(
    reference_name=reference_name,
    annotation_name='ensembl',
    gtf_path_or_url=gtf_path
)

# Retrieve gene names from transcript IDs
for transcript_id in transcript_id_list:
    try:
        gene_name = ref.gene_name_of_transcript_id(transcript_id)
        transcript_to_gene_mapping[transcript_id] = gene_name
    except ValueError:
        # The lookup raised ValueError for this ID: report it and leave it
        # out of the mapping rather than aborting the whole loop.
        print(f"No results found for transcript ID: {transcript_id}")

No results found for transcript ID: ENST00000640628
No results found for transcript ID: ENST00000621530
No results found for transcript ID: ENST00000610648
No results found for transcript ID: ENST00000612017
No results found for transcript ID: ENST00000370682
No results found for transcript ID: ENST00000620882
No results found for transcript ID: ENST00000616327
No results found for transcript ID: ENST00000373440
No results found for transcript ID: ENST00000294613
No results found for transcript ID: ENST00000371956
No results found for transcript ID: ENST00000372247
No results found for transcript ID: ENST00000331941
No results found for transcript ID: ENST00000619352
No results found for transcript ID: ENST00000602605
No results found for transcript ID: ENST00000458109
No results found for transcript ID: ENST00000425828
No results found for transcript ID: ENST00000361632
No results found for transcript ID: ENST00000373023


In [45]:
# Add a 'gene_name' column by looking each transcript up in
# transcript_to_gene_mapping. The mapping is keyed by the version-less ID, so
# strip the '.N' suffix and do an exact dictionary lookup. This replaces a scan
# of every mapping entry per row with a substring test, which was quadratic and
# could match an unrelated key that merely occurs inside the ID.
# Transcripts without a mapping entry get None.
transcript_ttest_df['gene_name'] = transcript_ttest_df['transcript_id'].apply(
    lambda versioned_id: transcript_to_gene_mapping.get(versioned_id.split('.')[0])
)

In [59]:
# Preview the first five rows, now including the gene_name column.
transcript_ttest_df.head(n=5)

Unnamed: 0,transcript_id,t_statistic,p_value,significant,gene_name
0,ENST00000616738.4,-8.822822,6.090021e-15,True,LEPR
1,ENST00000706843.1,-8.779437,7.764651e-15,True,EVI5
2,ENST00000361355.8,-8.549002,2.807879e-14,True,AMY2B
3,ENST00000529608.1,-8.341897,8.847464e-14,True,TGFBR3
4,ENST00000494134.3,-7.758233,2.147234e-12,True,RN7SL653P


In [57]:
# Number of distinct gene_name values; a missing value (unmapped transcript)
# is counted as one distinct entry, as it is by .unique().
transcript_ttest_df['gene_name'].nunique(dropna=False)

1538

In [68]:
# For every gene, locate the row label of the transcript with the largest p_value.
# NOTE(review): idxmax keeps the *least* significant transcript per gene --
# confirm this is intended rather than idxmin.
idx = transcript_ttest_df.groupby('gene_name')['p_value'].idxmax()

# Pull those rows (gene name, p-value and t-statistic only) out of the
# transcript-level frame and renumber them from 0.
gene_level_columns = ['gene_name', 'p_value', 't_statistic']
gene_level_results = (
    transcript_ttest_df
    .loc[idx, gene_level_columns]
    .reset_index(drop=True)
)

In [69]:
# Multiple-testing threshold: a base alpha of 0.05 divided by a test count.
# NOTE(review): the count of 10000 is hard-coded, not derived from the data in
# this notebook -- confirm it matches the number of tests actually performed.
base_alpha = 0.05
assumed_test_count = 10000
significance_level = base_alpha / assumed_test_count

# Flag genes whose p_value falls strictly below the adjusted threshold.
gene_level_results['significant'] = gene_level_results['p_value'].lt(significance_level)



In [70]:
# Order genes from smallest to largest p_value and show the top five.
gene_level_results = gene_level_results.sort_values('p_value', ascending=True)
gene_level_results.head(n=5)

Unnamed: 0,gene_name,p_value,t_statistic,significant
1097,RN7SL653P,2.147234e-12,-7.758233,True
765,MGC27382,5.283065e-11,-7.156555,True
66,ARID3BP1,9.247475e-10,-6.601112,True
391,FAM110D,6.804482e-09,-6.200948,True
1341,SYDE2,3.295455e-08,-5.87532,True
