In [1]:
import pandas as pd
import numpy as np
import glob
import os
from tqdm import tqdm

from pyensembl import EnsemblRelease
from pyensembl import download_cache
from pyensembl import genome

# Remove pandas' display limits: None disables row/column truncation and lets
# the output width be detected automatically.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.width = None

In [2]:
# Point pyensembl's cache at the project data directory.
# The path is relative, so it resolves against the notebook's working directory.
os.environ.update({'PYENSEMBL_CACHE_DIR': '../../data/'})

In [3]:
# One-time setup: installs the Ensembl release 110 human annotation via the pyensembl CLI.
# Kept commented out so that re-running the notebook does not repeat the install.
#os.system('pyensembl install --release 110 --species human')

In [4]:
# Ensembl release 110 annotation handle (species left at the library default).
data = EnsemblRelease(release=110)

In [5]:
# Read the tab-separated transcript-level t-test results into a DataFrame.
transcript_ttest_df = pd.read_table('../../results/transcript_ttest_df.tsv')

In [6]:
# Collect the transcript IDs as a plain list, keeping only the part before the
# first '.' (i.e. dropping the ".<version>" suffix).
transcript_id_list = [
    versioned_id.split('.')[0]
    for versioned_id in transcript_ttest_df['transcript_id'].tolist()
]

In [14]:
# Mapping of version-less transcript ID -> gene name.
transcript_to_gene_mapping = {}

# Build the pyensembl Genome ONCE, before the loop. Constructing it per
# transcript made every lookup re-resolve the annotation paths and re-open the
# database, which is loop-invariant work.
# NOTE(review): gtf_path points at a '.gtf.db' file; confirm this is what
# Genome's gtf_path_or_url expects. `data` (the EnsemblRelease this cell already
# reads reference_name from) may be usable directly for the lookups instead.
reference_name = data.reference_name
gtf_path = '../../data/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.db'
ref = genome.Genome(
    reference_name=reference_name,
    annotation_name='ensembl',
    gtf_path_or_url=gtf_path
)

# Look up the gene name for each transcript ID; IDs for which the lookup raises
# ValueError are reported and left out of the mapping.
for transcript_id in transcript_id_list:
    try:
        gene_name = ref.gene_name_of_transcript_id(transcript_id)
        transcript_to_gene_mapping[transcript_id] = gene_name
    except ValueError:
        print(f"No results found for transcript ID: {transcript_id}")

# Print the mapping
# for transcript_id, gene_id in transcript_to_gene_mapping.items():
#     print(f"Transcript ID: {transcript_id}, Gene ID: {gene_id}")

No results found for transcript ID: ENST00000640628
No results found for transcript ID: ENST00000621530
No results found for transcript ID: ENST00000610648
No results found for transcript ID: ENST00000612017
No results found for transcript ID: ENST00000370682
No results found for transcript ID: ENST00000620882
No results found for transcript ID: ENST00000616327
No results found for transcript ID: ENST00000373440
No results found for transcript ID: ENST00000294613
No results found for transcript ID: ENST00000371956
No results found for transcript ID: ENST00000372247
No results found for transcript ID: ENST00000331941
No results found for transcript ID: ENST00000619352
No results found for transcript ID: ENST00000602605
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3460, in run_code
  File "/var/folders/j4/1x0dc4g11wg0cfys1nmslw040000gn/T/ipykernel_3674/2405353867.py", line 16, in <module>
    gene_name = ref.gene_name_of_transcript_id(transcript_id)
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pyensembl/genome.py", line 774, in gene_name_of_transcript_id
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pyensembl/genome.py", line 751, in _query_gene_name
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pyensembl/genome.py", line 293, in db
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pyensembl/genome.py", line 221, in _set_local_paths
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pyensembl/genome.py", line 184, in _get_gtf_path
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pyensembl/genome.py", line 176, in 

In [15]:
# Add a gene_name column from transcript_to_gene_mapping.
# The mapping is keyed by the transcript ID with its ".<version>" suffix removed,
# so strip the suffix the same way and do an exact dictionary lookup. This avoids
# scanning the whole mapping for every row and avoids substring matches against
# unrelated keys. Transcripts without a mapping get a missing value (NaN).
transcript_ttest_df['gene_name'] = (
    transcript_ttest_df['transcript_id']
    .str.split('.')
    .str[0]
    .map(transcript_to_gene_mapping)
)

In [9]:
# Preview the first rows of the transcript-level table.
transcript_ttest_df.head()

Unexpected exception formatting exception. Falling back to standard exception
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/IPython/core/formatters.py", line 221, in catch_format_error
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/IPython/core/formatters.py", line 706, in __call__
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/IPython/lib/pretty.py", line 410, in pretty
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/IPython/lib/pretty.py", line 778, in _repr_pprint
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pandas/core/frame.py", line 1064, in __repr__
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pandas/core/frame.py", line 1245, in to_string
  File "/Users/mayanahar/anaconda3/lib/python3.10/site-packages/pandas/io/formats/format.py", line 1133, in to_string
  File "<frozen importlib._bootstrap>", line 1027, in _find_and_load
  File "<frozen importlib._bootstrap>", line 1006, in _find_and_load_unlocked
  File "<frozen i

In [16]:
# Number of distinct entries in gene_name.
# NOTE(review): unique() reports a missing value as its own entry, so unmapped
# transcripts presumably contribute one to this count - confirm if that matters.
transcript_ttest_df['gene_name'].unique().size

1282

In [17]:
# For each gene, find the row label of the transcript with the largest p_value.
idx = transcript_ttest_df.groupby('gene_name')['p_value'].idxmax()

# Keep one row per gene (that transcript's statistics) and renumber the rows from 0.
gene_level_results = (
    transcript_ttest_df
    .loc[idx, ['gene_name', 'p_value', 't_statistic']]
    .reset_index(drop=True)
)

In [18]:
# Multiple-testing threshold: alpha of 0.05 divided by a test count of 10000.
# NOTE(review): the 10000 is hard-coded - confirm it matches the number of tests
# actually performed.
significance_level = 0.05 / 10000

# Flag genes whose p_value falls strictly below the threshold.
gene_level_results['significant'] = gene_level_results['p_value'].lt(significance_level)



In [19]:
# Order genes from smallest to largest p_value and show the top of the table.
gene_level_results = gene_level_results.sort_values('p_value', ascending=True)
gene_level_results.head()

Unnamed: 0,gene_name,p_value,t_statistic,significant
914,RN7SL653P,2.147234e-12,-7.758233,True
643,MGC27382,5.283065e-11,-7.156555,True
56,ARID3BP1,9.247475e-10,-6.601112,True
1003,S1PR1-DT,2.595311e-09,-6.395708,True
337,FAM110D,6.804482e-09,-6.200948,True
