In [1]:
import pandas as pd
import numpy as np
import os
import pickle

from pyensembl import EnsemblRelease
from pyensembl import genome

# set infinite display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## This notebook takes the transcript IDs in our TPM dataframe and uses pyensembl's gene_name_of_transcript_id function to create a dictionary that maps transcript_ids to gene_names. This dictionary is then processed to create a dictionary that maps gene_names to transcript_ids. There is a lot of information loss as many transcript IDs don't seem to match to a gene_name. There might be a better way to do this.

In [2]:
os.environ['PYENSEMBL_CACHE_DIR'] = '../../data/'
# check if pyensembl is installed, if not, run 'pyensembl install --release 110 --species human'
try:
    data = EnsemblRelease(110)
except:
    print('pyensembl not installed, installing now...')
    os.system('pyensembl install --release 110 --species human')
    data = EnsemblRelease(110)

In [3]:
# import the TPM.tsv file
TPM = pd.read_csv('../../results/TPM.tsv', sep='\t')
#preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Unnamed: 0,Name,16e72993-470f-4ac2-91fe-562c61615a59,0a3c7dd6-cc30-416d-91f7-d91b22bbbff4,a3a21562-3933-4e92-8ea4-70be74dc19fe,baefbbf5-b891-4dd7-8be3-f6f28f0b24f7,c1d7f3a1-350b-4e57-a02d-4313e4beabe4,1fe5c9cf-bf7a-4e11-a2de-7954b8909f35,a4632995-6ef4-46a6-90ae-ec73cb0ed176,859c8bc1-5a41-4fa2-abde-83988cb8a3fe,0c15c3c2-e396-471e-a292-2f220ed628b2
0,ENST00000456328.2,0.068287,0.0,0.0,0.0,0.0,0.0,0.023287,0.011446,0.0
1,ENST00000450305.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENST00000488147.1,1.660365,2.053805,3.124912,1.658161,0.948201,1.510597,2.632654,3.017217,6.450806
3,ENST00000619216.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENST00000473358.1,0.0,0.0,0.040398,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# create a list called transcript_id_list out of the transcript_ttest_df['transcript_id'] column
transcript_id_list = TPM['Name'].tolist()
# unload TPM dataframe from memory
del(TPM)

In [5]:
# Remove the . and the number after the . from the transcript_id_list
transcript_id_list = [i.split('.')[0] for i in transcript_id_list]

# Create a dictionary to store the mapping of transcript IDs to gene IDs
transcript_to_gene_mapping = {}

# call relevant classes from the pyensembl object
reference_name = data.reference_name
gtf_path = '../../data/pyensembl/GRCh38/ensembl110/Homo_sapiens.GRCh38.110.gtf.db'

# Create the Genome object outside the loop
ref = genome.Genome(
    reference_name=reference_name,
    annotation_name='ensembl',
    gtf_path_or_url=gtf_path
)

# Retrieve gene IDs from transcript IDs
for transcript_id in transcript_id_list:
    try:
        gene_name = ref.gene_name_of_transcript_id(transcript_id)
        transcript_to_gene_mapping[transcript_id] = gene_name
    except ValueError:
        print(f"No results found for transcript ID: {transcript_id}")


No results found for transcript ID: ENST00000425828
No results found for transcript ID: ENST00000616327
No results found for transcript ID: ENST00000619352
No results found for transcript ID: ENST00000373440
No results found for transcript ID: ENST00000361632
No results found for transcript ID: ENST00000331941
No results found for transcript ID: ENST00000458109
No results found for transcript ID: ENST00000373023
No results found for transcript ID: ENST00000372247
No results found for transcript ID: ENST00000371956
No results found for transcript ID: ENST00000640628
No results found for transcript ID: ENST00000612017
No results found for transcript ID: ENST00000294613
No results found for transcript ID: ENST00000602605
No results found for transcript ID: ENST00000621530
No results found for transcript ID: ENST00000370682
No results found for transcript ID: ENST00000620882
No results found for transcript ID: ENST00000610648
No results found for transcript ID: ENST00000393277
No results f

In [7]:
# Create a gene to transcript map
gene_to_transcript_mapping = {}

# Iterate through the transcript to gene map
for transcript, gene in transcript_to_gene_mapping.items():
    # Skip empty genes
    if gene:
        # If the gene is not in the gene to transcript map, create an empty list
        if gene not in gene_to_transcript_mapping:
            gene_to_transcript_mapping[gene] = []
        # Append the transcript to the gene in the gene to transcript map
        gene_to_transcript_mapping[gene].append(transcript)

In [None]:
# save these dictionaries as .pkl files
with open('../../results/transcript_to_gene_mapping.pkl', 'wb') as file:
    pickle.dump(transcript_to_gene_mapping, file)
with open('../../results/gene_to_transcript_mapping.pkl', 'wb') as file:
    pickle.dump(gene_to_transcript_mapping, file)