Gene Lengths from MyGene

In [12]:
import pandas as pd
from mygene import MyGeneInfo

# Step 1: Load filtered protein-coding raw counts
df = pd.read_csv("../data/raw_counts_protein_coding.csv")
print(f"Loaded matrix shape: {df.shape}")

# Step 2: Aggregate duplicate gene IDs by summing across rows
if df["name"].duplicated().any():
    print(f"Found {df['name'].duplicated().sum()} duplicated Ensembl IDs. Aggregating by summing counts.")
    df = df.groupby("name", as_index=False).sum()

# Step 3: Query MyGene for gene lengths
mg = MyGeneInfo()
ensembl_ids = df["name"].tolist()

results = mg.querymany(
    ensembl_ids,
    scopes="ensembl.gene",
    fields="genomic_pos",
    species="human",
    as_dataframe=False  # Important: returns dicts with nested structure
)

# Step 4: Extract lengths manually
def extract_length(hit):
    try:
        pos = hit.get("genomic_pos")
        if isinstance(pos, list):  # multiple mappings
            pos = pos[0]
        return abs(pos["end"] - pos["start"]) + 1
    except Exception as e:
        return None

# Step 5: Build and clean gene length DataFrame
lengths = pd.DataFrame({
    "name": [hit["query"] for hit in results],
    "length": [extract_length(hit) for hit in results]
}).dropna()

# Step 6: Merge with counts matrix
df_with_lengths = pd.merge(df, lengths, on="name", how="inner")
print(f"Matrix after merging gene lengths: {df_with_lengths.shape}")


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


Loaded matrix shape: (19407, 114)
Found 4 duplicated Ensembl IDs. Aggregating by summing counts.


5 input query terms found dup hits:	[('ENSG00000188660', 2), ('ENSG00000215156', 2), ('ENSG00000243620', 2), ('ENSG00000268674', 3), ('E


Matrix after merging gene lengths: (19409, 115)


TPM Calculation Function

In [13]:
def counts_to_tpm(df, length_col="length"):
    # Drop 'name' and 'length' to get pure count matrix
    counts = df.drop(columns=["name", length_col])
    lengths_kb = df[length_col] / 1000  # Convert to kilobases
    rpk = counts.div(lengths_kb, axis=0)
    per_million = rpk.sum(axis=0) / 1e6
    tpm = rpk.div(per_million, axis=1)
    tpm.insert(0, "name", df["name"])
    return tpm

Apply and Save

In [16]:

import numpy as np


tpm_matrix = counts_to_tpm(df_with_lengths)

# Separate gene column and expression matrix
gene_column = tpm_matrix["name"]
tpm_values = tpm_matrix.drop(columns=["name"])

# Apply log2(TPM + 1)
log2_tpm = np.log2(tpm_values + 1)

# Reattach gene column
log2_tpm.insert(0, "name", gene_column)

# Save to CSV
log2_tpm.to_csv("../data/log2_tpm_matrix_samples.csv", index=False)
print(f"log2 TPM matrix saved. Shape: {log2_tpm.shape}")


log2 TPM matrix saved. Shape: (19409, 114)
