In [2]:
import pandas as pd
from mygene import MyGeneInfo

# Step 1: Load the raw counts
raw_counts = pd.read_csv("../data/raw_counts_no_metastases.csv")
print(f"Original shape: {raw_counts.shape}")

# Step 2: Extract Ensembl IDs (assuming first column is 'name')
ensembl_ids = raw_counts["name"].tolist()

# Step 3: Query MyGene for biotypes
mg = MyGeneInfo()
results = mg.querymany(
    ensembl_ids,
    scopes="ensembl.gene",
    fields="type_of_gene",
    species="human",
    as_dataframe=True
)

# Step 4: Prepare biotype DataFrame
biotypes = results[["type_of_gene"]].reset_index().rename(columns={"query": "name", "type_of_gene": "biotype"})

# Step 5: Merge biotype info with raw counts
merged = pd.merge(raw_counts, biotypes, on="name", how="left")

# Step 6: Filter for protein-coding genes
protein_coding = merged[merged["biotype"] == "protein-coding"].drop(columns=["biotype"])

# Step 7: Save result
protein_coding.to_csv("../data/raw_counts_protein_coding.csv", index=False)

print(f"Filtered shape (protein-coding only): {protein_coding.shape}")


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


Original shape: (58735, 85)


31 input query terms found dup hits:	[('ENSG00000188660', 2), ('ENSG00000215156', 2), ('ENSG00000226506', 2), ('ENSG00000226519', 2), ('E
1383 input query terms found no hit:	['ENSG00000112096', 'ENSG00000116883', 'ENSG00000130489', 'ENSG00000130723', 'ENSG00000131484', 'ENS


Filtered shape (protein-coding only): (19407, 85)
