In [9]:
import os

# Change these to match your own paths and filenames.

# Transcript-to-gene mapping file: tab-separated, no header row. The first two
# columns are the transcript ID and the gene ID; any further columns are not
# used by this notebook.
t2g_file = "t2g.txt"

# Map each Kallisto output folder to its experimental condition.
# Adjust based on your dataset design. This dict is the single source of truth
# for the sample list, so a folder cannot be listed without a condition.
sample_conditions = {
    "SRR24163115_kallisto": "EV",
    "SRR24163116_kallisto": "EV",
    "SRR24163117_kallisto": "EV",
    "SRR24163118_kallisto": "EV",
    "SRR24163119_kallisto": "KO",
    "SRR24163120_kallisto": "KO",
    "SRR24163121_kallisto": "KO",
    "SRR24163122_kallisto": "KO",
}

# Folders to process, in the order listed above (dicts keep insertion order).
kallisto_folders = list(sample_conditions)

# Output directory for merged results (created if it does not exist yet)
output_dir = "./merged_kallisto_gene_level"
os.makedirs(output_dir, exist_ok=True)

In [10]:
import pandas as pd

# Path to your transcript-to-gene mapping file.
# Same value as in the configuration cell; repeated so this cell runs on its own.
t2g_file = "t2g.txt"

# Only the transcript ID (column 0) and gene ID (column 1) are needed for the
# gene-level aggregation, so parse just those two columns. Skipping the other
# annotation columns avoids dtype inference on data that would be discarded
# anyway and keeps the load independent of how many extra columns the file has.
# IDs are read as strings so they compare exactly with Kallisto's target_id.
t2g_df = pd.read_csv(
    t2g_file,
    sep="\t",
    header=None,
    usecols=[0, 1],
    names=["transcript_id", "gene_id"],
    dtype=str,
)

print("[INFO] Transcript-to-gene mapping (first few lines):")
print(t2g_df.head())


[INFO] Transcript-to-gene mapping (first few lines):
          transcript_id               gene_id
0  ENSMUST00000132100.2  ENSMUSG00000086053.2
1  ENSMUST00000185910.2  ENSMUSG00000100764.2
2  ENSMUST00000186289.2  ENSMUSG00000100764.2
3  ENSMUST00000188305.2  ENSMUSG00000102095.2
4  ENSMUST00000188753.2  ENSMUSG00000100635.2


  t2g_df = pd.read_csv(


In [11]:

# Gene-level est_counts tables, keyed by sample name (folder name without the
# "_kallisto" suffix). Each value is a DataFrame with columns [gene_id, <sample>].
all_samples_gene_abundance = {}

# Folders whose abundance.tsv was missing, reported once the loop finishes.
skipped_folders = []

for folder in kallisto_folders:
    abundance_path = os.path.join(folder, "abundance.tsv")

    print(f"[INFO] Processing {abundance_path}")
    if not os.path.exists(abundance_path):
        print(f"[WARNING] {abundance_path} not found. Skipping.")
        skipped_folders.append(folder)
        continue

    # Load the Kallisto transcript-level abundance
    abundance_df = pd.read_csv(abundance_path, sep="\t")

    # The inner merge below silently drops every transcript that has no entry
    # in the t2g mapping, together with its counts. Report that loss so a
    # mismatched annotation does not go unnoticed.
    is_mapped = abundance_df["target_id"].isin(t2g_df["transcript_id"])
    if not is_mapped.all():
        n_unmapped = int((~is_mapped).sum())
        lost_counts = abundance_df.loc[~is_mapped, "est_counts"].sum()
        print(
            f"[WARNING] {n_unmapped} transcripts in {abundance_path} have no gene "
            f"mapping and are dropped ({lost_counts:.1f} est_counts)."
        )

    # Merge with t2g mapping
    merged_df = pd.merge(
        abundance_df, t2g_df,
        left_on="target_id",  # Kallisto output uses 'target_id'
        right_on="transcript_id",
        how="inner"
    )

    # Sum up by gene_id 'est_counts'
    gene_abundance = merged_df.groupby("gene_id")["est_counts"].sum().reset_index()

    # Derive the sample name from the folder's last path component and strip
    # only a trailing "_kallisto", e.g. "SRR24163115_kallisto" -> "SRR24163115".
    # This keeps the name usable as a file stem even if a folder is given as a path.
    sample_name = os.path.basename(os.path.normpath(folder))
    if sample_name.endswith("_kallisto"):
        sample_name = sample_name[: -len("_kallisto")]
    gene_abundance.columns = ["gene_id", sample_name]

    # Save as an individual CSV (optional)
    out_path = os.path.join(output_dir, f"{sample_name}_gene_abundance.csv")
    gene_abundance.to_csv(out_path, index=False)

    # Store in dictionary
    all_samples_gene_abundance[sample_name] = gene_abundance

# Summarise skipped samples so an incomplete matrix is not produced unnoticed.
if skipped_folders:
    print(
        f"[WARNING] {len(skipped_folders)} of {len(kallisto_folders)} samples "
        f"were skipped: {skipped_folders}"
    )

[INFO] Processing SRR24163115_kallisto\abundance.tsv
[INFO] Processing SRR24163116_kallisto\abundance.tsv
[INFO] Processing SRR24163117_kallisto\abundance.tsv
[INFO] Processing SRR24163118_kallisto\abundance.tsv
[INFO] Processing SRR24163119_kallisto\abundance.tsv
[INFO] Processing SRR24163120_kallisto\abundance.tsv
[INFO] Processing SRR24163121_kallisto\abundance.tsv
[INFO] Processing SRR24163122_kallisto\abundance.tsv


In [None]:
# Start with a list of the per-sample gene-level tables
dfs = list(all_samples_gene_abundance.values())

# Every sample may have been skipped upstream (missing abundance.tsv); fail
# with an explicit message instead of an IndexError on an empty list.
if not dfs:
    raise RuntimeError(
        "No gene-level abundance tables were loaded; check that each Kallisto "
        "folder contains an abundance.tsv before merging."
    )

# Align all samples on gene_id in one step (outer join keeps genes seen in any
# sample). Each per-sample table has one row per gene_id, so gene_id can serve
# as the index. Sorting by gene_id is just for cleanliness.
merged_all = (
    pd.concat([df.set_index("gene_id") for df in dfs], axis=1, join="outer")
    .rename_axis("gene_id")
    .sort_index()
    .reset_index()
)

# An outer join leaves NaN where a gene is absent from a sample; flag it, since
# that means the samples do not share the same gene set.
n_missing = int(merged_all.isna().sum().sum())
if n_missing:
    print(
        f"[WARNING] {n_missing} gene/sample entries are missing (NaN); "
        "the samples do not share the same gene set."
    )

print("[INFO] Final shape of the merged gene-level abundance matrix:", merged_all.shape)
print(merged_all.head())

# Save to CSV
merged_matrix_path = os.path.join(output_dir, "all_samples_gene_abundance.csv")
merged_all.to_csv(merged_matrix_path, index=False)
print(f"[INFO] Merged matrix saved to {merged_matrix_path}")

# This can now be loaded in for differential expression analysis