<a href="https://colab.research.google.com/github/malcolmfisher103/Bioinformatic-Scripts/blob/main/Parsing_RNA_SEQ_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Xenbase GEO RNA-Seq matrix mapper

This Python script pulls GEO data from the [Xenbase](https://www.xenbase.org/xenbase/) model organism database and maps the existing row and column names into more human-readable formats. The script also averages the TPM values across replicates for the same sample/conditions.

The original GEO/SRA data has been processed through Xenbase's [RNA-Seq pipeline](https://gitlab.com/Xenbase/bioinformatics/RNAseq-Pipeline/-/tree/main?ref_type=heads). [Fortriede et al. (2020)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7145613/) describes the general process, but the pipeline has since been updated.

This script takes a GSE ID and a Xenopus genome build (currently only the v10 builds for *X. tropicalis* and *X. laevis*) and uses them to create a TPM matrix with gene symbols rather than gene model IDs and sample names rather than SRR IDs. Not every gene model will be mapped, because some gene models do not yet have Xenbase XB-GENE-IDs associated with them. Some GSE/build combinations will not work; you can check the Xenbase [GEO download site](https://bigfrog.xenbase.org/pub/xenbase/genomics/GEO/) to see which builds are available for a specific GSE.

In [None]:
# @title Set GSE
# GEO series accession; interpolated into the download URLs used by the loaders
# and into the name of the output TSV file.
GSE_ID = "GSE41338" # @param {type:"string"}
# Genome build directory to read from; interpolated into the download URLs.
# The form restricts the choice to the builds listed in the @param directive.
Genome_build = "XENTR_10.0" # @param ["XENLA_10.1", "XENTR_10.0"]

In [None]:
import argparse
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np  # Add this line
from matplotlib.ticker import LogLocator
from scipy.interpolate import interp1d

In [None]:
def load_tpm_matrix():
    """Download the gene-level TPM matrix for the configured GSE and build.

    The URL is built from the module-level ``GSE_ID`` and ``Genome_build``
    settings.

    Returns:
        pandas.DataFrame: the tab-separated file, parsed with its first
        column used as the index.
    """
    matrix_url = f'https://bigfrog.xenbase.org/xenbase/genomics/GEO/{GSE_ID}/{Genome_build}/RNA-Seq/ExpressionFiles/Genes_TPM_Matrix.txt'
    tpm_frame = pd.read_csv(matrix_url, sep='\t', index_col=0)
    return tpm_frame

def load_gene_info():
    """Download the Xenbase gene-model report.

    Returns:
        pandas.DataFrame: the tab-separated report, parsed with its first
        column used as the index.
    """
    report_url = 'https://xenbase-bio1.ucalgary.ca/cgi-bin/reports/models_gene_info.cgi'
    gene_frame = pd.read_csv(report_url, sep='\t', index_col=0)
    return gene_frame

def load_gsm_sample_mapping():
    """Download the GSM-to-track table for the configured GSE and build.

    The URL is built from the module-level ``GSE_ID`` and ``Genome_build``
    settings.

    Returns:
        pandas.DataFrame: the tab-separated file with a default integer index.
    """
    track_url = f'https://bigfrog.xenbase.org/xenbase/genomics/GEO/{GSE_ID}/{Genome_build}/RNA-Seq/gsm_to_track.txt'
    track_frame = pd.read_csv(track_url, sep='\t')
    return track_frame

def load_gsm_srr_mapping():
    """Download the Xenbase GEO/SRR metadata report.

    Returns:
        pandas.DataFrame: the tab-separated report with a default integer
        index.
    """
    metadata_url = 'https://download.xenbase.org/xenbase/GenePageReports/geo_srr_metadata_chd.txt'
    srr_frame = pd.read_csv(metadata_url, sep='\t')
    return srr_frame

def load_gse_species_mapping():
    """Download the Xenbase GEO metadata report.

    Returns:
        pandas.DataFrame: the tab-separated report with a default integer
        index.
    """
    # NOTE(review): the file name is spelled "geo_metadat_chd.txt"; confirm
    # against the download site that this is not a typo for "geo_metadata".
    report_url = 'https://download.xenbase.org/xenbase/GenePageReports/geo_metadat_chd.txt'
    species_frame = pd.read_csv(report_url, sep='\t')
    return species_frame

def create_gsm_to_srr_mapping(gsm_srr_mapping):
    """Return just the ``SRR`` and ``GSM`` columns of the table, in that order.

    Args:
        gsm_srr_mapping: DataFrame containing at least ``SRR`` and ``GSM``
            columns.

    Returns:
        pandas.DataFrame: a two-column frame (``SRR``, ``GSM``).
    """
    wanted_columns = ['SRR', 'GSM']
    return gsm_srr_mapping[wanted_columns]

In [None]:
def substitute_gene_symbols(tpm_matrix, gene_info):
    """Replace gene model IDs in the ``Gene`` column with gene symbols.

    IDs with no usable entry in ``gene_info`` are left as they are. The
    ``Gene`` column of ``tpm_matrix`` is overwritten in place and the same
    DataFrame object is returned.

    Args:
        tpm_matrix: DataFrame with a ``Gene`` column holding gene model IDs.
        gene_info: DataFrame with ``GENE_SYMBOL`` and ``MODEL_NAME`` columns.

    Returns:
        pandas.DataFrame: ``tpm_matrix`` with its ``Gene`` column updated.
    """
    # Only rows carrying both a symbol and a model name are usable; when a
    # model name occurs more than once, the first row wins.
    usable = gene_info.dropna(subset=['GENE_SYMBOL', 'MODEL_NAME']).drop_duplicates(subset=['MODEL_NAME'])

    # Lookup table: model name -> gene symbol.
    model_to_symbol = dict(zip(usable['MODEL_NAME'], usable['GENE_SYMBOL']))

    # Translate where possible, falling back to the original ID otherwise.
    original_ids = tpm_matrix['Gene']
    translated = original_ids.map(model_to_symbol)
    tpm_matrix['Gene'] = translated.fillna(original_ids)

    return tpm_matrix

In [None]:
def process_data(tpm_matrix, gene_info, gsm_sample_mapping, gsm_srr_mapping):
    """Relabel a TPM matrix by gene symbol and track name, averaging replicates.

    Steps:
      1. Gene model IDs in the ``Gene`` column are swapped for gene symbols
         via ``substitute_gene_symbols``.
      2. SRR run columns are renamed to the track name of the GSM sample they
         belong to; columns with no known track keep their original name.
      3. Values sharing the same (gene, column name) pair are averaged, which
         collapses replicate runs into one column per track.

    Args:
        tpm_matrix: DataFrame with a ``Gene`` column and one column per SRR.
        gene_info: DataFrame passed through to ``substitute_gene_symbols``.
        gsm_sample_mapping: DataFrame with ``Track Name`` and ``GSMs``
            columns; ``GSMs`` may hold several comma-separated GSM IDs.
        gsm_srr_mapping: DataFrame with ``GSM`` and ``SRR`` columns.

    Returns:
        pandas.DataFrame: one row per gene, with a ``Gene`` column followed
        by one averaged column per track (or unmapped SRR).

    Raises:
        pandas.errors.MergeError: if the same GSM ID is listed more than once
            in ``gsm_sample_mapping`` (enforced by ``validate="1:m"``).
    """
    # Gene model IDs -> gene symbols.
    tpm_matrix = substitute_gene_symbols(tpm_matrix, gene_info)

    # One row per (track, GSM). assign() builds a fresh frame, so nothing is
    # written through a slice of the caller's gsm_sample_mapping and no
    # chained-assignment warning is triggered.
    gsm_mapping = (
        gsm_sample_mapping[['Track Name', 'GSMs']]
        .assign(GSMs=gsm_sample_mapping['GSMs'].str.split(','))
        .explode('GSMs', ignore_index=True)
    )

    # Tolerate "GSM1, GSM2" style lists, and drop blank/missing entries so
    # they can neither match anything nor trip the uniqueness check below.
    gsm_mapping['GSMs'] = gsm_mapping['GSMs'].str.strip()
    gsm_mapping = gsm_mapping[gsm_mapping['GSMs'].notna() & (gsm_mapping['GSMs'] != '')]

    # Attach the SRR runs of every GSM; each GSM must belong to a single track.
    track_srr_mapping = pd.merge(
        left=gsm_mapping,
        right=gsm_srr_mapping,
        left_on='GSMs',
        right_on='GSM',
        validate="1:m",
    )
    track_srr_mapping = track_srr_mapping.dropna(subset=['Track Name', 'SRR'])

    # SRR -> track name. rename() ignores keys that are not column names, so
    # only SRRs actually present in the matrix are affected. A new frame is
    # returned rather than renaming the caller's columns in place.
    srr_to_track = dict(zip(track_srr_mapping['SRR'], track_srr_mapping['Track Name']))
    tpm_matrix = tpm_matrix.rename(columns=srr_to_track)

    # Replicates now share a column name. Go to long form so that they can be
    # averaged per (gene, column name), then pivot back to a wide matrix.
    long_form = pd.melt(tpm_matrix, id_vars=['Gene'], var_name='Column')
    tpm_matrix_grouped = long_form.groupby(['Gene', 'Column'])['value'].mean().unstack()

    # Turn the Gene index back into an ordinary column.
    tpm_matrix_grouped = tpm_matrix_grouped.reset_index()

    return tpm_matrix_grouped

In [None]:
def main():
    """Download the inputs, build the relabelled TPM matrix, and save it.

    The result is written to ``<GSE_ID>_TPM.tsv`` in the current working
    directory as a tab-separated file.
    """
    # Fetch inputs. The matrix and the gene report are read with their first
    # column as the index, so it is moved back into a regular column here.
    tpm_matrix = load_tpm_matrix().reset_index()
    gsm_srr_mapping = load_gsm_srr_mapping()
    gene_info = load_gene_info().reset_index()
    gsm_sample_mapping = load_gsm_sample_mapping()

    # Map gene/sample names and average replicates.
    new_tpm_matrix = process_data(tpm_matrix, gene_info, gsm_sample_mapping, gsm_srr_mapping)

    # Write the result next to the notebook/script.
    output_path = f'{GSE_ID}_TPM.tsv'
    new_tpm_matrix.to_csv(output_path, sep='\t')

# Run the pipeline when this code is executed directly (as a script, or as a
# notebook cell, where __name__ is also "__main__") rather than imported.
if __name__ == "__main__":
    main()