In [11]:
#pip install biopython requests goatools
#pip install -e git+https://github.com/biothings/mygene.py#egg=mygene

In [3]:
import mygene
import pandas as pd

# Client for the MyGene.info annotation service
mg = mygene.MyGeneInfo()

# Load the gene metadata spreadsheet
gene_metadata_path = 'gene_metadata_neuroblastoma.xlsx'
gene_metadata_df = pd.read_excel(gene_metadata_path)

# Unique gene symbols to look up. Blank cells are read as NaN by pandas;
# drop them so that only real symbols are sent as query terms.
gene_symbols = gene_metadata_df['Gene Symbol'].dropna().unique().tolist()

# Ask MyGene.info for the GO annotation of every symbol (human only).
# returnall=True makes the result a dict; the hits live under 'out'.
# Symbols reported as "no hit" simply end up without GO terms after the merge.
gene_info = mg.querymany(gene_symbols, scopes='symbol', fields='go', species='human', returnall=True)

# A symbol can come back with several hits; keep only the first one per query
gene_hits = gene_info['out']
unique_hits = {}
for hit in gene_hits:
    unique_hits.setdefault(hit['query'], hit)

# One row per queried symbol
gene_info_df = pd.DataFrame(list(unique_hits.values()))

# The 'go' column exists only if at least one hit carried a 'go' field;
# create it empty otherwise so that later steps can always read it.
if 'go' not in gene_info_df.columns:
    gene_info_df['go'] = None

# Flatten the GO term information if it's nested
def flatten_go_terms(go_info):
    """Collapse a MyGene.info ``go`` annotation into one ``;``-separated string.

    Parameters
    ----------
    go_info : dict or any
        The ``go`` field of a hit: a mapping from a GO category key to either
        a list of term records or a single term record. Each term record is a
        dict with at least the keys ``'id'`` and ``'term'``.

    Returns
    -------
    str or None
        ``"<id>:<term>"`` entries joined with ``;`` in the order encountered
        (an empty string when the mapping holds no term records), or ``None``
        when ``go_info`` is not a dict (e.g. NaN for hits without annotation).
    """
    if not isinstance(go_info, dict):
        return None

    go_terms = []
    for go_type in go_info.values():
        if isinstance(go_type, dict):
            # MyGene.info returns a bare record instead of a one-element list
            # when a category has exactly one annotation; keep that term too.
            go_terms.append(go_type)
        elif isinstance(go_type, list):
            go_terms.extend(go_type)

    return ";".join(f"{term['id']}:{term['term']}" for term in go_terms if isinstance(term, dict))

# Attach the flattened GO string to every hit
# (None where the hit has no dict-shaped 'go' value)
flattened_go = gene_info_df['go'].apply(flatten_go_terms)
gene_info_df['go_terms'] = flattened_go

# Left-join the annotation onto the original metadata so that every input row
# is kept; the queried symbol ('query') is the join key on the annotation side
go_lookup = gene_info_df[['query', 'go_terms']]
merged_df = pd.merge(
    gene_metadata_df,
    go_lookup,
    how='left',
    left_on='Gene Symbol',
    right_on='query',
)

# Write the annotated metadata to CSV, without the row index
output_path = 'gene_metadata_with_go_terms.csv'
merged_df.to_csv(output_path, index=False)

# Last expression of the cell: show where the file was written
output_path


28 input query terms found dup hits:	[('BTN2A3P', 2), ('FCGR2C', 2), ('GLRA4', 2), ('IGHA1', 2), ('IGHG1', 2), ('IGHM', 2), ('MMD2', 2), 
233 input query terms found no hit:	['ACPP', 'ACPT', 'ADSS', 'AGPAT6', 'AGPAT9', 'ALPPL2', 'AMICA1', 'AQPEP', 'ARSE', 'ATP5G1', 'ATP5G2'


'gene_metadata_with_go_terms.csv'

In [7]:
import pandas as pd

# Input locations (the README path is only recorded here; this cell does not read it)
metadata_file = 'gene_metadata_with_go_terms.csv'  # Update with the correct path
readme_file = 'gene_metadata_neuroblastoma_README.xlsx'  # Update with the correct path

# Read the annotated gene metadata produced by the previous step
gene_metadata = pd.read_csv(metadata_file)


def _split_unique_terms(joined_terms):
    """Split a ';'-joined GO string into a list without repeated terms.

    Non-string values (e.g. NaN for genes without annotation) are returned
    unchanged. The first occurrence of each term keeps its position.
    """
    if not isinstance(joined_terms, str):
        return joined_terms
    return list(dict.fromkeys(joined_terms.split(';')))


# --- Which genes belong to each GO category --------------------------------

# Only the symbol and its GO terms are needed here
go_data = gene_metadata[['Gene Symbol', 'go_terms']].copy()

# Split GO terms into individual terms
go_data['go_terms'] = go_data['go_terms'].map(_split_unique_terms)

# Explode the DataFrame so each row contains a single GO term
exploded_go_data = go_data.explode('go_terms')

# A symbol may occur on several metadata rows; keep each (symbol, term) pair once
unique_go_data = exploded_go_data.drop_duplicates()

# Group by GO term and aggregate unique gene symbols into a comma-separated string
go_categories = unique_go_data.groupby('go_terms')['Gene Symbol'].apply(lambda x: ', '.join(sorted(set(x)))).reset_index()

# Rename columns
go_categories.columns = ['GO Categories', 'Gene Symbol']

# Remove rows where the "GO Categories" field is empty
go_categories_cleaned = go_categories[go_categories['GO Categories'].str.strip() != '']

# --- Per-category, per-group statistics ------------------------------------

# Extract necessary columns from the gene metadata
gene_metadata_reduced = gene_metadata[['Gene Symbol', 'Group', 'logFC', 'PValue', 'FDR', 'go_terms']].copy()

# Split GO terms into individual terms. Repeated terms within one row are
# collapsed first: otherwise a metadata row whose GO string lists the same
# term several times would be counted several times in that category's mean.
gene_metadata_reduced['go_terms'] = gene_metadata_reduced['go_terms'].map(_split_unique_terms)

# Explode the DataFrame so each row contains a single GO term
exploded_metadata = gene_metadata_reduced.explode('go_terms')

# Merge with cleaned GO categories data to get the relevant genes for each GO category
merged_data = pd.merge(go_categories_cleaned, exploded_metadata, left_on='GO Categories', right_on='go_terms', how='left')

# Group by 'GO Categories' and 'Group', then calculate the mean of 'logFC', 'PValue', and 'FDR'
grouped_data = merged_data.groupby(['GO Categories', 'Group']).agg({
    'logFC': 'mean',
    'PValue': 'mean',
    'FDR': 'mean'
}).reset_index()

# Pivot the table: one row per GO category, one column per (statistic, group)
pivot_data = grouped_data.pivot(index='GO Categories', columns='Group', values=['logFC', 'PValue', 'FDR'])

# Flatten the MultiIndex columns into "<group>_<statistic>_mean"
pivot_data.columns = [f'{group}_{stat}_mean' for stat, group in pivot_data.columns]

# Merge with the original GO categories data to put the gene list next to the means
final_data = pd.merge(go_categories_cleaned, pivot_data, on='GO Categories', how='left')

# Save the final result to a new CSV file
final_output_file = 'final_go_categories_with_gene_means.csv'  # Update with the correct path
final_data.to_csv(final_output_file, index=False)

print(f"Final file saved to: {final_output_file}")


Final file saved to: final_go_categories_with_gene_means.csv
