# MGMT Expression Analysis in TCGA-GBM
This notebook explores the expression levels of the MGMT gene in the TCGA Glioblastoma (GBM) cohort, with a focus on its relationship to radiation therapy status.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mannwhitneyu

In [None]:
# Load RNA-seq and clinical data
rna_file = 'Human__TCGA_GBM__UNC__RNAseq__HiSeq_RNA__01_28_2016__BI__Gene__Firehose_RSEM_log2.cct'
clinical_file = 'Human__TCGA_GBM__MS__Clinical__Clinical__01_28_2016__BI__Clinical__Firehose.tsi'


def _patient_id(sample_id):
    """Reduce an RNA-seq column label to a patient-level identifier.

    Hyphen-delimited TCGA barcodes (``TCGA-<site>-<participant>[-<sample>...]``)
    are cut back to their first three fields, dropping any sample/aliquot
    suffix. Taking only the first field would turn every such barcode into
    the bare string ``"TCGA"``, so no row could match a patient barcode.
    Any other label is reduced to the text before its first hyphen
    (unchanged when the label contains no hyphen).
    """
    fields = sample_id.split('-')
    if len(fields) >= 3 and fields[0].upper() == 'TCGA':
        return '-'.join(fields[:3])
    return fields[0]


# Both files are read as tab-separated tables; the first column of the
# expression file becomes the row index (it must contain an 'MGMT' row,
# since 'MGMT' is selected as a column after the transpose below).
rna_df = pd.read_csv(rna_file, sep='\t', index_col=0)
clinical_df = pd.read_csv(clinical_file, sep='\t')

# Normalise the sample labels, then transpose so they become the index
# that the merge joins on.
# NOTE(review): if several samples of one patient collapse to the same ID,
# the merge yields one row per sample for that patient -- confirm whether
# de-duplication is wanted.
rna_df.columns = [_patient_id(col) for col in rna_df.columns]
rna_df = rna_df.T

merged_df = clinical_df.copy()
merged_df['bcr_patient_barcode'] = merged_df['bcr_patient_barcode'].str.strip()
# Default inner join: only patients present in both tables are kept.
# NOTE(review): the join is an exact string match -- verify that both files
# use the same delimiter and letter case for barcodes.
merged_df = merged_df.merge(rna_df[['MGMT']], left_on='bcr_patient_barcode', right_index=True)
merged_df = merged_df.rename(columns={'MGMT': 'MGMT_expression'})

In [None]:
# Scatter plot of MGMT expression vs Age
# One 8x6 inch figure with a single axes; each point is one row of merged_df.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(
    merged_df['age_at_initial_pathologic_diagnosis'],
    merged_df['MGMT_expression'],
)
ax.set_xlabel('Age at Diagnosis')
ax.set_ylabel('MGMT Expression (log2 RSEM)')
ax.set_title('MGMT Expression vs. Age in TCGA-GBM')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Compare MGMT expression across radiation therapy groups
comp_df = merged_df.copy()

# Normalise the status labels (trim whitespace, lower-case). astype(str)
# turns missing values into the string 'nan', which the isin filter below
# discards along with every other label that is not 'yes' or 'no'.
comp_df['radiation_therapy'] = comp_df['radiation_therapy'].astype(str).str.strip().str.lower()

# .copy() makes the filtered frame independent, so the column assignment
# below does not write into a slice (avoids pandas' SettingWithCopyWarning).
comp_df = comp_df[comp_df['radiation_therapy'].isin(['yes', 'no'])].copy()

# Non-numeric expression values become NaN and those rows are dropped.
comp_df['MGMT_expression'] = pd.to_numeric(comp_df['MGMT_expression'], errors='coerce')
comp_df = comp_df.dropna(subset=['MGMT_expression'])

plt.figure(figsize=(8, 6))
sns.boxplot(data=comp_df, x='radiation_therapy', y='MGMT_expression')
plt.xlabel('Radiation Therapy')
plt.ylabel('MGMT Expression (log2 RSEM)')
plt.title('MGMT Expression by Radiation Therapy Status in TCGA-GBM')
plt.grid(True, axis='y')  # horizontal guide lines only
plt.tight_layout()
plt.show()

In [None]:
# Statistical test: Mann–Whitney U test
# Split the expression values by radiation therapy label, then run a
# two-sided test between the two groups.
received_rt = comp_df['radiation_therapy'].eq('yes')
no_rt = comp_df['radiation_therapy'].eq('no')
mgmt_yes = comp_df.loc[received_rt, 'MGMT_expression']
mgmt_no = comp_df.loc[no_rt, 'MGMT_expression']

stat, p_value = mannwhitneyu(
    mgmt_yes,
    mgmt_no,
    alternative='two-sided',
)
print(f"Mann–Whitney U statistic: {stat:.2f}, p-value: {p_value:.4f}")