In [None]:

import pandas as pd
def parse_sgd_gaf(gaf_file="sgd.gaf"):
    """Parse a downloaded SGD GAF annotation file into a GO mapping table.

    Parameters
    ----------
    gaf_file : str, path-like or file-like, default "sgd.gaf"
        Tab-separated GAF file; anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        One row per annotation with the columns ``gene_id``, ``gene_name``,
        ``GO_ID``, ``GO_category``, ``evidence`` and ``description``.
        Rows without a gene symbol or a GO ID are dropped.
    """
    import csv  # local import: only the QUOTE_NONE constant is needed

    # Expected column names for GAF
    columns = [
        'DB', 'DB_Object_ID', 'DB_Object_Symbol', 'Qualifier', 'GO_ID',
        'DB_Reference', 'Evidence_Code', 'With_From', 'Aspect', 'DB_Object_Name',
        'DB_Object_Synonym', 'DB_Object_Type', 'Taxon', 'Date', 'Assigned_By',
        'Annotation_Extension', 'Gene_Product_Form_ID'
    ]

    # Read every line, including "!" header lines. pandas' `comment=` option
    # would also cut a *data* line at the first "!" it contains, silently
    # truncating the record, so comment lines are removed after parsing.
    df = pd.read_csv(
        gaf_file,
        sep="\t",
        header=None,             # file has no header row
        names=columns,           # assign column names; short lines are padded with NaN
        dtype=str,               # keep all columns as strings
        quoting=csv.QUOTE_NONE,  # treat quote characters as ordinary text
        keep_default_na=False,   # do not turn symbols such as "NA" into NaN
        na_values=[""],          # only an empty field counts as missing
    )

    # Comment/header lines are the ones whose first field starts with "!"
    is_comment = df["DB"].str.startswith("!", na=False)
    df = df.loc[~is_comment].reset_index(drop=True)

    # Build GO mapping
    go_mapping = pd.DataFrame({
        "gene_id":   df["DB_Object_ID"],
        "gene_name": df["DB_Object_Symbol"],
        "GO_ID":     df["GO_ID"],
        # Aspect codes other than P/F/C map to NaN
        "GO_category": df["Aspect"].map({
            "P": "biological_process",
            "F": "molecular_function",
            "C": "cellular_component"
        }),
        "evidence": df["Evidence_Code"],
        "description": df["DB_Object_Name"]
    })

    # Drop incomplete rows
    go_mapping = go_mapping.dropna(subset=["gene_name", "GO_ID"])

    return go_mapping


def get_go_summary(go_mapping):
    """Write headline statistics for a GO mapping table to stdout.

    Reports the number of annotation rows, the number of distinct gene
    names and GO IDs, the annotation count per GO category, and the ten
    gene names that occur most often. Returns ``None``.
    """
    rule = "=" * 50

    # Banner
    print("\nGO Annotation Summary")
    print(rule)

    # Overall counts
    n_rows = len(go_mapping)
    print(f"Total GO annotations: {n_rows:,}")

    gene_names = go_mapping["gene_name"]
    n_genes = gene_names.nunique()
    print(f"Unique genes: {n_genes:,}")

    n_terms = go_mapping["GO_ID"].nunique()
    print(f"Unique GO terms: {n_terms:,}")

    # Breakdown per GO category
    print("\nAnnotations by category:")
    per_category = go_mapping["GO_category"].value_counts()
    print(per_category)

    # Genes with the most annotation rows
    print("\nTop 10 most annotated genes:")
    busiest = gene_names.value_counts().head(10)
    print(busiest)


if __name__ == "__main__":
    # Parse the annotation file and report its headline statistics.
    go_mapping = parse_sgd_gaf(gaf_file="sgd.gaf")
    get_go_summary(go_mapping)

    # Persist the mapping as a tab-separated table without the index column.
    go_mapping.to_csv("go_mapping.txt", index=False, sep="\t")
    print("\nSaved output to go_mapping.txt")

    # Show the first ten rows as plain text.
    print("\nPreview:")
    first_rows = go_mapping.iloc[:10]
    print(first_rows.to_string())

In [None]:
import pandas as pd

# GO term of interest: gluconeogenesis
GLUCONEOGENESIS_GO_ID = 'GO:0006094'

# Load the GO mapping table (tab-separated text)
go_mapping = pd.read_csv('go_mapping.txt', sep='\t')

# Keep only the annotations to the gluconeogenesis term
gluconeogenesis = go_mapping[go_mapping['GO_ID'] == GLUCONEOGENESIS_GO_ID]

# Collapse to one row per gene. A gene annotated to the term with several
# evidence codes would otherwise appear once per code, which inflates the
# "Unique genes" count and duplicates rows in any later join on gene_name.
# The distinct evidence codes are kept as a sorted, comma-separated list.
# Note: groupby skips rows whose gene_name or gene_id is missing.
unique_genes = (
    gluconeogenesis
    .groupby(['gene_name', 'gene_id'])['evidence']
    .agg(lambda codes: ','.join(sorted(set(codes.dropna()))))
    .reset_index()
    .sort_values('gene_name')
)

# Output
print(f"GLUCONEOGENESIS GENES ({GLUCONEOGENESIS_GO_ID})")
print(f"Unique genes: {unique_genes['gene_name'].nunique()}")
print(unique_genes.to_string(index=False))

# Save the gene list as tab-separated text in the current working directory
unique_genes.to_csv('gluconeogenesis_genes.txt', sep='\t', index=False)

In [None]:
# `os` must be imported before its first use so the cell runs on a fresh kernel
import os

import pandas as pd

os.chdir("/scratch/grp/msc_appbio/Group18_ABCC/cuffdiff_results_UQ")
print(os.getcwd())

# Load data
gene_exp = pd.read_csv('gene_exp.diff', sep='\t')

os.chdir("/scratch/grp/msc_appbio/Group18_ABCC")
gluconeogenesis_genes = pd.read_csv('gluconeogenesis_genes.txt', sep='\t')

# Filter for significant DEGs (q-value below 0.05)
degs = gene_exp[gene_exp['q_value'] < 0.05].copy()
print(f"Significant DEGs: {len(degs)}")

# Merge DEGs with gluconeogenesis genes.
# The gene list can hold several rows for one gene (e.g. one per evidence
# code); joining on it directly would repeat every matching DEG row, so the
# lookup is reduced to one row per gene_name first.
gene_lookup = gluconeogenesis_genes.drop_duplicates(subset='gene_name')
degs_gluconeogenesis = pd.merge(degs, gene_lookup, left_on='gene', right_on='gene_name', how='inner')
print(f"Gluconeogenesis DEGs: {degs_gluconeogenesis['gene_name'].nunique()} unique genes")

# Filter for diauxic shift comparison
diauxic = degs_gluconeogenesis[
    (degs_gluconeogenesis['sample_1'] == 'Exponential') &
    (degs_gluconeogenesis['sample_2'] == 'Diauxic')
].copy()

# Calculate the linear fold change from the log2 value
diauxic['fold_change'] = 2 ** diauxic['log2(fold_change)']
diauxic = diauxic.sort_values('fold_change', ascending=False)

# Display results
print("\nGLUCONEOGENESIS DEGS AT DIAUXIC SHIFT")
print(diauxic[['gene_name', 'fold_change', 'log2(fold_change)', 'q_value']].to_string(index=False))

# Save to the current working directory (the project directory set above)
diauxic.to_csv('gluconeogenesis_degs_diauxic.csv', index=False)

In [None]:
# All imports come first: the original called os.chdir before importing os
# and used `mcolors` without importing it at all.
import os

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load data
os.chdir("/scratch/grp/msc_appbio/Group18_ABCC/cuffdiff_results_UQ")
print(os.getcwd())
fpkm_data = pd.read_csv('genes.fpkm_tracking', sep='\t')
os.chdir("/scratch/grp/msc_appbio/Group18_ABCC")
gluconeogenesis_genes = pd.read_csv('gluconeogenesis_genes.txt', sep='\t')

# Filter and extract
gluconeogenesis_fpkm = fpkm_data[fpkm_data['gene_short_name'].isin(gluconeogenesis_genes['gene_name'])]
heatmap_data = gluconeogenesis_fpkm[['gene_short_name', 'Lag_FPKM', 'Exponential_FPKM', 'Diauxic_FPKM']]
heatmap_data = heatmap_data.set_index('gene_short_name')
heatmap_data.columns = ['6h', '14h', '26h']

# Log2 transform (pseudocount of 1 keeps zero FPKM finite)
log_data = np.log2(heatmap_data + 1)

# A gene with identical values at every time point has a standard deviation
# of 0, so its z-score would be 0/0 = NaN, which the hierarchical clustering
# in clustermap cannot handle. Such rows (and rows whose standard deviation
# is undefined) are removed and reported.
row_std = log_data.std(axis=1)
has_variation = (row_std > 0).to_numpy()
n_flat = int((~has_variation).sum())
if n_flat:
    print(f"Skipping {n_flat} gene(s) with no expression variation across time points")
log_data = log_data[has_variation]

# Row-wise Z-score
z_scores = log_data.apply(lambda x: (x - x.mean()) / x.std(), axis=1)

colors = ['#e5f5e0', '#31a354', '#00441b']
custom_cmap = mcolors.LinearSegmentedColormap.from_list('custom_green', colors)

# Clustered heatmap: rows are clustered, columns stay in time order
g = sns.clustermap(z_scores,
                   cmap=custom_cmap,
                   row_cluster=True,
                   col_cluster=False,
                   linewidths=0.5,
                   linecolor='gray',
                   cbar_kws={'label': 'Row Z-Score'},
                   vmin=-2, vmax=2,
                   figsize=(5, 10))

g.ax_heatmap.set_xlabel('Time Point', fontweight='bold', fontsize=11)
g.ax_heatmap.set_ylabel('Gene', fontweight='bold', fontsize=11)
g.fig.suptitle('Gluconeogenesis Gene Expression', fontweight='bold', y=1.02)

# Saved to the current working directory (the project directory set above)
plt.savefig('gluconeogenesis_expression_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()