 1. Making the .gmt file that acts as the gene-set database for running pre-ranked GSEA
    - For Saccharomyces cerevisiae, the file "http://sgd-archive.yeastgenome.org/curation/literature/gene_association.sgd.gaf.gz" is used.

    The first few lines of the file have to be removed for the code to work.
    

    - However, the genes in this file are identified by SGD IDs instead of the systematic names that our CYC dataset uses:

    example:
SGD ID      Systematic Name
---------------------------
S000350094	YDL204W-A


S000350095	YFR035W-A

S000001326	YIL064W

S000350096	YGR016C-A

S000001097	YHR055C

S000007234	YDR034W-B


 So the SGD IDs had to be converted before the file could be used.

In [1]:
import pandas as pd

def gaf_to_gmt(gaf_file, gmt_file):
    """Convert a tab-delimited GAF annotation file into a GMT gene-set file.

    Rows are grouped by the fifth column (GO term or other annotation) and
    the second column (gene ID) is collected for each group. Every group is
    written as one GMT line::

        term<TAB>Description for term<TAB>gene1<TAB>gene2...

    Compared with a naive row-by-row append, this implementation:

    * skips the leading ``!`` header lines itself, so the file does not have
      to be edited by hand first;
    * lists each gene at most once per term, even when the GAF file carries
      several annotation rows for the same gene/term pair;
    * ignores rows whose gene ID or term is missing instead of aborting.

    Args:
        gaf_file: Path to the input GAF file.
        gmt_file: Path of the GMT file to write.

    Returns:
        None. Progress and errors are reported with ``print``; any exception
        raised while converting is caught and printed rather than propagated.
    """
    try:
        # Count the leading '!' header lines so pandas can skip them; if they
        # were parsed, the first (single-field) line would fix the expected
        # column count and every real row would be discarded as "bad".
        header_lines = 0
        with open(gaf_file, 'r', encoding='utf-8', errors='replace') as handle:
            for line in handle:
                if not line.startswith('!'):
                    break
                header_lines += 1

        # Read everything as text so IDs are never reinterpreted as numbers.
        gaf_data = pd.read_csv(
            gaf_file,
            sep='\t',
            header=None,
            skiprows=header_lines,
            dtype=str,
            on_bad_lines='skip',
        )

        # Keep only the gene ID (column 1) and term (column 4), dropping rows
        # where either one is missing.
        annotations = gaf_data[[1, 4]].dropna()

        # term -> insertion-ordered "set" of gene IDs (dict keys), so that
        # first-seen order is preserved and duplicates collapse.
        gene_sets = {}
        for gene_id, term in zip(annotations[1], annotations[4]):
            gene_sets.setdefault(term, {})[gene_id] = None

        # Write the GMT file: one line per term.
        with open(gmt_file, 'w') as f:
            for term, genes in gene_sets.items():
                f.write(f"{term}\tDescription for {term}\t" + "\t".join(genes) + "\n")

        print(f"Conversion complete. {len(gene_sets)} gene sets written to {gmt_file}")

    except Exception as e:
        # Best-effort contract kept from the original: report, do not raise.
        print(f"Error processing the GAF file: {e}")

# Locations used in this notebook session; point these at your own files.
gaf_file, gmt_file = (
    '/content/gene_association.sgd.20250707.gaf',  # input GAF annotation file
    '/content/sgdGMT.gmt',  # GMT file produced by the conversion
)

# Build the GMT gene-set file from the GAF annotations.
gaf_to_gmt(gaf_file=gaf_file, gmt_file=gmt_file)



Conversion complete. 6092 gene sets written to /content/sgdGMT.gmt


The code below converts the SGD IDs to Systematic names:

The code below uses a text file, "convert_output", which holds the conversion from SGD ID to systematic name only for the genes present in the CYC dataset we used.


# New Section

In [2]:
import csv

def load_mapping(mapping_file):
    """Read a two-column, tab-delimited file into an ``{sgd_id: name}`` dict.

    Each line is stripped of surrounding whitespace and split on tabs. Only
    lines that yield exactly two fields are used; all others are ignored.
    When an SGD ID occurs more than once, the last occurrence wins.
    """
    with open(mapping_file, 'r') as handle:
        rows = (raw.strip().split('\t') for raw in handle)
        return {fields[0]: fields[1] for fields in rows if len(fields) == 2}

def convert_gmt(input_gmt, output_gmt, mapping):
    """Rewrite a GMT file, translating every gene ID through ``mapping``.

    For each input line with at least three tab-separated fields, the gene
    set name and description are copied unchanged and the remaining fields
    (gene IDs) are replaced by ``mapping[gene_id]``. Genes without a mapping
    are dropped; a gene set left with no genes is still written (name and
    description only). Lines with fewer than three fields are skipped.

    The output is written as plain tab-joined text with ``\\n`` line endings.
    No CSV quoting is applied, so descriptions containing ``"`` are copied
    verbatim and no ``\\r`` ends up after the last gene of a line. Each
    converted gene appears at most once per set, in first-seen order.

    Args:
        input_gmt: Path to the GMT file to read.
        output_gmt: Path of the GMT file to write.
        mapping: Dict from input gene ID to output gene name.
    """
    # newline='\n' forces LF endings on every platform.
    with open(input_gmt, 'r') as infile, open(output_gmt, 'w', newline='\n') as outfile:
        for line in infile:
            parts = line.strip().split('\t')
            if len(parts) < 3:
                continue
            gene_set_name, description = parts[0], parts[1]
            # Only include genes with a known mapping; dict.fromkeys removes
            # duplicates while preserving order.
            genes = list(dict.fromkeys(mapping[g] for g in parts[2:] if g in mapping))
            outfile.write('\t'.join([gene_set_name, description] + genes) + '\n')

if __name__ == "__main__":
    # Session paths: the ID-mapping table, the GMT keyed by SGD ID, and the
    # destination for the GMT keyed by systematic name.
    mapping_file, input_gmt, output_gmt = (
        "/content/convert_output.txt",
        "/content/sgdGMT.gmt",
        "/content/converted_output.gmt",
    )
    mapping = load_mapping(mapping_file)
    convert_gmt(input_gmt=input_gmt, output_gmt=output_gmt, mapping=mapping)
    print(f"Conversion complete. Output saved to: {output_gmt}")


Conversion complete. Output saved to: /content/converted_output.gmt


Many lines in the converted GMT file contain no genes, so they had to be removed.

In [3]:
def clean_gmt(input_file, output_file):
    """Copy a GMT file, dropping every gene set that lists no genes.

    Each line is stripped and split on tabs; it is kept only when more than
    two fields remain (name, description and at least one gene). Kept lines
    are rewritten as the tab-joined fields followed by a newline.
    """
    with open(input_file, 'r') as src, open(output_file, 'w') as dst:
        split_lines = (raw.strip().split('\t') for raw in src)
        dst.writelines(
            '\t'.join(fields) + '\n'
            for fields in split_lines
            if len(fields) > 2
        )

    print(f"Cleaned GMT saved to: {output_file}")

# Drop the gene sets that ended up empty after the ID conversion.
clean_gmt(
    input_file="/content/converted_output.gmt",
    output_file="/content/cleaned_gene_sets.gmt",
)

Cleaned GMT saved to: /content/cleaned_gene_sets.gmt


The script to run GSEA Prerank
 Here we used centrality measures to sort the genes and scored them linearly in order to run the GSEA analysis tool: if there are *n* genes, the highest-ranked gene is scored *n*, the second highest *n-1*, followed by *n-2*, and so on.

In [5]:
pip install gseapy

Collecting gseapy
  Downloading gseapy-1.1.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Downloading gseapy-1.1.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (597 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 597.6/597.6 kB 9.5 MB/s eta 0:00:00
Installing collected packages: gseapy
Successfully installed gseapy-1.1.9


In [6]:
import gseapy as gp

def run_gsea_preranked(rnk_file, gmt_file, output_dir, min_size=15, max_size=500, permutations=1000):
    """Run GSEA Pre-Ranked using GSEApy and return the result.

    Calls ``gp.prerank`` with PNG output, a fixed seed of 42 and verbose
    logging, then prints a completion message.

    Args:
        rnk_file: Ranked gene list, passed as ``rnk``.
        gmt_file: Gene-set file, passed as ``gene_sets``.
        output_dir: Output directory, passed as ``outdir``.
        min_size: Passed as ``min_size``.
        max_size: Passed as ``max_size``.
        permutations: Passed as ``permutation_num``.

    Returns:
        Whatever ``gp.prerank`` returns, so callers can inspect the results
        in memory instead of re-reading them from ``output_dir``. Earlier
        versions of this function discarded it and returned ``None``.
    """
    pre_res = gp.prerank(
        rnk=rnk_file,
        gene_sets=gmt_file,
        outdir=output_dir,
        format='png',
        permutation_num=permutations,
        min_size=min_size,
        max_size=max_size,
        seed=42,  # fixed so repeated runs use the same seed
        verbose=True
    )
    print(f"GSEA Pre-Ranked analysis complete. Results saved to: {output_dir}")
    return pre_res

if __name__ == "__main__":
    # Session paths; substitute your own files.
    rnk_file, gmt_file, output_dir = (
        "/content/pageRank_graph.rnk",      # ranked list: gene \t score
        "/content/cleaned_gene_sets.gmt",   # gene sets in GMT format
        "/content/pageRank(Graph)",         # directory receiving the GSEA results
    )

    run_gsea_preranked(rnk_file=rnk_file, gmt_file=gmt_file, output_dir=output_dir)


The order of those genes will be arbitrary, which may produce unexpected results.
2025-07-19 18:33:51,298 [INFO] Parsing data files for GSEA.............................
2025-07-19 18:33:51,424 [INFO] 5747 gene_sets have been filtered out when max_size=500 and min_size=15
2025-07-19 18:33:51,426 [INFO] 0180 gene_sets used for further statistical testing.....
2025-07-19 18:33:51,428 [INFO] Start to run GSEA...Might take a while..................
2025-07-19 18:34:04,950 [INFO] Congratulations. GSEApy runs successfully................



GSEA Pre-Ranked analysis complete. Results saved to: /content/pageRank(Graph)
