In [1]:
import csv

import pandas as pd

# Build a two-column mapping file (SGD ID -> systematic name) from the SGD
# feature table.

# Input: the SGD feature table (tab-separated, no header row).
sgd_features_file = '/content/SGD_features.tab'

# Output: one "sgd_id<TAB>systematic_name" pair per line.
output_mapping_file = '/content/convert_output.txt'

# Names for the 16 positional columns of the feature table (per the original
# author's note, taken from the SGD_features.tab documentation). Naming them
# avoids positional magic numbers such as 0 and 3 further down.
column_names = [
    'sgd_id', 'feature_type', 'feature_qualifier', 'systematic_name',
    'standard_name', 'aliases', 'parent_feature', 'secondary_sgd_id',
    'chromosome', 'start_coord', 'stop_coord', 'strand', 'genetic_position',
    'coord_version', 'seq_version', 'description'
]

# The file has no header row, so the names are supplied explicitly.
# quoting=csv.QUOTE_NONE: this is a plain tab-delimited file, so a double
# quote inside a field (e.g. in the free-text description) is ordinary text.
# With the default quoting a field that starts with '"' would be parsed as a
# quoted CSV field and could swallow the following tabs/newlines.
df = pd.read_csv(
    sgd_features_file,
    sep='\t',
    header=None,
    names=column_names,
    quoting=csv.QUOTE_NONE,
)

# Keep only the two mapping columns and drop every row in which either of
# them is missing, so each output line is a complete pair.
mapping_df = df[['sgd_id', 'systematic_name']].dropna()

# Write the pairs tab-separated, without a header row and without the index.
mapping_df.to_csv(
    output_mapping_file,
    sep='\t',
    header=False,
    index=False,
)

print(f"Mapping file created successfully at: {output_mapping_file}")

Mapping file created successfully at: /content/convert_output.txt


In [2]:
import pandas as pd

# 1. Input and output file paths.
input_pagerank_file = "/content/CYC.cmty.pagerank (1).txt"
output_rnk_file = "/content/pageRank_graph.rnk"  # The final file for GSEA

# 2. Read and parse the file.
#    Data lines look like "GENE = SCORE %"; anything else is skipped.
gene_scores = []
skipped_line_numbers = []  # lines that contain '=' but could not be parsed
with open(input_pagerank_file, 'r') as f:
    for line_number, line in enumerate(f, start=1):
        # Split on the FIRST '=' only, so nothing after a second '=' is lost.
        gene_part, separator, score_part = line.partition('=')
        if not separator:
            # No '=' at all: header line or other non-data line.
            continue

        gene_name = gene_part.strip()
        # Remove the '%' marker and surrounding whitespace from the score.
        score_str = score_part.replace('%', '').strip()

        try:
            score_val = float(score_str)
        except ValueError:
            # A line such as "key = text" is not a score line; skip it
            # instead of aborting the whole cell, but remember it so the
            # skip is reported below.
            skipped_line_numbers.append(line_number)
            continue

        if not gene_name:
            # A score without a gene name cannot be ranked.
            skipped_line_numbers.append(line_number)
            continue

        gene_scores.append((gene_name, score_val))

if skipped_line_numbers:
    print(
        f"Skipped {len(skipped_line_numbers)} unparseable line(s); "
        f"first few line numbers: {skipped_line_numbers[:5]}"
    )

# 3. Collect the (gene, score) pairs in a DataFrame for sorting.
df = pd.DataFrame(gene_scores, columns=['gene', 'score'])

# 4. Sort by score, highest first (the rank file is written in this order).
#    A stable sort keeps genes with identical scores in their original file
#    order, so the output is reproducible from run to run.
df_sorted = df.sort_values(by='score', ascending=False, kind='mergesort')

# 5. Write the ranked list: tab-separated, no header row, no index column.
df_sorted.to_csv(output_rnk_file, sep='\t', header=False, index=False)

print(f"Successfully created GSEA-ready rank file at: {output_rnk_file}")
print(f"Total genes in rank file: {len(df_sorted)}")
# Note: this previews the sorted DataFrame (with column names and index),
# not the raw text of the written file.
print("\nFirst 5 lines of the new .rnk file:")
print(df_sorted.head())

Successfully created GSEA-ready rank file at: /content/pageRank_graph.rnk
Total genes in rank file: 1627

First 5 lines of the new .rnk file:
      gene     score
0  YBR160W  0.390867
1  YER133W  0.340878
2  YPL031C  0.231531
3  YDL047W  0.180974
4  YPL129W  0.176248
