# Updating Coexpression Dataset with Missing Genes
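This notebook completes a coexpression dataset: genes that appear in the full gene list but not in the coexpression table are appended with placeholder values (`module = 'none'`, `frequency = -1`), and the result is reordered to follow the full gene list.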

In [None]:
# Load required libraries
import pandas as pd

# Input and output locations
full_gene_list_path = 'coexpression_gene_names_full.txt'
coexpression_path = 'coexpression_network_nodes_with_freq_corrected.txt'
final_cleaned_output_path = 'final_cleaned_updated_coexpression_with_109_genes.txt'

# Step 1: Load the datasets
# The coexpression table is tab-separated; its first line supplies the
# column names.
coexpression_df = pd.read_csv(
    coexpression_path,
    sep="\t",
)
# The gene list is read with header=None, so its first line is treated as
# data, and the column is named 'gene'.
full_gene_list_df = pd.read_csv(
    full_gene_list_path,
    names=["gene"],
    header=None,
)


In [None]:
# Step 2: Clean the full gene list to remove extra quotes and indices
# Drop any entry containing "gene_name" (case-insensitive), i.e. header
# lines. Take an explicit copy so the assignment below writes to an
# independent frame instead of a slice of full_gene_list_df (this avoids
# pandas' SettingWithCopyWarning).
cleaned_full_gene_list = full_gene_list_df[
    ~full_gene_list_df['gene'].str.contains("gene_name", case=False, na=False)
].copy()

# Keep only the trailing run of [a-zA-Z0-9_-] characters, ignoring an optional
# surrounding double quote. expand=False returns a Series, which is what a
# single-column assignment expects.
# NOTE(review): a name containing other characters (e.g. '.') would be
# truncated to the part after that character - confirm against the input.
cleaned_full_gene_list['gene'] = cleaned_full_gene_list['gene'].str.extract(
    r'[\"]?(?P<gene>[a-zA-Z0-9_-]+)[\"]?$', expand=False
)

# Entries that did not match the pattern are NaN after extraction, and
# repeated names can lead to duplicate 'node' labels, which make the reindex
# in Step 6 raise. Drop both, keeping the first occurrence of each name so
# the original ordering is preserved.
cleaned_full_gene_list = (
    cleaned_full_gene_list
    .dropna(subset=['gene'])
    .drop_duplicates(subset='gene')
)


In [None]:
# Step 3: Identify missing genes
# Flag every gene of the cleaned list that already appears as a 'node' in the
# coexpression table, then keep the rows that are NOT flagged.
gene_already_present = cleaned_full_gene_list['gene'].isin(coexpression_df['node'])
missing_genes_cleaned = cleaned_full_gene_list[~gene_already_present]


In [None]:
# Step 4: Create rows for missing genes
# Each missing gene becomes a placeholder row: the gene name goes into 'node',
# 'module' is set to 'none' and 'frequency' to -1.
missing_rows_cleaned = (
    missing_genes_cleaned[['gene']]
    .rename(columns={'gene': 'node'})
    .assign(module='none', frequency=-1)
)


In [None]:
# Step 5: Combine the original dataset with missing rows
# Stack the placeholder rows underneath the original table and renumber the
# row index from 0.
frames_to_stack = [coexpression_df, missing_rows_cleaned]
final_combined_df = pd.concat(frames_to_stack, ignore_index=True)


In [None]:
# Step 6: Reorder based on the cleaned full gene list
# Reindex with a plain list of labels rather than the 'gene' Series itself:
# when the target is a named Series, the reindexed index adopts that name, and
# reset_index() would then write the identifier column back as 'gene' instead
# of 'node', silently changing the header of the saved table.
gene_order = cleaned_full_gene_list['gene'].tolist()

# Rows whose 'node' is not in the gene list are dropped by the reindex; the
# output follows the gene list order exactly.
final_ordered_df = (
    final_combined_df
    .set_index('node')
    .reindex(gene_order)
    .reset_index()
)


In [None]:
# Step 7: Save the final updated dataset
# Written as a tab-separated file without the row index.
final_ordered_df.to_csv(
    final_cleaned_output_path,
    index=False,
    sep="\t",
)

# Report where the result was written.
print(f"Processed file saved at: {final_cleaned_output_path}")
