In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import os

In [3]:
os.getcwd()

'/Users/mubaraq/Desktop/gnn-challenge/organizer_scripts'

In [4]:
# -----------------------------
# 1. Load expression data
# -----------------------------
# Use relative path - notebook is in organizer_scripts/, data is in ../data/
data_dir = '../data'

expr_cfRNA = pd.read_csv(os.path.join(data_dir, 'expr_df_2_GSE192902.csv'))
expr_placenta = pd.read_csv(os.path.join(data_dir, 'expr_df_GSE234729.csv'))

meta_cfRNA = pd.read_csv(os.path.join(data_dir, 'metadata_cfRNA.csv'))
meta_placenta = pd.read_csv(os.path.join(data_dir, 'metadata_placenta.csv'))


print(expr_placenta.head())

                    Gene_ID    P1_1154    P1_1651    P1_1689     P1_1748  \
0     ENSG00000139055$ERP27  28.783956  19.929550  24.891419   39.407868   
1      ENSG00000182759$MAFA  18.820279  26.223093  45.337942   33.185573   
2       ENSG00000205221$VIT  23.248580  29.369864  13.334689  145.186882   
3    ENSG00000207582$MIR30B  14.391978  14.684932  12.445710    2.074098   
4  ENSG00000207620$MIR516A2  17.713204   7.342466   8.889793    7.259344   

    P1_1973    P1_2013     P1_2052    P1_2139    P1_2263  ...    P9_1805  \
0   7.12230  44.643687   49.938054   6.726263  27.046152  ...  14.759850   
1  10.68345  11.420478   16.646018  10.569842  36.782766  ...  28.465425   
2  17.80575  53.987714  267.525291  73.988894  51.928611  ...  10.542750   
3  11.87050  11.420478    1.189001   9.608947  12.982153  ...   9.488475   
4   0.00000   8.305802    5.945006   7.687158   4.327384  ...   7.379925   

        P931       P942       P950        P972       P976       P979  \
0   5.535609  

In [5]:
# -----------------------------
# 2. Gene harmonization
# -----------------------------


# manipulating GENE numbers to ensure matching

# Extract ENSG ID from 'ENSG...$SYMBOL'
expr_placenta["ensembl_id"] = expr_placenta["Gene_ID"].str.split("$").str[0]
expr_placenta = expr_placenta.set_index("ensembl_id")
expr_placenta = expr_placenta.drop(columns=["Gene_ID"])

expr_cfRNA = expr_cfRNA.set_index("gene_num")
expr_cfRNA = expr_cfRNA.drop(columns=["gene_name"])

expr_placenta.index = expr_placenta.index.str.strip()
expr_cfRNA.index = expr_cfRNA.index.str.strip()

shared_genes = expr_placenta.index.intersection(expr_cfRNA.index)
print("Shared genes:", len(shared_genes))

expr_placenta = expr_placenta.loc[shared_genes]
expr_cfRNA = expr_cfRNA.loc[shared_genes]



Shared genes: 6650


In [None]:

# -----------------------------
# 3. Normalize expression per gene (samples x genes format)

'''
We make samples the nodes and genes the features, scaling each gene across samples to produce meaningful....
node feature vectors; the original orientation (genes as rows, samples as columns) would treat genes as ....
“data points,” which doesn’t fit the GNN node-feature paradigm. because all the features comprises of its gene representation =)
'''

# -----------------------------
# Transpose to get samples x genes, scale each gene across samples, then keep that format
scaler = StandardScaler()
expr_cfRNA_T = expr_cfRNA.T
expr_cfRNA_scaled = pd.DataFrame(scaler.fit_transform(expr_cfRNA_T),
                                  index=expr_cfRNA_T.index, columns=expr_cfRNA_T.columns)

expr_placenta_T = expr_placenta.T
expr_placenta_scaled = pd.DataFrame(scaler.transform(expr_placenta_T),
                                     index=expr_placenta_T.index, columns=expr_placenta_T.columns)



In [None]:
expr_cfRNA_scaled.head()

(          ENSG00000169877  ENSG00000176463  ENSG00000205639  ENSG00000244716  \
 782752_3         0.156980        -0.072228         0.514708        -0.401727   
 782752_4         0.444778         0.091410        -0.014561         0.108674   
 637549_3         0.300879         0.255048         1.440930         0.108674   
 637549_4        -0.130817        -0.399504        -0.279196        -0.656928   
 549101_1        -0.130817        -0.276775        -0.345355        -0.912128   
 
           ENSG00000124813  ENSG00000215302  ENSG00000226608  ENSG00000248309  \
 782752_3         0.144883         0.052908        -0.667910         0.868102   
 782752_4        -0.033763        -0.172249         0.661549         0.036856   
 637549_3        -0.033763         2.104340         0.661549         3.306422   
 637549_4         0.055560         0.052908        -1.000274        -0.129393   
 549101_1        -0.212409         0.778414        -0.667910         0.369354   
 
           ENSG000001133

In [12]:
meta_cfRNA.head()

Unnamed: 0,Sample_title,Sample_geo_accession,Sample_status,Sample_submission_date,Sample_last_update_date,Sample_type,Sample_channel_count,Sample_source_name_ch1,Sample_organism_ch1,disease,...,Sample_contact_zip/postal_code,Sample_contact_country,Sample_data_row_count,Sample_instrument_model,Sample_library_selection,Sample_library_source,Sample_library_strategy,Sample_relation,Sample_supplementary_file_1,Series_sample_id
0,944838_2,GSM5768318,Public on Jan 03 2022,Jan 02 2022,Jan 03 2022,SRA,1,blood plasma,Homo sapiens,control,...,94305,USA,0,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,GSM5768318
1,944838_1,GSM5768319,Public on Jan 03 2022,Jan 02 2022,Jan 03 2022,SRA,1,blood plasma,Homo sapiens,control,...,94305,USA,0,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,GSM5768319
2,944838_4,GSM5768320,Public on Jan 03 2022,Jan 02 2022,Jan 03 2022,SRA,1,blood plasma,Homo sapiens,control,...,94305,USA,0,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,GSM5768320
3,931746_1,GSM5768321,Public on Jan 03 2022,Jan 02 2022,Jan 03 2022,SRA,1,blood plasma,Homo sapiens,control,...,94305,USA,0,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,GSM5768321
4,826701_4,GSM5768322,Public on Jan 03 2022,Jan 02 2022,Jan 03 2022,SRA,1,blood plasma,Homo sapiens,control,...,94305,USA,0,Illumina NovaSeq 6000,cDNA,transcriptomic,RNA-Seq,SRA: https://www.ncbi.nlm.nih.gov/sra?term=SRX...,NONE,GSM5768322


In [7]:

# -----------------------------
# 4. Create train.csv (cfRNA) and test.csv (placenta)
# -----------------------------
train_df = expr_cfRNA_scaled.copy()
train_df['target'] = meta_cfRNA['diagnosis']  # 1=preeclampsia, 0=control
train_df['node_id'] = ['cfRNA_'+str(i) for i in range(len(train_df))]
train_df = train_df.reset_index(drop=True)
train_df.to_csv(os.path.join(data_dir, 'train.csv'), index=False)

test_df = expr_placenta_scaled.copy()
test_df['node_id'] = ['placenta_'+str(i) for i in range(len(test_df))]
test_df = test_df.reset_index(drop=True)
test_df.to_csv(os.path.join(data_dir, 'test.csv'), index=False)

KeyError: 'diagnosis'

In [15]:
# -----------------------------
# 4. Create train.csv (cfRNA) and test.csv (placenta) with correct label mapping
# -----------------------------

# Copy scaled expression matrices
train_df = expr_cfRNA_scaled.copy()
test_df = expr_placenta_scaled.copy()

# Build a mapping from metadata sample ID -> numeric disease label
# Adjust column name if your metadata uses 'disease' instead of 'diagnosis'
disease_map = meta_cfRNA.set_index('Sample_title')['disease'].map({'control': 0, 'preeclampsia': 1})


In [None]:

# Assign target labels by matching sample IDs (index of expr_cfRNA_scaled)
train_df['target'] = train_df.index.map(disease_map)

print(f'cfRNA dataset has {len(train_df['target'])} valid samples with matching metadata')

cfRNA dataset 209 valid samples with matching metadata


In [23]:
# Check for any unmatched samples
if train_df['target'].isnull().any():
    print("⚠️ Some cfRNA samples could not be matched to metadata!")

# Add unique node IDs
train_df['node_id'] = ['cfRNA_'+str(i) for i in range(len(train_df))]
test_df['node_id'] = ['placenta_'+str(i) for i in range(len(test_df))]

# Reset indices (optional)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save CSVs
train_df.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
test_df.to_csv(os.path.join(data_dir, 'test.csv'), index=False)

print("✅ train.csv and test.csv created with correct labels and node IDs")


⚠️ Some cfRNA samples could not be matched to metadata!
✅ train.csv and test.csv created with correct labels and node IDs


In [None]:
# -----------------------------
# 5. Node types
# -----------------------------
node_types = pd.DataFrame({
    'node_id': list(train_df['node_id']) + list(test_df['node_id']),
    'node_type': ['cfRNA']*len(train_df) + ['placenta']*len(test_df)
})
node_types.to_csv(os.path.join(data_dir, 'node_types.csv'), index=False)

In [None]:

# -----------------------------
# 6. Build graph_edges.csv
# -----------------------------
edges = []

# a) Within-modality cosine similarity edges (sparse)
def build_edges(df, node_prefix, top_k=10):
    sim_matrix = cosine_similarity(df[shared_genes])
    for i in range(sim_matrix.shape[0]):
        top_idx = np.argsort(sim_matrix[i])[-(top_k+1):-1]  # skip self
        for j in top_idx:
            edges.append({
                'src': f'{node_prefix}_{i}',
                'dst': f'{node_prefix}_{j}',
                'edge_type': 'similarity'
            })

build_edges(train_df, 'cfRNA')
build_edges(test_df, 'placenta')

# b) Cross-modality edges (cfRNA <-> placenta)
sim_matrix_cross = cosine_similarity(train_df[shared_genes], test_df[shared_genes])
top_k = 5
for i in range(sim_matrix_cross.shape[0]):
    top_idx = np.argsort(sim_matrix_cross[i])[-top_k:]
    for j in top_idx:
        edges.append({
            'src': f'cfRNA_{i}',
            'dst': f'placenta_{j}',
            'edge_type': 'cross_modality'
        })

# c) Optional: ancestry / gestational age edges
# Example for ancestry: connect nodes of same ancestry within each dataset
for df, prefix in zip([meta_cfRNA, meta_placenta], ['cfRNA','placenta']):
    ancestry_map = df['ancestry'].to_dict()
    nodes = list(df.index)
    for i in range(len(nodes)):
        for j in range(i+1, len(nodes)):
            if ancestry_map[nodes[i]] == ancestry_map[nodes[j]]:
                edges.append({
                    'src': f'{prefix}_{i}',
                    'dst': f'{prefix}_{j}',
                    'edge_type': 'ancestry'
                })

# Convert to DataFrame and save
edges_df = pd.DataFrame(edges)
edges_df.to_csv(os.path.join(data_dir, 'graph_edges.csv'), index=False)

print("✅ Dataset built successfully: train.csv, test.csv, node_types.csv, graph_edges.csv")