In [284]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import os

In [285]:
os.getcwd()

'/Users/mubaraq/gnn-challenge/organizer_scripts'

In [286]:
# -----------------------------
# 1. Load expression data
# -----------------------------
# Use relative path - notebook is in organizer_scripts/, data is in ../data/
data_dir = '../data'

expr_cfRNA = pd.read_csv(os.path.join(data_dir, 'expr_df_2_GSE192902.csv'))
expr_placenta = pd.read_csv(os.path.join(data_dir, 'expr_df_GSE234729.csv'))

meta_cfRNA = pd.read_csv(os.path.join(data_dir, 'metadata_cfRNA.csv'))
meta_placenta = pd.read_csv(os.path.join(data_dir, 'metadata_placenta.csv'))


print(expr_placenta.head())

                    Gene_ID    P1_1154    P1_1651    P1_1689     P1_1748  \
0     ENSG00000139055$ERP27  28.783956  19.929550  24.891419   39.407868   
1      ENSG00000182759$MAFA  18.820279  26.223093  45.337942   33.185573   
2       ENSG00000205221$VIT  23.248580  29.369864  13.334689  145.186882   
3    ENSG00000207582$MIR30B  14.391978  14.684932  12.445710    2.074098   
4  ENSG00000207620$MIR516A2  17.713204   7.342466   8.889793    7.259344   

    P1_1973    P1_2013     P1_2052    P1_2139    P1_2263  ...    P9_1805  \
0   7.12230  44.643687   49.938054   6.726263  27.046152  ...  14.759850   
1  10.68345  11.420478   16.646018  10.569842  36.782766  ...  28.465425   
2  17.80575  53.987714  267.525291  73.988894  51.928611  ...  10.542750   
3  11.87050  11.420478    1.189001   9.608947  12.982153  ...   9.488475   
4   0.00000   8.305802    5.945006   7.687158   4.327384  ...   7.379925   

        P931       P942       P950        P972       P976       P979  \
0   5.535609  

In [287]:
# -----------------------------
# 2. Gene harmonization
# -----------------------------


# manipulating GENE numbers to ensure matching

# Extract ENSG ID from 'ENSG...$SYMBOL'
expr_placenta["ensembl_id"] = expr_placenta["Gene_ID"].str.split("$").str[0]
expr_placenta = expr_placenta.set_index("ensembl_id")
expr_placenta = expr_placenta.drop(columns=["Gene_ID"])

expr_cfRNA = expr_cfRNA.set_index("gene_num")
expr_cfRNA = expr_cfRNA.drop(columns=["gene_name"])

expr_placenta.index = expr_placenta.index.str.strip()
expr_cfRNA.index = expr_cfRNA.index.str.strip()

shared_genes = expr_placenta.index.intersection(expr_cfRNA.index)
print("Shared genes:", len(shared_genes))

expr_placenta = expr_placenta.loc[shared_genes]
expr_cfRNA = expr_cfRNA.loc[shared_genes]



Shared genes: 6650


In [288]:

# -----------------------------
# 3. Normalize expression per gene (samples x genes format)

'''
We make samples the nodes and genes the features, scaling each gene across samples to produce meaningful....
node feature vectors; the original orientation (genes as rows, samples as columns) would treat genes as ....
“data points,” which doesn’t fit the GNN node-feature paradigm. because all the features comprises of its gene representation =)
'''

# -----------------------------
# Transpose to get samples x genes, scale each gene across samples, then keep that format
scaler = StandardScaler()
expr_cfRNA_T = expr_cfRNA.T
expr_cfRNA_scaled = pd.DataFrame(scaler.fit_transform(expr_cfRNA_T),
                                  index=expr_cfRNA_T.index, columns=expr_cfRNA_T.columns)

expr_placenta_T = expr_placenta.T
expr_placenta_scaled = pd.DataFrame(scaler.transform(expr_placenta_T),
                                     index=expr_placenta_T.index, columns=expr_placenta_T.columns)



In [289]:
# -----------------------------
# 4. Create train.csv (cfRNA) and test.csv (placenta) with correct label mapping
# -----------------------------

# Copy scaled expression matrices
train_df = expr_cfRNA_scaled.copy()
test_df = expr_placenta_scaled.copy()



In [290]:
meta_cfRNA['disease'] = meta_cfRNA['disease'].str.strip()

mapping_dict = {
    'control': 0,
    'pre-eclampsia': 1,
    'severe pre-eclampsia': 1
}

meta_cfRNA['disease_labels'] = meta_cfRNA['disease'].map(mapping_dict)





In [291]:
meta_placenta['diagnosis'] = meta_placenta['diagnosis'].str.strip()

mapping_dict = {
    'Non-hypertensive control': 0,
    'Severe Preeclampsia': 1,
}

meta_placenta['diagnosis_labels'] = meta_placenta['diagnosis'].map(mapping_dict)
       

In [292]:
train_df.index.name = 'Sample_title'
test_df.index.name = 'Sample_title'


In [293]:
new_meta_cfRNA = pd.DataFrame()
new_meta_cfRNA['Sample_title'] = meta_cfRNA['Sample_title']
new_meta_cfRNA['disease_labels'] = meta_cfRNA['disease_labels']



new_meta_placenta = pd.DataFrame()
new_meta_placenta['Sample_title'] = meta_placenta['Sample_title']
new_meta_placenta['diagnosis_labels'] = meta_placenta['diagnosis_labels']

# Store original indices BEFORE merge (so we can preserve sample_id mapping)
train_original_index = train_df.index.tolist()
test_original_index = test_df.index.tolist()

In [294]:
combined_data = pd.merge(
    train_df,                        # Left DF: Use its index
    new_meta_cfRNA,                  # Right DF: Use the 'Sample_title' column
    left_index=True,             # Use the index of 'expr'
    right_on='Sample_title',     # Use the 'Sample_title' column of 'meta_cfRNA'
    how='inner'
)
combined_data.drop(columns=['Sample_title'], inplace=True)




combined_data_test = pd.merge(
    test_df,                        # Left DF: Use its index
    new_meta_placenta,                  # Right DF: Use the 'Sample_title' column
    left_index=True,             # Use the index of 'expr'
    right_on='Sample_title',     # Use the 'Sample_title' column of 'meta_cfRNA'
    how='inner'
)
combined_data_test.drop(columns=['Sample_title'], inplace=True)

In [295]:
train_df = combined_data
test_df = combined_data_test

In [296]:
combined_data_test
#test_df

Unnamed: 0,ENSG00000169877,ENSG00000176463,ENSG00000205639,ENSG00000244716,ENSG00000124813,ENSG00000215302,ENSG00000226608,ENSG00000248309,ENSG00000113356,ENSG00000226360,...,ENSG00000278311,ENSG00000278540,ENSG00000278588,ENSG00000278705,ENSG00000278730,ENSG00000278771,ENSG00000278828,ENSG00000279117,ENSG00000281649,diagnosis_labels
22,2.001803,2.317889,-0.536747,-0.347076,3.959981,-0.084712,-0.228781,1.200495,-0.357654,1.613520,...,9.357108,32.332687,17.047665,17.906580,12.667922,3.331847,-0.385512,15.059538,29.532892,1
53,5.482081,1.703132,0.083965,-0.376756,2.700811,-0.000114,0.061862,1.284646,-0.308349,3.585350,...,10.990689,36.481765,14.394918,14.012298,14.249167,3.839910,-0.404922,15.502249,31.687713,1
57,3.642971,0.291482,-0.315921,-0.685260,4.452723,-0.138859,0.735618,2.340005,-0.361129,7.173345,...,9.307186,40.343675,22.426637,14.020356,18.782609,6.118513,-0.405092,13.589086,27.089200,1
61,0.785889,1.042952,0.419158,-0.382817,5.074247,-0.290849,0.046075,1.149308,-0.427971,2.279673,...,12.733581,28.795592,9.773705,12.817422,22.890911,0.861155,-0.474706,8.988595,43.902293,1
77,2.197444,0.280359,0.332415,-0.609192,7.137128,0.002693,-1.332639,1.061460,-0.418492,3.719206,...,8.590159,30.622515,17.610857,22.085023,20.916202,16.806903,-0.313426,12.818772,36.006956,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,2.274994,-0.052761,0.199985,1.731592,1.964331,-0.236854,3.362488,0.213339,-0.432816,4.248511,...,11.680663,33.490555,11.686262,43.784322,20.397816,36.816875,-0.115603,12.161175,30.280703,0
14,1.739488,0.370352,0.032595,6.369070,1.506762,-0.229470,3.106114,0.290063,-0.506884,5.539638,...,12.055757,43.459875,26.339624,58.275057,19.704476,31.632055,0.253226,15.629236,21.565081,0
15,0.253155,-0.009791,0.965614,3.463839,1.140580,-0.138809,4.999710,-0.042191,-0.485235,4.693423,...,12.239275,45.175453,26.231479,52.968440,17.798404,36.186048,0.303302,7.146752,21.091543,0
17,0.285531,0.325647,0.497571,5.119357,1.192258,-0.152188,8.159067,0.192128,-0.495560,6.474127,...,10.841962,34.375178,31.570018,47.968702,22.955524,32.685270,0.222761,13.757955,26.855832,0


In [297]:

# Assign target labels by matching sample IDs (index of expr_cfRNA_scaled)

print(f'cfRNA dataset has {len(train_df['disease_labels'])} valid samples with matching metadata')
print(f'Placenta dataset has {len(test_df['diagnosis_labels'])} valid samples with matching metadata')


cfRNA dataset has 209 valid samples with matching metadata
Placenta dataset has 111 valid samples with matching metadata


In [298]:
test_labels = test_df['diagnosis_labels']
test_df = test_df.drop(columns=['diagnosis_labels'])

In [299]:
test_df

Unnamed: 0,ENSG00000169877,ENSG00000176463,ENSG00000205639,ENSG00000244716,ENSG00000124813,ENSG00000215302,ENSG00000226608,ENSG00000248309,ENSG00000113356,ENSG00000226360,...,ENSG00000278272,ENSG00000278311,ENSG00000278540,ENSG00000278588,ENSG00000278705,ENSG00000278730,ENSG00000278771,ENSG00000278828,ENSG00000279117,ENSG00000281649
22,2.001803,2.317889,-0.536747,-0.347076,3.959981,-0.084712,-0.228781,1.200495,-0.357654,1.613520,...,18.126205,9.357108,32.332687,17.047665,17.906580,12.667922,3.331847,-0.385512,15.059538,29.532892
53,5.482081,1.703132,0.083965,-0.376756,2.700811,-0.000114,0.061862,1.284646,-0.308349,3.585350,...,14.559401,10.990689,36.481765,14.394918,14.012298,14.249167,3.839910,-0.404922,15.502249,31.687713
57,3.642971,0.291482,-0.315921,-0.685260,4.452723,-0.138859,0.735618,2.340005,-0.361129,7.173345,...,27.807886,9.307186,40.343675,22.426637,14.020356,18.782609,6.118513,-0.405092,13.589086,27.089200
61,0.785889,1.042952,0.419158,-0.382817,5.074247,-0.290849,0.046075,1.149308,-0.427971,2.279673,...,11.719354,12.733581,28.795592,9.773705,12.817422,22.890911,0.861155,-0.474706,8.988595,43.902293
77,2.197444,0.280359,0.332415,-0.609192,7.137128,0.002693,-1.332639,1.061460,-0.418492,3.719206,...,24.208032,8.590159,30.622515,17.610857,22.085023,20.916202,16.806903,-0.313426,12.818772,36.006956
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,2.274994,-0.052761,0.199985,1.731592,1.964331,-0.236854,3.362488,0.213339,-0.432816,4.248511,...,7.820491,11.680663,33.490555,11.686262,43.784322,20.397816,36.816875,-0.115603,12.161175,30.280703
14,1.739488,0.370352,0.032595,6.369070,1.506762,-0.229470,3.106114,0.290063,-0.506884,5.539638,...,21.584293,12.055757,43.459875,26.339624,58.275057,19.704476,31.632055,0.253226,15.629236,21.565081
15,0.253155,-0.009791,0.965614,3.463839,1.140580,-0.138809,4.999710,-0.042191,-0.485235,4.693423,...,20.166849,12.239275,45.175453,26.231479,52.968440,17.798404,36.186048,0.303302,7.146752,21.091543
17,0.285531,0.325647,0.497571,5.119357,1.192258,-0.152188,8.159067,0.192128,-0.495560,6.474127,...,21.678771,10.841962,34.375178,31.570018,47.968702,22.955524,32.685270,0.222761,13.757955,26.855832


In [300]:
# Check for any unmatched samples
if train_df['disease_labels'].isnull().any():
    print("⚠️ Some cfRNA samples could not be matched to metadata!")

# Store original sample IDs - use the index that was set before merge
# The index of train_df/test_df is now from expr_cfRNA_scaled/expr_placenta_scaled
train_df['sample_id'] = train_df.index
test_df['sample_id'] = test_df.index

# Add unique node IDs
train_df['node_id'] = ['cfRNA_'+str(i) for i in range(len(train_df))]
test_df['node_id'] = ['placenta_'+str(i) for i in range(len(test_df))]

# Reset indices (optional)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save CSVs
train_df.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
test_df.to_csv(os.path.join(data_dir, 'test.csv'), index=False) # without labels
test_labels.to_csv(os.path.join(data_dir, 'test_labels.csv'), index=True, header=['diagnosis_labels']) # with labels
print("✅ train.csv and test.csv created with correct labels and node IDs")

✅ train.csv and test.csv created with correct labels and node IDs


In [301]:
# -----------------------------
# 5. Node types
# -----------------------------
node_types = pd.DataFrame({
    'node_id': list(train_df['node_id']) + list(test_df['node_id']),
    'node_type': ['cfRNA']*len(train_df) + ['placenta']*len(test_df)
})
node_types.to_csv(os.path.join(data_dir, 'node_types.csv'), index=False)

In [302]:


edges = []

def build_similarity_edges(df, top_k=10):
    """
    Build undirected similarity edges within a single graph (train or test).
    """
    # Only numeric feature columns
    feature_cols = [c for c in df.columns if c not in ['target', 'node_id', 'sample_id']]
    
    # Fill NaNs if any
    sim_matrix = cosine_similarity(df[feature_cols].fillna(0))
    
    edges_local = []
    for i in range(sim_matrix.shape[0]):
        # get top_k neighbors, exclude self
        top_idx = np.argsort(sim_matrix[i])[-(top_k + 1):-1]
        for j in top_idx:
            edges_local.append({
                'src': df.loc[i, 'node_id'],
                'dst': df.loc[j, 'node_id'],
                'edge_type': 'similarity'
            })
    return edges_local


In [303]:
def build_extra_edges(meta_df, df, meta_col, edge_type_name, node_to_sample):
    """
    Build undirected edges based on a metadata column (ancestry or geo accession).
    Only connects nodes that share the same value in `meta_col`.
    """
    edges_local = []

    # Map Sample_title -> meta value
    meta_map = meta_df.set_index('Sample_title')[meta_col].to_dict()

    for i, node_i in enumerate(df['node_id']):
        sample_i = node_to_sample[node_i]
        value_i = meta_map.get(sample_i)
        if value_i is None:
            continue

        for j in range(i + 1, len(df)):
            node_j = df['node_id'].iloc[j]
            sample_j = node_to_sample[node_j]
            value_j = meta_map.get(sample_j)
            if value_j is None:
                continue

            if value_i == value_j:
                edges_local.append({
                    'src': node_i,
                    'dst': node_j,
                    'edge_type': edge_type_name
                })

    return edges_local


In [304]:
# -----------------------------
# 3. Build edges for train/test
# -----------------------------
edges = []

# -----------------------------
# Similarity edges (train and test separately)
# -----------------------------
edges.extend(build_similarity_edges(train_df, top_k=10))
edges.extend(build_similarity_edges(test_df, top_k=10))

# -----------------------------
# Mapping node_id → Sample_title
# -----------------------------
node_to_sample_train = dict(zip(train_df['node_id'], train_df['sample_id']))
node_to_sample_test = dict(zip(test_df['node_id'], test_df['sample_id']))

'''# -----------------------------
# Geo accession edges (train - cfRNA)
# -----------------------------
train_geo_edges = build_extra_edges(
    meta_df=meta_cfRNA,
    df=train_df,
    meta_col='Sample_geo_accession',
    edge_type_name='geo_accession',
    node_to_sample=node_to_sample_train
)
edges.extend(train_geo_edges)
print(f"✅ {len(train_geo_edges)} geo accession edges added for cfRNA nodes")
'''
# -----------------------------
# Ancestry edges (test - placenta)
# Groups samples by maternal ancestry
# -----------------------------
test_ancestry_edges = build_extra_edges(
    meta_df=meta_placenta,
    df=test_df,
    meta_col='maternal_ancestry_self_report',
    edge_type_name='ancestry',
    node_to_sample=node_to_sample_test
)
edges.extend(test_ancestry_edges)
print(f"✅ {len(test_ancestry_edges)} ancestry edges added for placenta nodes")

# -----------------------------
# Save edges to CSV
# -----------------------------
edges_df = pd.DataFrame(edges)
edges_df.to_csv(os.path.join(data_dir, 'graph_edges.csv'), index=False)
print("✅ graph_edges.csv created successfully")
print(edges_df['edge_type'].value_counts())

✅ 0 ancestry edges added for placenta nodes
✅ graph_edges.csv created successfully
edge_type
similarity    3200
Name: count, dtype: int64
