In [128]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import os

In [129]:
# Show the kernel's working directory; the relative data_dir used below is resolved against it.
os.getcwd()

'/Users/mubaraq/gnn-challenge/organizer_scripts'

In [130]:
# -----------------------------
# 1. Load expression data
# -----------------------------
# All inputs live in ../data relative to this notebook (organizer_scripts/).
data_dir = '../data'

# Expression matrices: cfRNA first, then placenta.
expr_cfRNA, expr_placenta = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('expr_df_2_GSE192902.csv', 'expr_df_GSE234729.csv')
)

# Sample-level metadata tables, in the same cfRNA / placenta order.
meta_cfRNA, meta_placenta = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('metadata_cfRNA.csv', 'metadata_placenta.csv')
)

# Quick look at the first placenta rows.
print(expr_placenta.head())

                    Gene_ID    P1_1154    P1_1651    P1_1689     P1_1748  \
0     ENSG00000139055$ERP27  28.783956  19.929550  24.891419   39.407868   
1      ENSG00000182759$MAFA  18.820279  26.223093  45.337942   33.185573   
2       ENSG00000205221$VIT  23.248580  29.369864  13.334689  145.186882   
3    ENSG00000207582$MIR30B  14.391978  14.684932  12.445710    2.074098   
4  ENSG00000207620$MIR516A2  17.713204   7.342466   8.889793    7.259344   

    P1_1973    P1_2013     P1_2052    P1_2139    P1_2263  ...    P9_1805  \
0   7.12230  44.643687   49.938054   6.726263  27.046152  ...  14.759850   
1  10.68345  11.420478   16.646018  10.569842  36.782766  ...  28.465425   
2  17.80575  53.987714  267.525291  73.988894  51.928611  ...  10.542750   
3  11.87050  11.420478    1.189001   9.608947  12.982153  ...   9.488475   
4   0.00000   8.305802    5.945006   7.687158   4.327384  ...   7.379925   

        P931       P942       P950        P972       P976       P979  \
0   5.535609  

In [131]:
# -----------------------------
# 2. Gene harmonization
# -----------------------------
# Put both matrices on a common gene-ID index so their genes can be matched.

# Placenta IDs look like 'ENSG...$SYMBOL': keep the part before '$' as the index.
expr_placenta = (
    expr_placenta
    .assign(ensembl_id=expr_placenta["Gene_ID"].str.split("$").str[0])
    .set_index("ensembl_id")
    .drop(columns=["Gene_ID"])
)

# cfRNA: index by the 'gene_num' column and discard 'gene_name'.
expr_cfRNA = expr_cfRNA.set_index("gene_num").drop(columns=["gene_name"])

# Remove stray whitespace from both indexes before comparing them.
expr_placenta.index = expr_placenta.index.str.strip()
expr_cfRNA.index = expr_cfRNA.index.str.strip()

# Genes present in both datasets.
shared_genes = expr_placenta.index.intersection(expr_cfRNA.index)
print("Shared genes:", len(shared_genes))

# Restrict both matrices to the shared genes, selected in the same order.
expr_placenta = expr_placenta.loc[shared_genes]
expr_cfRNA = expr_cfRNA.loc[shared_genes]



Shared genes: 6650


In [132]:

# -----------------------------
# 3. Normalize expression per gene (samples x genes format)

'''
We make samples the nodes and genes the features, scaling each gene across samples to produce meaningful
node feature vectors. The original orientation (genes as rows, samples as columns) would treat genes as
"data points", which does not fit the GNN node-feature paradigm, because each node's feature vector
is made up of that sample's gene-expression values.
'''

# -----------------------------
# Work in samples x genes orientation: each gene (column) is standardised across samples.
# The scaler is fitted on the cfRNA matrix only; the placenta matrix is transformed
# with those same cfRNA-derived statistics.
expr_cfRNA_T = expr_cfRNA.transpose()
scaler = StandardScaler().fit(expr_cfRNA_T)
expr_cfRNA_scaled = pd.DataFrame(
    scaler.transform(expr_cfRNA_T),
    index=expr_cfRNA_T.index,
    columns=expr_cfRNA_T.columns,
)

expr_placenta_T = expr_placenta.transpose()
expr_placenta_scaled = pd.DataFrame(
    scaler.transform(expr_placenta_T),
    index=expr_placenta_T.index,
    columns=expr_placenta_T.columns,
)



In [133]:
# -----------------------------
# 4. Create train.csv (cfRNA) and test.csv (placenta) with correct label mapping
# -----------------------------

# Start from independent copies so the scaled matrices themselves stay untouched.
train_df, test_df = expr_cfRNA_scaled.copy(), expr_placenta_scaled.copy()



In [134]:
# Binary target: controls are 0, both pre-eclampsia categories collapse to 1.
mapping_dict = {'control': 0}
mapping_dict.update(dict.fromkeys(('pre-eclampsia', 'severe pre-eclampsia'), 1))

# Strip surrounding whitespace first so the dictionary keys match exactly;
# any disease value missing from mapping_dict becomes NaN in 'disease_labels'.
meta_cfRNA['disease'] = meta_cfRNA['disease'].str.strip()
meta_cfRNA['disease_labels'] = meta_cfRNA['disease'].map(mapping_dict)


In [135]:
# Inspect the unscaled, transposed cfRNA matrix (rows are samples, columns are genes).
expr_cfRNA_T

Unnamed: 0,ENSG00000169877,ENSG00000176463,ENSG00000205639,ENSG00000244716,ENSG00000124813,ENSG00000215302,ENSG00000226608,ENSG00000248309,ENSG00000113356,ENSG00000226360,...,ENSG00000278272,ENSG00000278311,ENSG00000278540,ENSG00000278588,ENSG00000278705,ENSG00000278730,ENSG00000278771,ENSG00000278828,ENSG00000279117,ENSG00000281649
782752_3,6,8,17,2,6,21,2,25,325,3,...,11,44,8,7,0,37,256,312,11,12
782752_4,8,12,9,4,4,12,6,10,88,1,...,13,24,3,3,9,13,182,92,8,10
637549_3,7,16,31,4,4,103,6,69,613,1,...,11,37,9,7,4,43,296,521,12,7
637549_4,4,0,5,1,5,21,1,7,83,0,...,4,13,4,4,3,6,105,75,12,2
549101_1,4,3,4,0,2,50,2,16,150,1,...,4,17,4,0,3,10,133,170,5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839621_3,30,23,31,10,2,94,10,39,380,4,...,20,41,6,15,17,53,414,348,11,6
839621_4,26,9,4,4,2,15,8,1,50,3,...,16,18,7,10,11,7,269,43,0,4
930841_1,1,1,3,2,6,9,6,5,47,0,...,4,18,3,1,3,4,160,31,3,2
930841_2,2,2,2,1,2,6,2,2,25,0,...,2,7,3,0,3,13,122,36,5,1


In [136]:
# Inspect the training frame; at this point it is a copy of the scaled cfRNA matrix.
train_df

Unnamed: 0,ENSG00000169877,ENSG00000176463,ENSG00000205639,ENSG00000244716,ENSG00000124813,ENSG00000215302,ENSG00000226608,ENSG00000248309,ENSG00000113356,ENSG00000226360,...,ENSG00000278272,ENSG00000278311,ENSG00000278540,ENSG00000278588,ENSG00000278705,ENSG00000278730,ENSG00000278771,ENSG00000278828,ENSG00000279117,ENSG00000281649
782752_3,0.156980,-0.072228,0.514708,-0.401727,0.144883,0.052908,-0.667910,0.868102,1.491133,0.146461,...,0.520635,0.597109,0.238333,0.610908,-0.927578,1.134118,-0.046383,1.022826,0.392123,0.236402
782752_4,0.444778,0.091410,-0.014561,0.108674,-0.033763,-0.172249,0.661549,0.036856,0.006385,-0.659073,...,0.754390,0.109054,-0.134508,-0.206242,0.852804,0.049723,-0.529131,-0.088537,0.134136,0.146569
637549_3,0.300879,0.255048,1.440930,0.108674,-0.033763,2.104340,0.661549,3.306422,3.295383,-0.659073,...,0.520635,0.426290,0.312902,0.610908,-0.136297,1.405217,0.214562,2.078621,0.478118,0.011820
637549_4,-0.130817,-0.399504,-0.279196,-0.656928,0.055560,0.052908,-1.000274,-0.129393,-0.024939,-1.061840,...,-0.297506,-0.159377,-0.059940,-0.001955,-0.334117,-0.266559,-1.031451,-0.174415,0.478118,-0.212762
549101_1,-0.130817,-0.276775,-0.345355,-0.912128,-0.212409,0.778414,-0.667910,0.369354,0.394800,-0.659073,...,-0.297506,-0.061766,-0.059940,-0.819105,-0.334117,-0.085826,-0.848789,0.305492,-0.123850,-0.033096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839621_3,3.610551,0.541415,1.440930,1.639877,-0.212409,1.879183,1.991007,1.643931,1.835694,0.549228,...,1.572530,0.523901,0.089197,2.245208,2.435365,1.857049,0.984349,1.204685,0.392123,-0.033096
839621_4,3.034956,-0.031318,-0.345355,0.108674,-0.212409,-0.097197,1.326278,-0.461892,-0.231676,0.146461,...,1.105021,-0.037363,0.163765,1.223770,1.248444,-0.221376,0.038424,-0.336068,-0.553827,-0.122929
930841_1,-0.562513,-0.358594,-0.411513,-0.401727,0.144883,-0.247302,0.661549,-0.240226,-0.250470,-1.061840,...,-0.297506,-0.037363,-0.134508,-0.614817,-0.334117,-0.356925,-0.672651,-0.396687,-0.295841,-0.212762
930841_2,-0.418615,-0.317685,-0.477672,-0.656928,-0.212409,-0.322354,-0.667910,-0.406475,-0.388295,-1.061840,...,-0.531260,-0.305794,-0.134508,-0.819105,-0.334117,0.049723,-0.920549,-0.371429,-0.123850,-0.257678


In [137]:
# Name the sample-ID index after the metadata column it is merged against below.
train_df.index.name = 'Sample_title'


In [138]:
# Slim copy of the metadata: only the merge key and the binary label.
new_meta_cfRNA = meta_cfRNA[['Sample_title', 'disease_labels']].copy()

In [139]:
# Inner-join the scaled expression rows (keyed by their index) to the labels
# (keyed by the 'Sample_title' column). The key column is dropped again so that
# only the gene features and 'disease_labels' remain.
combined_data = train_df.merge(
    new_meta_cfRNA,
    how='inner',
    left_index=True,
    right_on='Sample_title',
).drop(columns=['Sample_title'])

In [140]:
# From here on train_df is the labelled frame: gene features plus 'disease_labels'.
train_df = combined_data

In [141]:

# Report how many cfRNA samples remain after the inner join with the metadata.
# The count is computed outside the f-string: reusing the same quote character
# inside a replacement field is a SyntaxError on Python versions before 3.12.
n_labelled = len(train_df['disease_labels'])
print(f'cfRNA dataset has {n_labelled} valid samples with matching metadata')

cfRNA dataset has 209 valid samples with matching metadata


In [142]:
# The inner merge silently drops cfRNA samples that have no metadata row (and
# duplicates samples that have several). The sample IDs below are attached by
# position, so the merged frame must line up one-to-one with the scaled matrix.
if len(train_df) != len(expr_cfRNA_scaled):
    raise ValueError(
        f"Merged training frame has {len(train_df)} rows but the scaled cfRNA "
        f"matrix has {len(expr_cfRNA_scaled)} samples; some samples are missing "
        "from (or duplicated in) the metadata."
    )

# A NaN label means the sample's disease value has no entry in mapping_dict.
if train_df['disease_labels'].isnull().any():
    print("⚠️ Some cfRNA samples have a disease value that is not in mapping_dict (label is NaN)!")

# Store original sample IDs before resetting index (positional assignment;
# the inner merge keeps the left frame's row order).
train_df['sample_id'] = expr_cfRNA_scaled.index
test_df['sample_id'] = expr_placenta_scaled.index

# Add unique node IDs, numbered by row position within each dataset.
train_df['node_id'] = [f'cfRNA_{i}' for i in range(len(train_df))]
test_df['node_id'] = [f'placenta_{i}' for i in range(len(test_df))]

# Replace the indexes with a plain 0..n-1 range; the IDs now live in columns.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save CSVs
train_df.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
test_df.to_csv(os.path.join(data_dir, 'test.csv'), index=False)

print("✅ train.csv and test.csv created with correct labels and node IDs")

✅ train.csv and test.csv created with correct labels and node IDs


In [143]:
# -----------------------------
# 5. Node types
# -----------------------------
# One row per node: all cfRNA (train) nodes first, then all placenta (test) nodes.
node_types = pd.DataFrame({
    'node_id': [*train_df['node_id'], *test_df['node_id']],
    'node_type': [
        kind
        for kind, frame in (('cfRNA', train_df), ('placenta', test_df))
        for _ in range(len(frame))
    ],
})
node_types.to_csv(os.path.join(data_dir, 'node_types.csv'), index=False)

In [144]:


edges = []

def build_similarity_edges(df, top_k=10,
                           exclude_cols=('target', 'disease_labels', 'node_id', 'sample_id')):
    """
    Build k-nearest-neighbour similarity edges within a single graph (train or test).

    Parameters
    ----------
    df : pandas.DataFrame
        One row per node. Must contain a 'node_id' column; every column not
        listed in `exclude_cols` is used as a numeric feature.
    top_k : int, default 10
        Number of most cosine-similar neighbours to link to each node. Capped
        at ``len(df) - 1``.
    exclude_cols : iterable of str
        Columns that are not features. The label column 'disease_labels' is
        excluded by default so that the target does not influence which
        nodes get connected.

    Returns
    -------
    list of dict
        Records with keys 'src', 'dst' and 'edge_type' ('similarity'). Each node
        contributes one record per neighbour, so a mutually-nearest pair appears
        once in each direction. Empty when `df` has fewer than two rows or
        `top_k` is not positive.
    """
    n_nodes = len(df)
    k = min(top_k, n_nodes - 1)
    if k <= 0:
        return []

    # Feature columns = everything that is not an identifier or a label.
    excluded = set(exclude_cols)
    feature_cols = [c for c in df.columns if c not in excluded]

    # Missing values are treated as 0 before computing similarities.
    sim_matrix = cosine_similarity(df[feature_cols].fillna(0))

    # Rule out self-matches explicitly instead of assuming the node itself is
    # always ranked last by argsort (not true for ties or all-zero rows).
    np.fill_diagonal(sim_matrix, -np.inf)

    # Positional lookup, so the result does not depend on df's index labels.
    node_ids = df['node_id'].tolist()

    edges_local = []
    for i in range(n_nodes):
        # argsort is ascending: the last k entries are the k most similar nodes.
        top_idx = np.argsort(sim_matrix[i])[-k:]
        for j in top_idx:
            edges_local.append({
                'src': node_ids[i],
                'dst': node_ids[j],
                'edge_type': 'similarity'
            })
    return edges_local


In [145]:
def build_extra_edges(meta_df, df, meta_col, edge_type_name, node_to_sample):
    """
    Link every pair of nodes whose samples share the same value in `meta_col`.

    `meta_df` is looked up by its 'Sample_title' column, `df` supplies the nodes
    (column 'node_id', processed in row order) and `node_to_sample` maps each
    node ID to its sample title. Nodes whose sample has no metadata entry, or
    whose value is None, are left unconnected. Each qualifying pair is emitted
    once, with the earlier row as 'src' and the later row as 'dst', tagged
    with `edge_type_name`.
    """
    # Sample_title -> metadata value
    value_by_sample = meta_df.set_index('Sample_title')[meta_col].to_dict()

    # Resolve every node to its metadata value once, keeping df's row order.
    node_ids = list(df['node_id'])
    node_values = [value_by_sample.get(node_to_sample[node]) for node in node_ids]
    n_nodes = len(node_ids)

    return [
        {'src': node_ids[a], 'dst': node_ids[b], 'edge_type': edge_type_name}
        for a in range(n_nodes)
        if node_values[a] is not None
        for b in range(a + 1, n_nodes)
        if node_values[b] is not None and node_values[a] == node_values[b]
    ]


In [146]:
# -----------------------------
# 6. Build edges for train/test
# -----------------------------
# Similarity edges come first: the cfRNA (train) graph, then the placenta (test)
# graph. Each graph is processed on its own, so no similarity edge crosses them.
edges = [
    *build_similarity_edges(train_df, top_k=10),
    *build_similarity_edges(test_df, top_k=10),
]

# -----------------------------
# Lookup tables: node_id -> original sample ID
# -----------------------------
node_to_sample_train = {
    node: sample for node, sample in zip(train_df['node_id'], train_df['sample_id'])
}
node_to_sample_test = {
    node: sample for node, sample in zip(test_df['node_id'], test_df['sample_id'])
}

# Disabled step: the geo-accession edges for the cfRNA graph are wrapped in a bare
# string literal below, so none of this code is executed.
'''# -----------------------------
# Geo accession edges (train - cfRNA)
# -----------------------------
train_geo_edges = build_extra_edges(
    meta_df=meta_cfRNA,
    df=train_df,
    meta_col='Sample_geo_accession',
    edge_type_name='geo_accession',
    node_to_sample=node_to_sample_train
)
edges.extend(train_geo_edges)
print(f"✅ {len(train_geo_edges)} geo accession edges added for cfRNA nodes")
'''
# -----------------------------
# Ancestry edges (test - placenta)
# Placenta nodes with the same self-reported maternal ancestry are linked pairwise.
# -----------------------------
test_ancestry_edges = build_extra_edges(
    meta_placenta,
    test_df,
    'maternal_ancestry_self_report',
    'ancestry',
    node_to_sample_test,
)
edges += test_ancestry_edges
print(f"✅ {len(test_ancestry_edges)} ancestry edges added for placenta nodes")

# -----------------------------
# Write the combined edge list and summarise it by edge type
# -----------------------------
edges_df = pd.DataFrame(edges)
edges_path = os.path.join(data_dir, 'graph_edges.csv')
edges_df.to_csv(edges_path, index=False)
print("✅ graph_edges.csv created successfully")
print(edges_df['edge_type'].value_counts())

✅ 2601 ancestry edges added for placenta nodes
✅ graph_edges.csv created successfully
edge_type
similarity    3320
ancestry      2601
Name: count, dtype: int64


In [None]:
# Same label mapping as in the earlier labelling cell; applying it again yields
# the same 'disease' and 'disease_labels' columns.
mapping_dict = dict(
    [('control', 0), ('pre-eclampsia', 1), ('severe pre-eclampsia', 1)]
)

meta_cfRNA['disease'] = meta_cfRNA['disease'].str.strip()
meta_cfRNA['disease_labels'] = meta_cfRNA['disease'].map(mapping_dict)


In [None]:
# Name the index of the unscaled cfRNA matrix 'Sample_title', the same name as
# the metadata key column used by the merge in the following cells.
expr_cfRNA_T.index.name = 'Sample_title'


In [None]:
# Rebuild the two-column (merge key, label) copy of the metadata.
new_meta_cfRNA = meta_cfRNA.loc[:, ['Sample_title', 'disease_labels']].copy()

In [None]:
# NOTE(review): unlike the earlier merge, this one joins the *unscaled*
# expr_cfRNA_T and rebinds combined_data; train_df is not affected by it.
# Looks like leftover exploration -- confirm before relying on combined_data below.
combined_data = expr_cfRNA_T.merge(
    new_meta_cfRNA,
    how='inner',
    left_index=True,
    right_on='Sample_title',
).drop(columns=['Sample_title'])

In [153]:
# Show the binary label column of the training frame.
train_df['disease_labels']

0      0
1      0
2      0
3      0
4      0
      ..
204    1
205    1
206    0
207    0
208    0
Name: disease_labels, Length: 209, dtype: int64

In [None]:
# Inspect the most recently merged frame.
combined_data

In [None]:
# (rows, columns) of the two-column metadata frame.
new_meta_cfRNA.shape