In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import os

In [3]:
os.getcwd()

'/Users/mubaraq/Desktop/gnn-challenge/organizer_scripts'

In [4]:
# -----------------------------
# 1. Load expression data
# -----------------------------
# Use relative path - notebook is in organizer_scripts/, data is in ../data/
data_dir = '../data'

expr_cfRNA = pd.read_csv(os.path.join(data_dir, 'expr_df_2_GSE192902.csv'))
expr_placenta = pd.read_csv(os.path.join(data_dir, 'expr_df_GSE234729.csv'))

meta_cfRNA = pd.read_csv(os.path.join(data_dir, 'metadata_cfRNA.csv'))
meta_placenta = pd.read_csv(os.path.join(data_dir, 'metadata_placenta.csv'))


print(expr_placenta.head())

                    Gene_ID    P1_1154    P1_1651    P1_1689     P1_1748  \
0     ENSG00000139055$ERP27  28.783956  19.929550  24.891419   39.407868   
1      ENSG00000182759$MAFA  18.820279  26.223093  45.337942   33.185573   
2       ENSG00000205221$VIT  23.248580  29.369864  13.334689  145.186882   
3    ENSG00000207582$MIR30B  14.391978  14.684932  12.445710    2.074098   
4  ENSG00000207620$MIR516A2  17.713204   7.342466   8.889793    7.259344   

    P1_1973    P1_2013     P1_2052    P1_2139    P1_2263  ...    P9_1805  \
0   7.12230  44.643687   49.938054   6.726263  27.046152  ...  14.759850   
1  10.68345  11.420478   16.646018  10.569842  36.782766  ...  28.465425   
2  17.80575  53.987714  267.525291  73.988894  51.928611  ...  10.542750   
3  11.87050  11.420478    1.189001   9.608947  12.982153  ...   9.488475   
4   0.00000   8.305802    5.945006   7.687158   4.327384  ...   7.379925   

        P931       P942       P950        P972       P976       P979  \
0   5.535609  

In [5]:
# -----------------------------
# 2. Gene harmonization
# -----------------------------


# manipulating GENE numbers to ensure matching

# Extract ENSG ID from 'ENSG...$SYMBOL'
expr_placenta["ensembl_id"] = expr_placenta["Gene_ID"].str.split("$").str[0]
expr_placenta = expr_placenta.set_index("ensembl_id")
expr_placenta = expr_placenta.drop(columns=["Gene_ID"])

expr_cfRNA = expr_cfRNA.set_index("gene_num")
expr_cfRNA = expr_cfRNA.drop(columns=["gene_name"])

expr_placenta.index = expr_placenta.index.str.strip()
expr_cfRNA.index = expr_cfRNA.index.str.strip()

shared_genes = expr_placenta.index.intersection(expr_cfRNA.index)
print("Shared genes:", len(shared_genes))

expr_placenta = expr_placenta.loc[shared_genes]
expr_cfRNA = expr_cfRNA.loc[shared_genes]



Shared genes: 6650


In [None]:

# -----------------------------
# 3. Normalize expression per gene (samples x genes format)

'''
We make samples the nodes and genes the features, scaling each gene across samples to produce meaningful....
node feature vectors; the original orientation (genes as rows, samples as columns) would treat genes as ....
“data points,” which doesn’t fit the GNN node-feature paradigm. because all the features comprises of its gene representation =)
'''

# -----------------------------
# Transpose to get samples x genes, scale each gene across samples, then keep that format
scaler = StandardScaler()
expr_cfRNA_T = expr_cfRNA.T
expr_cfRNA_scaled = pd.DataFrame(scaler.fit_transform(expr_cfRNA_T),
                                  index=expr_cfRNA_T.index, columns=expr_cfRNA_T.columns)

expr_placenta_T = expr_placenta.T
expr_placenta_scaled = pd.DataFrame(scaler.transform(expr_placenta_T),
                                     index=expr_placenta_T.index, columns=expr_placenta_T.columns)



In [15]:
# -----------------------------
# 4. Create train.csv (cfRNA) and test.csv (placenta) with correct label mapping
# -----------------------------

# Copy scaled expression matrices
train_df = expr_cfRNA_scaled.copy()
test_df = expr_placenta_scaled.copy()

# Build a mapping from metadata sample ID -> numeric disease label
# Adjust column name if your metadata uses 'disease' instead of 'diagnosis'
disease_map = meta_cfRNA.set_index('Sample_title')['disease'].map({'control': 0, 'preeclampsia': 1})


In [None]:

# Assign target labels by matching sample IDs (index of expr_cfRNA_scaled)
train_df['target'] = train_df.index.map(disease_map)

print(f'cfRNA dataset has {len(train_df['target'])} valid samples with matching metadata')

cfRNA dataset 209 valid samples with matching metadata


In [30]:
meta_cfRNA.shape
#meta_cfRNA.head()
#meta_cfRNA.columns

(404, 36)

In [23]:
# Check for any unmatched samples
if train_df['target'].isnull().any():
    print("⚠️ Some cfRNA samples could not be matched to metadata!")

# Add unique node IDs
train_df['node_id'] = ['cfRNA_'+str(i) for i in range(len(train_df))]
test_df['node_id'] = ['placenta_'+str(i) for i in range(len(test_df))]

# Reset indices (optional)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save CSVs
train_df.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
test_df.to_csv(os.path.join(data_dir, 'test.csv'), index=False)

print("✅ train.csv and test.csv created with correct labels and node IDs")


⚠️ Some cfRNA samples could not be matched to metadata!
✅ train.csv and test.csv created with correct labels and node IDs


In [24]:
# -----------------------------
# 5. Node types
# -----------------------------
node_types = pd.DataFrame({
    'node_id': list(train_df['node_id']) + list(test_df['node_id']),
    'node_type': ['cfRNA']*len(train_df) + ['placenta']*len(test_df)
})
node_types.to_csv(os.path.join(data_dir, 'node_types.csv'), index=False)

In [31]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

edges = []

def build_similarity_edges(df, top_k=10):
    feature_cols = [c for c in df.columns if c not in ['target', 'node_id']]
    sim = cosine_similarity(df[feature_cols])

    for i in range(sim.shape[0]):
        nbrs = np.argsort(sim[i])[-(top_k + 1):-1]  # exclude self
        for j in nbrs:
            edges.append({
                'src': df.loc[i, 'node_id'],
                'dst': df.loc[j, 'node_id'],
                'edge_type': 'similarity'
            })

# Training graph
build_similarity_edges(train_df, top_k=10)

# Test graph (separate, unseen during training)
build_similarity_edges(test_df, top_k=10)


In [33]:
def build_ancestry_edges(meta_df, df, ancestry_col):
    """
    Build undirected ancestry-based edges within a graph split.
    """

    # Align metadata to df samples ONLY
    aligned_meta = meta_df.loc[df.index, ancestry_col]

    edges = []

    for i, idx_i in enumerate(df.index):
        for j in range(i + 1, len(df)):
            idx_j = df.index[j]

            if aligned_meta.loc[idx_i] == aligned_meta.loc[idx_j]:
                edges.append({
                    "src": df.loc[idx_i, "node_id"],
                    "dst": df.loc[idx_j, "node_id"],
                    "edge_type": "ancestry"
                })

    return edges

In [None]:
'''
# Train graph (cfRNA or placenta — but never mixed)
train_edges = build_ancestry_edges(
    meta_df=meta_cfRNA,
    df=train_df,
    ancestry_col="maternal_ancestry_self_report"
)

# Test graph (separate graph!)
test_edges = build_ancestry_edges(
    meta_df=meta_placenta,
    df=test_df,
    ancestry_col="maternal_ancestry_self_report"
)

'''