In [173]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import os

In [174]:
os.getcwd()

'/Users/mubaraq/Desktop/gnn-challenge/organizer_scripts'

In [175]:
# -----------------------------
# 1. Load expression data
# -----------------------------
# Use relative path - notebook is in organizer_scripts/, data is in ../data/
data_dir = '../data'

expr_cfRNA = pd.read_csv(os.path.join(data_dir, 'expr_df_2_GSE192902.csv'))
expr_placenta = pd.read_csv(os.path.join(data_dir, 'expr_df_GSE234729.csv'))

meta_cfRNA = pd.read_csv(os.path.join(data_dir, 'metadata_cfRNA.csv'))
meta_placenta = pd.read_csv(os.path.join(data_dir, 'metadata_placenta.csv'))


print(expr_placenta.head())

                    Gene_ID    P1_1154    P1_1651    P1_1689     P1_1748  \
0     ENSG00000139055$ERP27  28.783956  19.929550  24.891419   39.407868   
1      ENSG00000182759$MAFA  18.820279  26.223093  45.337942   33.185573   
2       ENSG00000205221$VIT  23.248580  29.369864  13.334689  145.186882   
3    ENSG00000207582$MIR30B  14.391978  14.684932  12.445710    2.074098   
4  ENSG00000207620$MIR516A2  17.713204   7.342466   8.889793    7.259344   

    P1_1973    P1_2013     P1_2052    P1_2139    P1_2263    P1_2326  \
0   7.12230  44.643687   49.938054   6.726263  27.046152  14.415633   
1  10.68345  11.420478   16.646018  10.569842  36.782766  45.464687   
2  17.80575  53.987714  267.525291  73.988894  51.928611  47.682477   
3  11.87050  11.420478    1.189001   9.608947  12.982153  13.306738   
4   0.00000   8.305802    5.945006   7.687158   4.327384   3.326684   

     P1_2591    P1_2672     P1_2699     P1_2703    P1_2778    P1_2858  \
0  11.309764  15.227685   23.935366   25.31

In [176]:
# -----------------------------
# 2. Gene harmonization
# -----------------------------


# manipulating GENE numbers to ensure matching

# Extract ENSG ID from 'ENSG...$SYMBOL'
expr_placenta["ensembl_id"] = expr_placenta["Gene_ID"].str.split("$").str[0]
expr_placenta = expr_placenta.set_index("ensembl_id")
expr_placenta = expr_placenta.drop(columns=["Gene_ID"])

expr_cfRNA = expr_cfRNA.set_index("gene_num")
expr_cfRNA = expr_cfRNA.drop(columns=["gene_name"])

expr_placenta.index = expr_placenta.index.str.strip()
expr_cfRNA.index = expr_cfRNA.index.str.strip()

shared_genes = expr_placenta.index.intersection(expr_cfRNA.index)
print("Shared genes:", len(shared_genes))

expr_placenta = expr_placenta.loc[shared_genes]
expr_cfRNA = expr_cfRNA.loc[shared_genes]



Shared genes: 6650


In [177]:

# -----------------------------
# 3. Normalize expression per gene (samples x genes format)

'''
We make samples the nodes and genes the features, scaling each gene across samples to produce meaningful....
node feature vectors; the original orientation (genes as rows, samples as columns) would treat genes as ....
“data points,” which doesn’t fit the GNN node-feature paradigm. because all the features comprises of its gene representation =)
'''

# -----------------------------
# Transpose to get samples x genes, scale each gene across samples, then keep that format
scaler = StandardScaler()
expr_cfRNA_T = expr_cfRNA.T
expr_cfRNA_scaled = pd.DataFrame(scaler.fit_transform(expr_cfRNA_T),
                                  index=expr_cfRNA_T.index, columns=expr_cfRNA_T.columns)

expr_placenta_T = expr_placenta.T
expr_placenta_scaled = pd.DataFrame(scaler.transform(expr_placenta_T),
                                     index=expr_placenta_T.index, columns=expr_placenta_T.columns)



In [178]:
# -----------------------------
# 4. Create train.csv (cfRNA) and test.csv (placenta) with correct label mapping
# -----------------------------

# Copy scaled expression matrices
train_df = expr_cfRNA_scaled.copy()
test_df = expr_placenta_scaled.copy()

# Build a mapping from metadata sample ID -> numeric disease label
# Adjust column name if your metadata uses 'disease' instead of 'diagnosis'
disease_map = meta_cfRNA.set_index('Sample_title')['disease'].map({'control': 0, 'preeclampsia': 1})


In [179]:

# Assign target labels by matching sample IDs (index of expr_cfRNA_scaled)
train_df['target'] = train_df.index.map(disease_map)

print(f'cfRNA dataset has {len(train_df['target'])} valid samples with matching metadata')

cfRNA dataset has 209 valid samples with matching metadata


In [180]:
meta_cfRNA.shape
#meta_cfRNA.head()
#meta_cfRNA.columns

(404, 36)

In [181]:
# Check for any unmatched samples
if train_df['target'].isnull().any():
    print("⚠️ Some cfRNA samples could not be matched to metadata!")

# Store original sample IDs before resetting index
train_df['sample_id'] = expr_cfRNA_scaled.index
test_df['sample_id'] = expr_placenta_scaled.index

# Add unique node IDs
train_df['node_id'] = ['cfRNA_'+str(i) for i in range(len(train_df))]
test_df['node_id'] = ['placenta_'+str(i) for i in range(len(test_df))]

# Reset indices (optional)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Save CSVs
train_df.to_csv(os.path.join(data_dir, 'train.csv'), index=False)
test_df.to_csv(os.path.join(data_dir, 'test.csv'), index=False)

print("✅ train.csv and test.csv created with correct labels and node IDs")

⚠️ Some cfRNA samples could not be matched to metadata!
✅ train.csv and test.csv created with correct labels and node IDs


In [182]:
# -----------------------------
# 5. Node types
# -----------------------------
node_types = pd.DataFrame({
    'node_id': list(train_df['node_id']) + list(test_df['node_id']),
    'node_type': ['cfRNA']*len(train_df) + ['placenta']*len(test_df)
})
node_types.to_csv(os.path.join(data_dir, 'node_types.csv'), index=False)

In [183]:


edges = []

def build_similarity_edges(df, top_k=10):
    """
    Build undirected similarity edges within a single graph (train or test).
    """
    # Only numeric feature columns
    feature_cols = [c for c in df.columns if c not in ['target', 'node_id', 'sample_id']]
    
    # Fill NaNs if any
    sim_matrix = cosine_similarity(df[feature_cols].fillna(0))
    
    edges_local = []
    for i in range(sim_matrix.shape[0]):
        # get top_k neighbors, exclude self
        top_idx = np.argsort(sim_matrix[i])[-(top_k + 1):-1]
        for j in top_idx:
            edges_local.append({
                'src': df.loc[i, 'node_id'],
                'dst': df.loc[j, 'node_id'],
                'edge_type': 'similarity'
            })
    return edges_local


In [184]:
'''
# Training graph
build_similarity_edges(train_df, top_k=10)

# Test graph (separate, unseen during training)
build_similarity_edges(test_df, top_k=10)

train_df["sample_id"] = expr_cfRNA_scaled.index
test_df["sample_id"] = expr_placenta_scaled.index


'''

'\n# Training graph\nbuild_similarity_edges(train_df, top_k=10)\n\n# Test graph (separate, unseen during training)\nbuild_similarity_edges(test_df, top_k=10)\n\ntrain_df["sample_id"] = expr_cfRNA_scaled.index\ntest_df["sample_id"] = expr_placenta_scaled.index\n\n\n'

In [185]:
def build_extra_edges(meta_df, df, meta_col, edge_type_name, node_to_sample):
    """
    Build undirected edges based on a metadata column (ancestry or geo accession).
    Only connects nodes that share the same value in `meta_col`.
    """
    edges_local = []

    # Map Sample_title -> meta value
    meta_map = meta_df.set_index('Sample_title')[meta_col].to_dict()

    for i, node_i in enumerate(df['node_id']):
        sample_i = node_to_sample[node_i]
        value_i = meta_map.get(sample_i)
        if value_i is None:
            continue

        for j in range(i + 1, len(df)):
            node_j = df['node_id'].iloc[j]
            sample_j = node_to_sample[node_j]
            value_j = meta_map.get(sample_j)
            if value_j is None:
                continue

            if value_i == value_j:
                edges_local.append({
                    'src': node_i,
                    'dst': node_j,
                    'edge_type': edge_type_name
                })

    return edges_local


In [186]:
# -----------------------------
# 3. Build edges for train/test
# -----------------------------
edges = []

# -----------------------------
# Similarity edges (train and test separately)
# -----------------------------
edges.extend(build_similarity_edges(train_df, top_k=10))
edges.extend(build_similarity_edges(test_df, top_k=10))

# -----------------------------
# Mapping node_id → Sample_title
# -----------------------------
node_to_sample_train = dict(zip(train_df['node_id'], train_df['sample_id']))
node_to_sample_test = dict(zip(test_df['node_id'], test_df['sample_id']))

# -----------------------------
# Geo accession edges (train - cfRNA)
# -----------------------------
train_geo_edges = build_extra_edges(
    meta_df=meta_cfRNA,
    df=train_df,
    meta_col='Sample_geo_accession',
    edge_type_name='geo_accession',
    node_to_sample=node_to_sample_train
)
edges.extend(train_geo_edges)
print(f"✅ {len(train_geo_edges)} geo accession edges added for cfRNA nodes")

# -----------------------------
# Ancestry edges (test - placenta)
# Groups samples by maternal ancestry
# -----------------------------
test_ancestry_edges = build_extra_edges(
    meta_df=meta_placenta,
    df=test_df,
    meta_col='maternal_ancestry_self_report',
    edge_type_name='ancestry',
    node_to_sample=node_to_sample_test
)
edges.extend(test_ancestry_edges)
print(f"✅ {len(test_ancestry_edges)} ancestry edges added for placenta nodes")

# -----------------------------
# Save edges to CSV
# -----------------------------
edges_df = pd.DataFrame(edges)
edges_df.to_csv(os.path.join(data_dir, 'graph_edges.csv'), index=False)
print("✅ graph_edges.csv created successfully")
print(edges_df['edge_type'].value_counts())

✅ 0 geo accession edges added for cfRNA nodes
✅ 2601 ancestry edges added for placenta nodes
✅ graph_edges.csv created successfully
edge_type
similarity    3320
ancestry      2601
Name: count, dtype: int64
