In [88]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import sys
from sklearn.model_selection import train_test_split
sys.path.append("../utils")
from utils import load_data

In [89]:
# Load the data 

data_train, data_test , targets_train= load_data(raw=True)

In [178]:

from pathlib import Path

project_root = Path().resolve().parent.parent
sys.path.append(str(project_root))
# Load the data
data_path = project_root / 'data' 
cgc_file = pd.read_csv(data_path / 'CGC.csv')

cgc_genes = cgc_file['Gene Symbol']
print(cgc_genes)

# Extract CGC genes
cgc_genes = np.array(cgc_genes)  # Convert to numpy array
cgc_genes = cgc_genes.tolist()   # Convert to list
cgc_genes = set(cgc_genes)       # Convert to set for unique values

0        A1CF
1        ABI1
2        ABL1
3        ABL2
4       ACKR3
        ...  
718    ZNF429
719    ZNF479
720    ZNF521
721     ZNRF3
722     ZRSR2
Name: Gene Symbol, Length: 723, dtype: object


In [191]:
import pandas as pd 

# Filter for Breast cell lines in the train_targets dataframe
data_train.iloc[:, 0] = data_train.iloc[:, 0].astype(str).str.strip() 

Breast_Cell_line = targets_train[targets_train['tissue'].str.contains('Breast', case=False)]
Breast_Cell_lines =Breast_Cell_line ["sample"].unique()

# all the genes 
Breast_cancer = data_train[data_train.iloc[:, 0].isin(Breast_Cell_lines)]
print (Breast_cancer.shape) 
print (Breast_Cell_line.shape)
# Extract the gene names (columns) from breast cancer data (excluding 'Unnamed: 0')
gene_columns = Breast_cancer.columns[1:]  # Start from the second column to skip 'Unnamed: 0'
gene_columns = np.array(gene_columns)
gene_columns = gene_columns.tolist()
gene_columns = set(gene_columns)

# Find common genes between CGC genes and breast cancer dataset genes
common_genes = list(cgc_genes & gene_columns)

# Filter the breast cancer dataset to include only the common genes
filtered_breast_cancer = Breast_cancer[['Unnamed: 0'] + common_genes]  # Keep 'Unnamed: 0' as identifier

# Print the number of common genes
print(f"Number of common genes: {len(common_genes)}")
print (common_genes)
print (filtered_breast_cancer)


(33, 19921)
(33, 3)
Number of common genes: 704
['ETV4', 'LARP4B', 'SUFU', 'PALB2', 'POT1', 'FOXO1', 'CCND2', 'FLI1', 'COL1A1', 'GMPS', 'SIX1', 'GNAQ', 'FBLN2', 'RAP1GDS1', 'PRDM1', 'MLF1', 'SMAD3', 'RPN1', 'MTCP1', 'S100A7', 'MAP2K1', 'TNC', 'CRTC1', 'PTPN6', 'ZNF331', 'IRS4', 'FOXL2', 'NSD3', 'TCEA1', 'ARID2', 'TMEM127', 'PRCC', 'CASP3', 'PRRX1', 'LATS2', 'SOX2', 'CTCF', 'CRTC3', 'TRIM24', 'NBN', 'CANT1', 'JUN', 'NRAS', 'MYD88', 'ABL1', 'AXIN1', 'WNK2', 'RB1', 'EPHA7', 'IL7R', 'ZBTB16', 'RABEP1', 'ELK4', 'TAL1', 'DCC', 'HOXD11', 'IL2', 'ISX', 'FES', 'CHST11', 'DDX5', 'CDH17', 'KTN1', 'NUTM2D', 'STAG1', 'FLT3', 'STRN', 'PIM1', 'RAD17', 'IKZF1', 'RAD51B', 'AR', 'NKX2-1', 'PAX3', 'KAT6A', 'CAMTA1', 'FANCE', 'BMPR1A', 'ETV6', 'IGF2BP2', 'MAML2', 'SLC34A2', 'MSN', 'KNL1', 'LMNA', 'OLIG2', 'HLA-A', 'SUZ12', 'WAS', 'PAX8', 'CYLD', 'WDCP', 'PPP2R1A', 'SMARCA4', 'CHCHD7', 'NFATC2', 'WIF1', 'SDHA', 'APC', 'HMGA2', 'PMS1', 'SDHC', 'YWHAE', 'CCR7', 'BCL7A', 'MECOM', 'CDK4', 'ECT2L', 'MYB', 'RSPO

In [180]:
X = filtered_breast_cancer.drop(columns=["Unnamed: 0"])
Y = Breast_Cell_line["AAC"]
# Split the data 
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

In [181]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import spearmanr
from sklearn.neural_network import MLPRegressor

In [182]:
# Train a Random Forest model
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest.fit(X_train, y_train)

# Predict on validation data
y_val_pred = random_forest.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
spearman_non_encoded, p_value_non_encoded = spearmanr(y_val, y_val_pred)

print("Validation Mean Squared Error (MSE):", mse)
print("Validation R² Score:", r2)
print("Spearman's Correlation (ρ):", spearman_non_encoded)

Validation Mean Squared Error (MSE): 0.002056835623210622
Validation R² Score: 0.016774132089205196
Spearman's Correlation (ρ): 0.7142857142857144


In [183]:

# Train Linear Regression model 
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
spearman_non_encoded, p_value_non_encoded = spearmanr(y_val, y_val_pred)

print("--- Non Encoded Features ---")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R²):", r2)
print("Spearman's Correlation (ρ):", spearman_non_encoded)
print("P-value (Spearman):", p_value_non_encoded)

--- Non Encoded Features ---
Mean Squared Error (MSE): 0.002056835623210622
R-squared (R²): 0.016774132089205196
Spearman's Correlation (ρ): 0.7142857142857144
P-value (Spearman): 0.0713435614675376


In [184]:
# Train an MLP model
mlp_model = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # Two hidden layers with 100 and 50 neurons
    activation='relu',
    solver='adam',
    learning_rate='adaptive',
    max_iter=1000,
    random_state=42
)
mlp_model.fit(X_train, y_train)

# Predict on validation data

y_val_pred = mlp_model.predict(X_val)

# Evaluate the model
mse = mean_squared_error(y_val, y_val_pred)
r2 = r2_score(y_val, y_val_pred)
spearman_non_encoded, p_value_non_encoded = spearmanr(y_val, y_val_pred)

print("Mean Squared Error (MSE):", mse)
print("R-squared (R²):", r2)
print("Spearman's Correlation (ρ):", spearman_non_encoded)
print("P-value (Spearman):", p_value_non_encoded)

Mean Squared Error (MSE): 2361.304112081336
R-squared (R²): -1128769.4563276889
Spearman's Correlation (ρ): 0.21428571428571433
P-value (Spearman): 0.6445115810207203


In [185]:
#Extract unique tissues
unique_tissues = targets_train["tissue"].unique().tolist()
print(unique_tissues)
print (len(unique_tissues))

['Prostate', 'Esophagus/Stomach', 'Bladder/Urinary Tract', 'CNS/Brain', 'Lymphoid', 'Kidney', 'Thyroid', 'Soft Tissue', 'Skin', 'Lung', 'Bone', 'Ovary/Fallopian Tube', 'Pleura', 'Myeloid', 'Uterus', 'Pancreas', 'Breast', 'Head and Neck', 'Bowel', 'Peripheral Nervous System', 'Cervix', 'Liver', 'Biliary Tract', 'Other', 'Ampulla of Vater']
25


In [186]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler

In [194]:
# compute all # computing all the spearmanr value for each tissue with CGC
 

tissue_results = {}
model_results = {}

for tissue in unique_tissues:

    data_train.iloc[:, 0] = data_train.iloc[:, 0].astype(str).str.strip() 

    # Filter rows in targets_train for the current tissue
    tissue_cell_lines = targets_train[targets_train['tissue'].str.contains(tissue, case=False)]
    tissue_samples = tissue_cell_lines["sample"].unique()

    # Filter data_train for rows corresponding to the current tissue's samples
    tissue_data = data_train[data_train.iloc[:, 0].isin(tissue_samples)]

    # Extract the gene names (columns) from the tissue dataset
    gene_columns = tissue_data.columns[1:]  # Skip the first column (e.g., 'Unnamed: 0')
    gene_columns = set(np.array(gene_columns).tolist())  # Convert to a set

    # Find common genes between CGC genes and the tissue dataset genes
    common_genes = list(cgc_genes & gene_columns)

    # Filter the tissue dataset to include only the common genes
    filtered_tissue_data = tissue_data[['Unnamed: 0'] + common_genes]  # Keep the identifier column

    # Preprocess data 
    X =  filtered_tissue_data.drop(columns=["Unnamed: 0"])
    Y = tissue_cell_lines["AAC"]
    
    if len(X) < 2:
        print(f"Skipping tissue '{tissue}' due to insufficient samples: {len(X)}")
        continue

    # Split the data 
    X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)


    # Scale features for MLP regression
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Train and evaluate models
    models = {
        "RandomForest": RandomForestRegressor(random_state=42),
        "LinearRegression": LinearRegression(),
        "MLPRegression": MLPRegressor(random_state=42, max_iter=500)
    }

    results = {}
    for model_name, model in models.items():
        # Train the model
        if model_name == "MLPRegression":
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_val_scaled)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_val)

        y_val = np.array(y_val).flatten()  # If it's already a NumPy array, this won't change it
        y_pred = np.array(y_pred).flatten()  # Ensure predictions are also flat

        # Check for length mismatch
        if len(y_val) != len(y_pred):
            raise ValueError(f"Length mismatch: y_val({len(y_val)}) vs y_pred({len(y_pred)})")

        # Compute Spearman correlation
        spearman_corr, p_value = spearmanr(y_val, y_pred)
        results[model_name] = spearman_corr

    # Print results for the tissue
    print(f"Tissue: {tissue}")
    for model_name, correlation in results.items():
        print(f"{model_name} Spearman Correlation: {correlation:.4f}")
    print()

# Summary of results
print("Model performance across tissues:")
for tissue, results in model_results.items():
    print(f"Tissue: {tissue}")
    for model_name, correlation in results.items():
        print(f"  {model_name}: Spearman Correlation = {correlation:.4f}")



Tissue: Prostate
RandomForest Spearman Correlation: -1.0000
LinearRegression Spearman Correlation: -1.0000
MLPRegression Spearman Correlation: -1.0000

Tissue: Esophagus/Stomach
RandomForest Spearman Correlation: 0.1182
LinearRegression Spearman Correlation: 0.0091
MLPRegression Spearman Correlation: 0.0909

Tissue: Bladder/Urinary Tract
RandomForest Spearman Correlation: -0.3000
LinearRegression Spearman Correlation: 0.1000
MLPRegression Spearman Correlation: -0.3000

Tissue: CNS/Brain
RandomForest Spearman Correlation: 0.3000
LinearRegression Spearman Correlation: 0.5000
MLPRegression Spearman Correlation: -0.1000

Tissue: Lymphoid
RandomForest Spearman Correlation: 0.5000
LinearRegression Spearman Correlation: 0.2807
MLPRegression Spearman Correlation: -0.3070

Tissue: Kidney
RandomForest Spearman Correlation: -0.2000
LinearRegression Spearman Correlation: -0.5000
MLPRegression Spearman Correlation: 0.2000

Tissue: Thyroid
RandomForest Spearman Correlation: 0.5000
LinearRegression S

In [211]:
# computing all the spearmanr value for each tissue whitout CGC 

# Iterate over all unique tissues
unique_tissues = targets_train["tissue"].unique().tolist()

for tissue in unique_tissues:
    print(f"Processing tissue: {tissue}")
    
    # Filter for specific tissue in the train_targets dataframe
    data_train.iloc[:, 0] = data_train.iloc[:, 0].astype(str).str.strip()
    Tissue_Cell_line = targets_train[targets_train['tissue'].str.contains(tissue, case=False)]
    Tissue_Cell_lines = Tissue_Cell_line["sample"].unique()

    # Filter data for specific tissue cell lines
    Tissue_cancer = data_train[data_train.iloc[:, 0].isin(Tissue_Cell_lines)]

    # Prepare features (X) and target (Y)
    X = Tissue_cancer.drop(columns=["Unnamed: 0"])
    Y = Tissue_Cell_line["AAC"]

    if len(X) < 2:
        print(f"Skipping tissue '{tissue}' due to insufficient samples: {len(X)}")
        continue

    # Split the data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=42)

    # Train and evaluate Random Forest Regressor
    rf_model = RandomForestRegressor(random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_val)

    # Compute Spearman correlation for Random Forest
    spearman_corr_rf, p_value_rf = spearmanr(y_val, y_pred_rf)
    print(f"{tissue} - Random Forest Spearman correlation: {spearman_corr_rf}, p-value: {p_value_rf}")

    # Train and evaluate Linear Regression model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    y_pred_lr = lr_model.predict(X_val)

    # Compute Spearman correlation for Linear Regression
    spearman_corr_lr, p_value_lr = spearmanr(y_val, y_pred_lr)
    print(f"{tissue} - Linear Regression Spearman correlation: {spearman_corr_lr}, p-value: {p_value_lr}")


Processing tissue: Prostate
Prostate - Random Forest Spearman correlation: -0.9999999999999999, p-value: nan
Prostate - Linear Regression Spearman correlation: 0.9999999999999999, p-value: nan
Processing tissue: Esophagus/Stomach
Esophagus/Stomach - Random Forest Spearman correlation: 0.06363636363636364, p-value: 0.8525390725425217
Esophagus/Stomach - Linear Regression Spearman correlation: 0.20909090909090913, p-value: 0.5372209352113229
Processing tissue: Bladder/Urinary Tract
Bladder/Urinary Tract - Random Forest Spearman correlation: -0.39999999999999997, p-value: 0.5046315754686911
Bladder/Urinary Tract - Linear Regression Spearman correlation: 0.09999999999999999, p-value: 0.8728885715695383
Processing tissue: CNS/Brain
CNS/Brain - Random Forest Spearman correlation: 0.16666666666666669, p-value: 0.668231040071504
CNS/Brain - Linear Regression Spearman correlation: 0.5666666666666667, p-value: 0.11163298761149114
Processing tissue: Lymphoid
Lymphoid - Random Forest Spearman corr