# AIM

Trim down the current dataset (breast cancer; multi-omics: transcriptomics & proteomics; GDSC1; isoSMILES converted to Morgan fingerprints) in order to:

1. Filter the dataset down to a single omics type at a time (1st: transcriptomics, 2nd: proteomics, 3rd: genomics)
2. Perform deep learning on each omics type separately to assess its predictive value
3. Attempt with other target values, e.g., **AUC**, RMSE, Z_SCORE  --> edit the pre-processing dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the combined breast-cancer table (transcriptomics + proteomics + GDSC1
# response + Morgan fingerprint bits) and discard the 'Unnamed: 0' column that
# the CSV carries.
breast_cancer_multi_omics_gdsc1_iso_m = pd.read_csv(
    "../Dataset/breast_cancer_transcript_proteo_multi_omics_gdsc1_iso_morgan.csv"
).drop(columns=['Unnamed: 0'])

In [3]:
# Display the loaded table as the cell output (truncated preview below).
breast_cancer_multi_omics_gdsc1_iso_m

# 4133 rows × 19697 columns

Unnamed: 0,DRUG_ID,DRUG_NAME,CCLE_Name,COSMIC_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),...,247,248,249,250,251,252,253,254,255,LN_IC50
0,133,Doxorubicin,AU565_BREAST,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,...,0,0,1,0,1,0,1,0,0,-2.020833
1,134,Etoposide,AU565_BREAST,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,...,0,0,1,0,1,0,0,0,1,1.047349
2,135,Gemcitabine,AU565_BREAST,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,...,0,0,0,1,1,0,0,0,0,-3.499537
3,136,Mitomycin-C,AU565_BREAST,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,...,1,0,0,0,1,0,0,1,0,-0.633531
4,140,Vinorelbine,AU565_BREAST,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,...,1,0,1,1,1,0,0,0,0,-5.551392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4128,1377,Afatinib,ZR7530_BREAST,909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,1,0,0,0,1,0,0,0,0,-0.335696
4129,1378,Bleomycin (50 uM),ZR7530_BREAST,909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,0,0,0,1,1,1,1,1,1,6.689241
4130,1494,SN-38,ZR7530_BREAST,909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,0,1,0,1,0,0,0,0,1,-0.448730
4131,1495,Olaparib,ZR7530_BREAST,909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,...,0,0,0,0,1,0,0,0,1,5.755940


-----

# DL with only transcriptomics
## Remove proteomics data

In [4]:
# Proteomics data (CCLE). The file's first column has no header, so pandas
# names it 'Unnamed: 0'; it is renamed to 'CCLE_Name' here.
ccle_p = pd.read_csv("../Dataset/Raw_files/CCLE_Proteomics.csv").rename(
    columns={"Unnamed: 0": "CCLE_Name"}
)
ccle_p

# 899 rows × 215 columns

Unnamed: 0,CCLE_Name,14-3-3_beta,14-3-3_epsilon_Caution,14-3-3_zeta,4E-BP1,4E-BP1_pS65,4E-BP1_pT37_T46,4E-BP1_pT70,53BP1,A-Raf_pS299_Caution,...,Tuberin_pT1462,VAV1_Caution,VEGFR2,VHL_Caution,XBP1_Caution,XRCC1_Caution,YAP_Caution,YAP_pS127_Caution,YB-1,YB-1_pS102
0,DMS53_LUNG,-0.104888,0.060414,0.309068,-0.075506,0.230359,0.198304,-0.030541,0.455889,0.090484,...,-0.099433,-0.486715,-1.147858,0.133876,-0.075812,-0.144388,-1.090303,-2.109324,0.178104,0.246541
1,SW1116_LARGE_INTESTINE,0.358504,-0.180291,-0.041237,-0.286629,-0.877406,-1.026948,-0.462761,-0.011197,0.605330,...,-0.109777,0.349330,0.770148,0.984297,-0.168138,-0.004905,0.189294,-0.283593,0.255972,-0.121134
2,NCIH1694_LUNG,0.028738,0.071902,-0.094847,0.285069,1.321551,0.620703,-0.439484,0.195007,0.036221,...,0.154344,-0.478189,-1.185530,1.273013,-0.240413,0.476633,-1.367465,-2.525695,-0.137880,-0.451282
3,P3HR1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.120039,-0.066802,-0.128007,-0.552081,-0.292428,-1.415935,-0.138858,-0.066122,-0.346564,...,0.040106,5.923830,-3.893832,-2.499188,0.632758,0.025639,-1.189180,-3.056863,0.025997,-0.465205
4,HUT78_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,-0.268997,-0.060281,-0.137881,-0.398729,-0.095622,-0.533905,0.054245,-0.573022,-0.162968,...,-0.466919,5.475880,-0.561973,-0.500953,-0.261494,0.358679,-0.951686,-3.247388,-0.151424,-0.145426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,UO31_KIDNEY,0.043231,0.093158,0.105461,0.011264,-0.264052,-0.079559,-0.000355,-0.308669,-0.025941,...,-0.109095,0.137608,0.618270,-0.541645,0.263695,-0.092934,0.433857,0.327542,0.108921,-0.197684
895,SF268_CENTRAL_NERVOUS_SYSTEM,-0.034597,0.065964,-0.329024,-0.540973,0.492108,0.137051,-0.003109,-0.276884,-0.390067,...,-0.339392,-0.054790,-0.271650,-1.735560,-0.397684,-0.166362,2.550478,3.225039,0.136907,0.426637
896,SF539_CENTRAL_NERVOUS_SYSTEM,-0.250998,0.420490,-0.334213,-0.156368,-0.219208,-0.187704,0.240884,0.352163,0.052992,...,0.094319,0.346651,-0.367292,-1.851276,0.069242,0.165163,-0.174660,-0.811089,0.067923,0.237027
897,SNB75_CENTRAL_NERVOUS_SYSTEM,-0.139833,0.194831,-0.135708,-0.434248,0.208941,-0.071338,0.145042,-0.457499,0.032008,...,0.159241,-0.140213,0.436948,-1.476417,-0.390487,-0.163100,0.407310,0.390911,-0.034470,0.153921


In [5]:
# Keep only transcriptomics: remove every column whose name also appears in the
# proteomics table. NOTE: 'CCLE_Name' is one of those names, so the cell-line
# name column is removed here too. errors='ignore' skips any proteomics column
# that is not present in the main table instead of raising.
breast_cancer_transcript_omics_gdsc1_isoM = breast_cancer_multi_omics_gdsc1_iso_m.drop(
    columns=list(ccle_p.columns),
    errors='ignore',
)

breast_cancer_transcript_omics_gdsc1_isoM
# 4133 rows × 19482 columns

# Sanity check on the column count:
# 19697 columns - 215 columns = 19482 columns (correct)

Unnamed: 0,DRUG_ID,DRUG_NAME,COSMIC_ID,TSPAN6 (7105),TNMD (64102),DPM1 (8813),SCYL3 (57147),C1orf112 (55732),FGR (2268),CFH (3075),...,247,248,249,250,251,252,253,254,255,LN_IC50
0,133,Doxorubicin,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,0.084064,...,0,0,1,0,1,0,1,0,0,-2.020833
1,134,Etoposide,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,0.084064,...,0,0,1,0,1,0,0,0,1,1.047349
2,135,Gemcitabine,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,0.084064,...,0,0,0,1,1,0,0,0,0,-3.499537
3,136,Mitomycin-C,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,0.084064,...,1,0,0,0,1,0,0,1,0,-0.633531
4,140,Vinorelbine,910704,3.428946,0.0,6.869871,3.500802,4.319040,0.111031,0.084064,...,1,0,1,1,1,0,0,0,0,-5.551392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4128,1377,Afatinib,909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,0.042644,...,1,0,0,0,1,0,0,0,0,-0.335696
4129,1378,Bleomycin (50 uM),909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,0.042644,...,0,0,0,1,1,1,1,1,1,6.689241
4130,1494,SN-38,909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,0.042644,...,0,1,0,1,0,0,0,0,1,-0.448730
4131,1495,Olaparib,909907,3.472488,0.0,5.959306,3.878725,3.646163,0.000000,0.042644,...,0,0,0,0,1,0,0,0,1,5.755940


## Perform the deep learning (only transcriptomics)

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr
from sklearn.metrics import r2_score

RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

ImportError: initialization failed

In [None]:
# Target variable (y): the LN_IC50 column.
y = breast_cancer_transcript_omics_gdsc1_isoM['LN_IC50']

# Features (X): everything except the identifier columns and the target itself.
X = breast_cancer_transcript_omics_gdsc1_isoM.drop(
    columns=['DRUG_ID', 'DRUG_NAME', 'COSMIC_ID', 'LN_IC50'],
)

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is by row (one row per drug / cell-line pair in the
# table above), so the same cell line can presumably land in both train and
# test - confirm this is the intended evaluation setting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Standardize features to zero mean / unit variance. The scaler is fitted on
# the training rows only and then applied unchanged to both splits, so no
# statistics from the test set are used.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# remove for web development

# Hyperparameter tuning
# Search space for the two hidden layers: candidate widths and activations.
neurons_list_layer1, activation_list_layer1 = [64, 128, 256], ['relu', 'tanh', 'sigmoid']
neurons_list_layer2, activation_list_layer2 = [32, 64, 128], ['relu', 'tanh', 'sigmoid']

In [None]:
# remove for web development

# Running best of the search: lowest RMSE seen so far (starts at +inf so the
# first configuration always wins) and the configuration that produced it.
best_rmse, best_config = float('inf'), None

In [None]:
# remove for web development

### DO NOT RERUN
# Exhaustive grid search over the width and activation of both hidden layers.
# One model is trained per combination and the combination with the lowest
# RMSE is remembered in best_config / best_rmse.
#
# NOTE(review): configurations are ranked by RMSE on the *test* set, so test
# metrics reported afterwards for the selected configuration are optimistically
# biased. Consider ranking on a validation split instead.
from itertools import product

from tensorflow.keras.backend import clear_session

# product() iterates in the same order as the equivalent nested loops
# (layer-1 neurons outermost, layer-2 activation innermost).
search_space = product(
    neurons_list_layer1,
    activation_list_layer1,
    neurons_list_layer2,
    activation_list_layer2,
)

for neurons_layer1, activation_layer1, neurons_layer2, activation_layer2 in search_space:
    print(f"Training model with {neurons_layer1} neurons and {activation_layer1} activation function for layer 1...")
    print(f"                   and {neurons_layer2} neurons and {activation_layer2} activation function for layer 2...")

    # Release Keras global state left behind by the previous model so memory
    # does not keep growing while many models are built in one process.
    clear_session()

    # Build the deep learning model
    model = Sequential()
    model.add(Dense(neurons_layer1, activation=activation_layer1, input_shape=(X_train.shape[1],)))
    model.add(Dense(neurons_layer2, activation=activation_layer2))
    model.add(Dense(1))

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Early stopping: stop after 10 epochs without val_loss improvement and
    # roll back to the best weights seen.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model with early stopping
    model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stopping], verbose=0)

    # Predict on the test set
    y_pred = model.predict(X_test_scaled)

    # Convert to 1-D numpy arrays (ravel also handles a single-row test set,
    # where squeeze would produce a 0-d array).
    y_test_np = np.array(y_test)
    y_pred_np = np.ravel(y_pred)

    # Remove NaN values from predictions
    nan_indices = np.isnan(y_pred_np)
    if nan_indices.all():
        # Training diverged: there is nothing to score, and the metric call
        # would raise on empty arrays. Skip this configuration.
        print("All predictions are NaN; skipping this configuration.")
        continue
    y_test_np = y_test_np[~nan_indices]
    y_pred_np = y_pred_np[~nan_indices]

    # Evaluation metrics
    rmse = np.sqrt(mean_squared_error(y_test_np, y_pred_np))

    # Print RMSE for current configuration
    print(f"RMSE for {neurons_layer1} neurons and {activation_layer1} activation function for layer 1,")
    print(f"            and {neurons_layer2} neurons and {activation_layer2} activation function for layer 2: {rmse}")

    # Check if current configuration is the best
    if rmse < best_rmse:
        best_rmse = rmse
        best_config = (neurons_layer1, activation_layer1, neurons_layer2, activation_layer2)

print("Best configuration found:", best_config)
print("Best RMSE:", best_rmse)

In [None]:
# Build the final deep learning model: two ReLU hidden layers (64, then 128
# units) followed by a single-unit output layer for the regression target.
# The configuration is hard-coded - presumably the best one reported by the
# grid search; verify against its output.
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(1),
])

In [None]:
# Build the model and print its layer-by-layer summary.
model.build()
model.summary()

In [None]:
# Compile the model: Adam optimizer, mean squared error as the training loss.
model.compile(
    loss='mean_squared_error',
    optimizer='adam',
)

In [None]:
# Early stopping: watch the validation loss, give up after 10 epochs without
# improvement, and restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Train the model with early stopping. 20% of the training rows are set aside
# by Keras as validation data (the source of 'val_loss'); the returned History
# object is kept in `history`.
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Predict on the (scaled) test set with the trained model
y_pred = model.predict(X_test_scaled)

In [None]:
# Bring targets and predictions into plain numpy arrays; squeeze removes the
# size-1 axis coming from the single-unit output layer.
y_pred_np = np.squeeze(y_pred)
y_test_np = np.array(y_test)

# Drop every position where the prediction is NaN (from both arrays, so they
# stay aligned).
nan_indices = np.isnan(y_pred_np)
y_test_np, y_pred_np = y_test_np[~nan_indices], y_pred_np[~nan_indices]

# Evaluation metrics (RMSE is derived from the MSE computed once)
mse = mean_squared_error(y_test_np, y_pred_np)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test_np, y_pred_np)
pearson_corr, _ = pearsonr(y_test_np, y_pred_np)
r2 = r2_score(y_test_np, y_pred_np)

# Report, in the same order as before
for label, value in (
    ("Root Mean Squared Error (RMSE):", rmse),
    ("Mean Absolute Error (MAE):", mae),
    ("Mean Squared Error (MSE):", mse),
    ("Pearson Correlation:", pearson_corr),
    ("R-squared (R2):", r2),
):
    print(label, value)

In [None]:
# Assemble one row per test sample: its identifiers plus the predicted LN_IC50.
#
# Identifiers are looked up in the full multi-omics table because 'CCLE_Name'
# was removed from the transcriptomics-only table together with the proteomics
# columns. X_test keeps the row labels of the original table, so .loc selects
# exactly the test rows, in the same order as the predictions.
test_identifiers = breast_cancer_multi_omics_gdsc1_iso_m.loc[
    X_test.index, ['DRUG_ID', 'DRUG_NAME', 'COSMIC_ID', 'CCLE_Name']
]

# Reset the index so the result is numbered 0..n-1, and expose the cell-line
# name under the upper-case column name used in the output.
prediction_df = test_identifiers.rename(columns={'CCLE_Name': 'CCLE_NAME'}).reset_index(drop=True)

# Flatten the y_pred array to make it one-dimensional
prediction_df['PRED_LN_IC50'] = y_pred.flatten()

In [None]:
# Display the prediction table as the cell output.
prediction_df

-----

# DL with only proteomics
## Remove transcriptomics data

-----

# DL with only genomics