In [1]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score
import numpy as np

In [4]:
# Folder holding the precomputed feature matrices (one .npy file per feature set).
data_path = "E:/RA/SMILES_Morgan_kmers_CIKM/Dataset/"

# Feature set to evaluate. Keep exactly ONE `dataset_name` line active.
# NOTE(review): the saved notebook had every option commented out, so later cells
# only ran because `frequency_vector` was still alive in the kernel. The first
# listed option is activated here so the notebook survives Restart & Run All;
# confirm it is the dataset that produced the recorded results.
dataset_name = "Morgan_Fingerprint_Mannual_6897_vectors"
# dataset_name = "Morgan_plus_kmers_Fingerprint_Mannual_6897_vectors"
# dataset_name = "Morgan_plus_daylight_Fingerprint_Mannual_6897_vectors"
# dataset_name = "Morgan_plus_kmers_plus_daylight_Fingerprint_Mannual_6897_vectors"
# dataset_name = "Daylight_Fingerprint_Mannual_6897_vectors"

# Feature matrix used as the regression input in the cells below.
frequency_vector = np.load(data_path + dataset_name + ".npy")
# Regression targets (file name indicates ALOGPS solubility attributes).
smile_strings_label = np.load("E:/RA/SMILES_ICLR_Drug_Analysis/Dataset/nano-drugbank-master/SMILE_Strings_6897_attributes_solubility_ALOGPS.npy")

# Fail early, with a readable message, if features and labels are misaligned.
assert len(frequency_vector) == len(smile_strings_label), (
    "feature matrix and label array must have the same number of rows"
)


In [5]:
# Sanity check: display the first entry of the loaded label array.
smile_strings_label[0]

0.0464

In [34]:
# Sanity check: display the dimensions of the feature matrix before modelling.
frequency_vector.shape

(6897, 2048)

In [32]:


# Inputs: `frequency_vector` is the per-molecule feature matrix and
# `smile_strings_label` the matching solubility labels (both loaded earlier).
X, y = np.array(frequency_vector), np.array(smile_strings_label)

# Hold out 20% of the samples for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build each regressor and fit it on the training split.
# scikit-learn's fit() returns the estimator itself, so each name is bound to
# the fitted model.
linear_reg = LinearRegression().fit(X_train, y_train)
ridge_reg = Ridge(alpha=1).fit(X_train, y_train)
lasso_reg = Lasso(alpha=0.1).fit(X_train, y_train)
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
gb_reg = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out labels with every fitted model.
y_pred_linear_reg = linear_reg.predict(X_test)
y_pred_ridge_reg = ridge_reg.predict(X_test)
y_pred_lasso_reg = lasso_reg.predict(X_test)
y_pred_rf_reg = rf_reg.predict(X_test)
y_pred_gb_reg = gb_reg.predict(X_test)




In [33]:
# Compute evaluation metrics for each model.
# Test-set predictions in the fixed model order used by every unpacking below:
# linear, ridge, lasso, random forest, gradient boosting.
test_predictions = (
    y_pred_linear_reg,
    y_pred_ridge_reg,
    y_pred_lasso_reg,
    y_pred_rf_reg,
    y_pred_gb_reg,
)


def _score_each_model(metric):
    """Return ``metric(y_test, y_pred)`` for every model's predictions, in model order."""
    return [metric(y_test, y_pred) for y_pred in test_predictions]


def _print_model_report(title, mae, mse, rmse, r2, evs):
    """Print one model's heading followed by its five test-set metrics."""
    print(title)
    print("MAE:", mae)
    print("MSE:", mse)
    print("RMSE:", rmse)
    print("R^2:", r2)
    print("EVS:", evs)


linear_reg_mae, ridge_reg_mae, lasso_reg_mae, rf_reg_mae, gb_reg_mae = _score_each_model(
    mean_absolute_error
)
linear_reg_mse, ridge_reg_mse, lasso_reg_mse, rf_reg_mse, gb_reg_mse = _score_each_model(
    mean_squared_error
)
# RMSE is the square root of the MSE computed just above; reuse it instead of
# calling mean_squared_error a second time for every model.
linear_reg_rmse, ridge_reg_rmse, lasso_reg_rmse, rf_reg_rmse, gb_reg_rmse = (
    np.sqrt(mse)
    for mse in (linear_reg_mse, ridge_reg_mse, lasso_reg_mse, rf_reg_mse, gb_reg_mse)
)
linear_reg_r2, ridge_reg_r2, lasso_reg_r2, rf_reg_r2, gb_reg_r2 = _score_each_model(r2_score)
linear_reg_evs, ridge_reg_evs, lasso_reg_evs, rf_reg_evs, gb_reg_evs = _score_each_model(
    explained_variance_score
)

# Print the evaluation metrics for each model.
_print_model_report(
    "Linear Regression:",
    linear_reg_mae, linear_reg_mse, linear_reg_rmse, linear_reg_r2, linear_reg_evs,
)
_print_model_report(
    "Ridge Regression:",
    ridge_reg_mae, ridge_reg_mse, ridge_reg_rmse, ridge_reg_r2, ridge_reg_evs,
)
_print_model_report(
    "Lasso Regression:",
    lasso_reg_mae, lasso_reg_mse, lasso_reg_rmse, lasso_reg_r2, lasso_reg_evs,
)
_print_model_report(
    "Random Forest Regression:",
    rf_reg_mae, rf_reg_mse, rf_reg_rmse, rf_reg_r2, rf_reg_evs,
)
_print_model_report(
    "Gradient Boosting Regression:",
    gb_reg_mae, gb_reg_mse, gb_reg_rmse, gb_reg_r2, gb_reg_evs,
)


Linear Regression:
MAE: 349389460292.8494
MSE: 1.6846073299297187e+26
RMSE: 12979242389021.475
R^2: -9.961620946077438e+21
EVS: -9.954402380174649e+21
Ridge Regression:
MAE: 58.48712640903566
MSE: 15272.787683684397
RMSE: 123.58312054517963
R^2: 0.09687130649531661
EVS: 0.09757895040543063
Lasso Regression:
MAE: 55.06291457239276
MSE: 15029.781579719694
RMSE: 122.59600964028027
R^2: 0.11124103321009027
EVS: 0.11185955770198897
Random Forest Regression:
MAE: 45.885544691114646
MSE: 11480.049886966319
RMSE: 107.14499468928224
R^2: 0.3211480005801194
EVS: 0.3310673271063108
Gradient Boosting Regression:
MAE: 43.36137085637197
MSE: 13154.618747389743
RMSE: 114.69358633938404
R^2: 0.2221253978686677
EVS: 0.22240146809745087
