In [26]:
# Google Colab specific library for mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import pickle

# Machine Learning and Model Evaluation
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from surprise.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Recommendation system libraries
from surprise import SVD, accuracy, Reader, Dataset
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import auc_score

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Dataset

In [23]:
# read data
data = pd.read_csv('/content/drive/My Drive/activity_train.csv')
data.iloc[:, 1] = data.iloc[:, 1].str.strip()
print("\n=========================================")
print("1. Data Information (TRAIN)")
print("=========================================\n")
print(data.head())
print("\n")
print(data.describe())
print("\n")
print(data.info())

# read test data
data_test = pd.read_csv('/content/drive/My Drive/activity_test_blanked.csv')
print("\n=========================================")
print("2. Data Information (TEST)")
print("=========================================\n")
print(data_test.head())

# shows some fingerprints
print("\n=========================================")
print("3. Fingerprints")
print("=========================================\n")

# get fingerprints
# in pickle : dictionary with keys corresponding the ChEMBL IDs and values corresponding to a list of the set bits of each molecule.
import pickle
with open('/content/drive/My Drive/mol_bits.pkl', 'rb') as f:
    fingerprint = pickle.load(f)

for key in list(fingerprint.keys())[:5]:
    print(key, fingerprint[key])

# size of fingerprints
print(" Number of fingerprints: ", len(fingerprint))


1. Data Information (TRAIN)

   O14842  CHEMBL2022243   4
0  O14842  CHEMBL2022244   6
1  O14842  CHEMBL2022245   2
2  O14842  CHEMBL2022246   1
3  O14842  CHEMBL2022247   4
4  O14842  CHEMBL2022248   4


                   4
count  135710.000000
mean        4.708798
std         2.869917
min         1.000000
25%         2.000000
50%         5.000000
75%         7.000000
max        10.000000


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135710 entries, 0 to 135709
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   O14842          135710 non-null  object
 1    CHEMBL2022243  135710 non-null  object
 2    4              135710 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.1+ MB
None

2. Data Information (TEST)

   O14842   CHEMBL2022258  0
0  O14842   CHEMBL2047161  0
1  O14842   CHEMBL2047163  0
2  O14842   CHEMBL2047168  0
3  O14842   CHEMBL2047169  0
4  O14842   CHEMBL2048621  0

3. Finger

In [None]:
plt.figure(figsize=(6, 6))
plt.pie(activity_table["Count"], labels=activity_table["Activity"], autopct='%1.1f%%', startangle=90)
plt.title("Count of Each Activity Level")
plt.axis('equal')
plt.show()

plt.figure(figsize=(10, 6))
sns.histplot(protein_interactions, bins=30, kde=True)
plt.title("Number of Interactions per Protein")
plt.xlabel("Number of Interactions")
plt.ylabel("Count")
plt.show()

# PCA

In [None]:
# Create a list of unique fingerprint values
fingerprint_values = [f for sublist in fingerprint.values() for f in sublist]
fingerprint_values = list(set(fingerprint_values))

# Map fingerprint values to indices
fingerprint_value_to_index = {value: index for index, value in enumerate(fingerprint_values)}

# Assuming 'molecules' is a list of molecule identifiers
molecules = list(fingerprint.keys())

# Initialize the matrix Y
Y = np.zeros((len(molecules), len(fingerprint_values)))

# Iterate over all molecules and their fingerprints
for mol_idx, mol in enumerate(molecules):
    for f in fingerprint[mol]:
        # Use the dictionary to find the index of the fingerprint value
        Y[mol_idx, fingerprint_value_to_index[f]] = 1
# transform the matrix Y to a dataframe
Y = pd.DataFrame(Y)
Y.columns = fingerprint_values
Y.index = molecules

# PCA on Y to reduce the dimensionality of the matrix Y
pca = PCA(n_components=1000)         # <- 0.86% of the variance is preserved
Y_pca = pca.fit_transform(Y)

print("Explained variance ratio:", pca.explained_variance_ratio_.sum())

# plot the explained variance ratio
plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()

# plot the 2d PCA
plt.scatter(Y_pca[:, 0], Y_pca[:, 1], alpha=0.5)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('2D PCA of Y')
plt.show()

# convert pca to a dataframe with respect molecule name
Y_pca = pd.DataFrame(Y_pca)
Y_pca.index = molecules

# save Y_pca
# np.save('Y_pca.npy', Y_pca)

# Bias Baseline

In [30]:
reader = Reader(rating_scale=(1, 10))  # escala de classificação de 1 a 10

# Divide into train, validation, and test sets
X_train, X_test = train_test_split(data, test_size=0.2, random_state=42)
X_train, X_val = train_test_split(X_train, test_size=0.25, random_state=42)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

# Tranform train into a Surprise dataset
train_data = DatasetAutoFolds(reader=reader, df=X_train)
trainset = train_data.build_full_trainset()

# Define a grid of hyperparameters to search
param_grid = {
    'n_epochs': [20, 40, 60],
    'lr_all': [0.001, 0.005, 0.01],
    'reg_all': [0.02, 0.05, 0.1]
}

gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(train_data)

# Get the best hyperparameters
best_params = gs.best_params['rmse']
best_rmse = gs.best_score['rmse']

print(f"Melhores hiperparâmetros: {best_params}")

# Train a new model with the best parameters
best_model = SVD(**best_params)
best_model.fit(trainset)

# Test the model on the validation set
predictions = best_model.test(X_val)

# Calculate RMSE and MAE
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

# Print the results
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
predictions[:10]

[]

# LightFM Hybrid Recommenders

In [None]:
# Filter common molecules
molecules = set(fingerprint.keys())
molecules_d = set(data.iloc[:, 1].unique())
common_molecules = molecules & molecules_d

# Filter and process PCA data
Y_pca = Y_pca[[mol in common_molecules for mol in molecules]]
Y_pca_df = pd.DataFrame(Y_pca, columns=[f'PC{i+1}' for i in range(Y_pca.shape[1])])

# Split data
train, test = train_test_split(data.values, test_size=0.2, random_state=42)

# Prepare dataset
dataset = Dataset()
dataset.fit((x[0] for x in data.values), (x[1] for x in data.values), item_features=set(Y_pca_df.columns))

# Build interactions and weights matrix
train_interactions, _ = dataset.build_interactions((tuple(i) for i in train))
test_interactions, _ = dataset.build_interactions((tuple(i) for i in test))

# Train model
model = LightFM(learning_rate=0.05, loss='warp')
model.fit(train_interactions, item_features=sp.csr_matrix(Y_pca_df.values), epochs=20, num_threads=1)

# Evaluate the model
test_auc = auc_score(model, test_interactions, item_features=sp.csr_matrix(Y_pca_df.values)).mean()
print(f'Test AUC: {test_auc}')

# Predict Activity_test_blanked

In [None]:
# Load the dataset with all activity values set to zero
data_activity = pd.read_csv('activity_test_blanked.csv')

# Prepare the dataset for Surprise
data_for_prediction = Dataset.load_from_df(data_activity, reader)
testset_for_prediction = data_for_prediction.build_full_trainset().build_testset()

# Make predictions on the dataset with zero activity values
predictions = best_model.test(testset_for_prediction)

# Extract user IDs, item IDs, and predicted ratings from the predictions
user_ids = [pred.uid for pred in predictions]
item_ids = [pred.iid for pred in predictions]
predicted_ratings = [round(pred.est) for pred in predictions]

# Create a DataFrame with the predicted ratings
predicted_activity = pd.DataFrame({'User': user_ids, 'Item': item_ids, 'Predicted Rating': predicted_ratings})

# Save the predicted activity to a CSV file
predicted_activity.to_csv('PD_PREDS-04.csv', index=False, header=False)