In [None]:
import threadpoolctl
threadpoolctl.threadpool_limits(1, "blas")
from surprise import Reader, Dataset, KNNBasic
from surprise import accuracy
import pandas as pd
from surprise import accuracy
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupShuffleSplit
from implicit.als import AlternatingLeastSquares

In [13]:
read = pd.read_pickle('../Pickle/read.pkl')

In [14]:
read = read.sample(frac = 0.5, random_state = 42)

In [15]:
read = read[read['is_read']== 1]

In [16]:
read = read.drop(columns=["rating"], errors="ignore")

In [17]:
read = read.reset_index(drop = True)

In [18]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.model_selection import GroupShuffleSplit, GridSearchCV
from sklearn.metrics import make_scorer
import implicit

read = read.copy()  

# Step 1: Apply GroupShuffleSplit based on 'user_id' to split the interactions
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(read, groups=read["user_id"]))

train_df = read.iloc[train_idx].copy()  # Copy to avoid SettingWithCopyWarning
test_df = read.iloc[test_idx].copy()

# Step 2: Assign categorical indices to users and books
user_cat = pd.Categorical(read["user_id"])
book_cat = pd.Categorical(read["book_id"])

read["user_idx"] = user_cat.codes
read["book_idx"] = book_cat.codes

# Modify train and test DataFrames
train_df.loc[:, "user_idx"] = pd.Categorical(train_df["user_id"], categories=user_cat.categories).codes
train_df.loc[:, "book_idx"] = pd.Categorical(train_df["book_id"], categories=book_cat.categories).codes

test_df.loc[:, "user_idx"] = pd.Categorical(test_df["user_id"], categories=user_cat.categories).codes
test_df.loc[:, "book_idx"] = pd.Categorical(test_df["book_id"], categories=book_cat.categories).codes

# Step 3: Create the train and test matrices (binary data: 1 for read, 0 for not read)
train_matrix = sp.csr_matrix((train_df["is_read"], (train_df["user_idx"], train_df["book_idx"])))
test_matrix = sp.csr_matrix((test_df["is_read"], (test_df["user_idx"], test_df["book_idx"])))

# Ensure the test set is not empty
if test_df.empty:
    print("Warning: The test set is empty after splitting. Try adjusting your split parameters.")


In [None]:
class ALSWrapper:
    def __init__(self, factors=10, regularization=0.1, iterations=20):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.model = AlternatingLeastSquares(factors=self.factors,
                                             regularization=self.regularization,
                                             iterations=self.iterations)
    
    def fit(self, X, y=None):
        self.model.fit(X)
        return self
    
    def predict(self, X):
        return self.model.predict(X)
    
    def get_params(self, deep=True):
        return {
            'factors': self.factors,
            'regularization': self.regularization,
            'iterations': self.iterations
        }

    def set_params(self, **params):
        for param, value in params.items():
            setattr(self, param, value)
        self.model = AlternatingLeastSquares(factors=self.factors,
                                             regularization=self.regularization,
                                             iterations=self.iterations)
        return self


# Step 5: Define Precision@K scorer
def precision_at_k_scorer(model, X, y, k=10):
    precisions = []
    for user_idx in range(X.shape[0]):
        recommendations = model.predict(user_idx)
        actual_books = set(y[user_idx].nonzero()[1])
        recommended_books = set(recommendations)
        precision = len(recommended_books & actual_books) / k
        precisions.append(precision)
    return np.mean(precisions)

# Step 6: Define parameter grid for ALS model
param_grid = {
    'factors': [10, 20, 50],             # Number of latent factors
    'regularization': [0.01, 0.1, 1.0],  # Regularization strength
    'iterations': [10, 20, 50],          # Number of iterations
    'alpha': [1.0, 2.0]                  # Confidence in the observed interactions
}

# Step 7: Initialize ALS model and GridSearchCV
als_model = ALSWrapper()

grid_search = GridSearchCV(estimator=als_model,
                           param_grid=param_grid,
                           scoring=make_scorer(precision_at_k_scorer, k=10),  # Custom Precision@K scorer
                           cv=3,  # Cross-validation splits
                           n_jobs=-1,  # Use all processors
                           verbose=2)  # Show progress

# Step 8: Fit GridSearchCV
grid_search.fit(train_matrix, train_matrix)  # Using train_matrix for both X and y

# Step 9: Output best hyperparameters and best score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Precision@10:", grid_search.best_score_)


Fitting 3 folds for each of 54 candidates, totalling 162 fits


In [None]:
print("Train Matrix Summary:")
print(f"Shape: {train_matrix.shape}")
print(f"Non-zero values: {train_matrix.nnz}")
