In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/breakthrough-tech-ai-studio-challenge/sample_submission.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/movies_keywords.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/movies_metadata.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/train.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/test.csv


In [2]:
import pandas as pd
from surprise import Dataset
from surprise import Reader
from surprise import SVD

In [3]:
# Load the train data from csv file
train_df = pd.read_csv('/kaggle/input/breakthrough-tech-ai-studio-challenge/train.csv')
train_df[['userId', 'movieId']] = train_df['userId_movieId'].str.split('_', expand=True)
train_df = train_df.drop('userId_movieId', axis=1)

In [4]:
# Load the test dataset
test_df = pd.read_csv('/kaggle/input/breakthrough-tech-ai-studio-challenge/test.csv')
test_df[['userId', 'movieId']] = test_df['userId_movieId'].str.split('_', expand=True)
test_df = test_df.drop('userId_movieId', axis=1)

In [5]:
# Create a Dataset object with the train data
reader = Reader(rating_scale=(0.0, 1.0))
train_dataset = Dataset.load_from_df(train_df, reader)

In [6]:
# Build the trainset
trainset = train_dataset.build_full_trainset()

In [7]:
# Build the recommendation model using the train dataset
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f38d696dd90>

In [8]:
# Use the model to predict ratings for the test dataset
testset = [(int(row['userId']), int(row['movieId']), 0) for _, row in test_df.iterrows()]
predictions = model.test(testset)

In [9]:
# Save the test dataset with predicted ratings as a CSV file
test_df['rating'] = [prediction.est for prediction in predictions]

In [10]:
test_df.head()

Unnamed: 0,userId,movieId,rating
0,469,2124,1.0
1,439,3753,1.0
2,522,1682,1.0
3,429,1217,1.0
4,71,1210,1.0


In [11]:
import optuna
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate


In [12]:
# Define the objective function for Optuna to optimize
def objective(trial):
    # Define the hyperparameter space to search
    param_grid = {
        'n_factors': trial.suggest_int('n_factors', 10, 100),
        'n_epochs': trial.suggest_int('n_epochs', 10, 50),
        'lr_all': trial.suggest_uniform('lr_all', 0.002, 0.01),
        'reg_all': trial.suggest_uniform('reg_all', 0.02, 0.2),
    }
    
    # Create the SVD model with the suggested hyperparameters
    algo = SVD(**param_grid)
    
    # Cross-validate the model on the train dataset
    cv_results = cross_validate(algo, train_dataset, measures=['rmse'], cv=5, verbose=False)
    
    # Return the mean RMSE score across all folds as the objective value to minimize
    return cv_results['test_rmse'].mean()

In [13]:
# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)

[32m[I 2023-02-23 19:56:15,611][0m A new study created in memory with name: no-name-75e191c7-9f43-4525-9c45-5f365a220dff[0m
  import sys
  
[32m[I 2023-02-23 19:56:23,608][0m Trial 0 finished with value: 29106.16618154135 and parameters: {'n_factors': 21, 'n_epochs': 21, 'lr_all': 0.008917609485306566, 'reg_all': 0.13811679002923488}. Best is trial 0 with value: 29106.16618154135.[0m
[32m[I 2023-02-23 19:56:41,751][0m Trial 1 finished with value: 29103.848716190154 and parameters: {'n_factors': 36, 'n_epochs': 40, 'lr_all': 0.004833039086417565, 'reg_all': 0.1845981222877804}. Best is trial 1 with value: 29103.848716190154.[0m
[32m[I 2023-02-23 19:57:02,574][0m Trial 2 finished with value: 29105.570119404256 and parameters: {'n_factors': 86, 'n_epochs': 27, 'lr_all': 0.0023852343932079627, 'reg_all': 0.12283703723932404}. Best is trial 1 with value: 29103.848716190154.[0m
[32m[I 2023-02-23 19:57:15,291][0m Trial 3 finished with value: 29106.07632365102 and parameters: {'n

In [14]:
# Print the best hyperparameters and objective value found by Optuna
print(f'Best RMSE: {study.best_value:.4f}')
print(f'Best hyperparameters: {study.best_params}')

Best RMSE: 29101.3444
Best hyperparameters: {'n_factors': 33, 'n_epochs': 25, 'lr_all': 0.006354482321137168, 'reg_all': 0.05674236956258821}


In [15]:
# Define the best hyperparameters found by Optuna
best_params = {'n_factors': 62, 'n_epochs': 24, 'lr_all': 0.006745171733062295, 'reg_all': 0.06414005746315199}


In [16]:
# Create the SVD model with the best hyperparameters
model = SVD(**best_params)

In [17]:
# Train the model on the full train dataset
train_dataset = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], Reader(rating_scale=(0.0, 1.0)))
trainset = train_dataset.build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f38b1862250>

In [18]:
# Make predictions on the test dataset
test_df['rating'] = test_df.apply(lambda x: model.predict(x['userId'], x['movieId'])[3], axis=1)

In [19]:
test_df.head()

Unnamed: 0,userId,movieId,rating
0,469,2124,0.643249
1,439,3753,0.705467
2,522,1682,0.885139
3,429,1217,0.939277
4,71,1210,0.840327


In [20]:
# Combine userId and movieId columns to create a new column called userId_movieId
test_df['userId_movieId'] = test_df['userId'].astype(str) + '_' + test_df['movieId'].astype(str)

# Drop the original userId and movieId columns
test_df.drop(['userId', 'movieId'], axis=1, inplace=True)

# Rearrange the columns so that userId_movieId comes first and rating comes second
test_df = test_df[['userId_movieId', 'rating']]

In [21]:
test_df.head()

Unnamed: 0,userId_movieId,rating
0,469_2124,0.643249
1,439_3753,0.705467
2,522_1682,0.885139
3,429_1217,0.939277
4,71_1210,0.840327


In [22]:
# Save the resulting DataFrame to a new CSV file
test_df.to_csv('submission.csv', index=False)