# Restart the session after running this cell

In [1]:
# Restart the session after running this cell
!pip install -q "numpy<2"


In [2]:
!pip install -q scikit-surprise


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone


# Data Preprocessing

In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import GridSearchCV, train_test_split

# Load the dataset and rename columns as provided
new_columns = [
    "userid", "churches", "resorts", "beaches", "parks", "theatres", "museums",
    "malls", "zoos", "restaurants", "pubs/bars", "local services", "burger/pizza shops",
    "hotels/other lodgings", "juice bars", "art galleries", "dance clubs",
    "swimming pools", "gyms", "bakeries", "beauty & spas", "cafes",
    "view points", "monuments", "gardens"
]

file_path = "google_review_ratings.csv"  # Make sure this file is in your Colab workspace
df = pd.read_csv(file_path)
df.drop(columns=['Unnamed: 25'], errors='ignore', inplace=True)
df.columns = new_columns


In [4]:
# Convert all rating columns to numeric and fill missing values with column mean
for col in new_columns[1:]:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df[col] = df[col].fillna(df[col].mean())

# Display a quick preview of the data
df.head()


Unnamed: 0,userid,churches,resorts,beaches,parks,theatres,museums,malls,zoos,restaurants,...,art galleries,dance clubs,swimming pools,gyms,bakeries,beauty & spas,cafes,view points,monuments,gardens
0,User 1,0.0,0.0,3.63,3.65,5.0,2.92,5.0,2.35,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
1,User 2,0.0,0.0,3.63,3.65,5.0,2.92,5.0,2.64,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
2,User 3,0.0,0.0,3.63,3.63,5.0,2.92,5.0,2.64,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
3,User 4,0.0,0.5,3.63,3.63,5.0,2.92,5.0,2.35,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0
4,User 5,0.0,0.0,3.63,3.63,5.0,2.92,5.0,2.64,2.33,...,1.74,0.59,0.5,0.0,0.5,0.0,0.0,0.0,0.0,0.0


In [5]:
# Transform the dataframe from wide to long format: (userid, category, rating)
df_long = pd.melt(df, id_vars=['userid'], var_name='category', value_name='rating')

# Preview the transformed data
df_long.head()


Unnamed: 0,userid,category,rating
0,User 1,churches,0.0
1,User 2,churches,0.0
2,User 3,churches,0.0
3,User 4,churches,0.0
4,User 5,churches,0.0


# Train the Collaborative filtering (CF) model

In [6]:
# Prepare data for the Surprise library
reader = Reader(rating_scale=(df_long['rating'].min(), df_long['rating'].max()))
data = Dataset.load_from_df(df_long[['userid', 'category', 'rating']], reader)

# Define a parameter grid for tuning SVD hyperparameters
param_grid = {
    'n_factors': [15, 20, 25],
    'lr_all': [0.002, 0.005, 0.01],
    'reg_all': [0.02, 0.05, 0.1]
}

# Use GridSearchCV to find the best hyperparameters based on RMSE
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)
best_params = gs.best_params['rmse']
print("Best Hyperparameters based on RMSE:", best_params)

# Split data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model with the best parameters
model = SVD(n_factors=best_params['n_factors'],
            lr_all=best_params['lr_all'],
            reg_all=best_params['reg_all'],
            random_state=42)
model.fit(trainset)

# Evaluate model performance on the test set
predictions = model.test(testset)
print("RMSE on test set: ", accuracy.rmse(predictions))


Best Hyperparameters based on RMSE: {'n_factors': 25, 'lr_all': 0.01, 'reg_all': 0.05}
RMSE: 1.0774
RMSE on test set:  1.0774301164545736


# Test the Trained Model

In [23]:
import pickle
from tabulate import tabulate

# Function to generate predicted ratings for a given user for all categories
def get_prediction(user, category):
    return model.predict(user, category).est

categories = new_columns[1:]

def get_user_predictions(user):
    """ Create a dictionary with predicted ratings for each category. """
    return {cat: get_prediction(user, cat) for cat in categories}

# Provide available user IDs for input validation (sorting by the numeric part)
available_users = sorted(df['userid'].unique(), key=lambda x: int(x.split()[1]))
print("\nAvailable user IDs:", available_users)
print("User ID range: from {} to {}".format(available_users[0], available_users[-1]))

# Interactive input to get recommendations for a specific user
user_input = input("\nEnter a user number (e.g., '1' for User 1): ").strip()

# If the user input is a digit, convert it to the full user ID format
if user_input.isdigit():
    input_user = "User " + user_input
else:
    input_user = user_input  # assume user already provided in correct format

# Validate the input and print predictions if valid
if input_user not in available_users:
    print("Invalid user ID. Please choose a number corresponding to one of the available IDs:")
    print("Available user numbers:", [u.split()[1] for u in available_users])
else:
    # Get predictions for the specified user
    user_predictions = get_user_predictions(input_user)

    # Sort predictions in descending order (highest rating first)
    sorted_predictions = sorted(user_predictions.items(), key=lambda x: x[1], reverse=True)

    # Prepare data for the tabulated output
    table_data = []
    for category, rating in sorted_predictions:
        stars = "★" * int(round(rating))
        table_data.append([category, f"{rating:.2f}", stars])

    # Print tabulated predictions
    print(f"\nOverall Predicted Ratings for {input_user} (Sorted High to Low):\n")
    print(tabulate(table_data, headers=["Category", "Rating", "Stars"], tablefmt="fancy_grid", maxcolwidths=[None, None, 6]))



Available user IDs: ['User 1', 'User 2', 'User 3', 'User 4', 'User 5', 'User 6', 'User 7', 'User 8', 'User 9', 'User 10', 'User 11', 'User 12', 'User 13', 'User 14', 'User 15', 'User 16', 'User 17', 'User 18', 'User 19', 'User 20', 'User 21', 'User 22', 'User 23', 'User 24', 'User 25', 'User 26', 'User 27', 'User 28', 'User 29', 'User 30', 'User 31', 'User 32', 'User 33', 'User 34', 'User 35', 'User 36', 'User 37', 'User 38', 'User 39', 'User 40', 'User 41', 'User 42', 'User 43', 'User 44', 'User 45', 'User 46', 'User 47', 'User 48', 'User 49', 'User 50', 'User 51', 'User 52', 'User 53', 'User 54', 'User 55', 'User 56', 'User 57', 'User 58', 'User 59', 'User 60', 'User 61', 'User 62', 'User 63', 'User 64', 'User 65', 'User 66', 'User 67', 'User 68', 'User 69', 'User 70', 'User 71', 'User 72', 'User 73', 'User 74', 'User 75', 'User 76', 'User 77', 'User 78', 'User 79', 'User 80', 'User 81', 'User 82', 'User 83', 'User 84', 'User 85', 'User 86', 'User 87', 'User 88', 'User 89', 'User 90

# Save the model

In [9]:
import pickle

# Define the filename to save the model
model_filename = "trained_model.pkl"

# Save the trained model to disk
with open(model_filename, "wb") as file:
    pickle.dump(model, file)

print(f"Trained model saved to {model_filename}")


Trained model saved to trained_model.pkl


# Load a saved Model and use it

Once this model is saved you dont have to train the model again just run the below cell and adjust the path to your saved model and directly start using the saved model for predictions without wasting time on the training again

In [22]:
import pickle
from tabulate import tabulate

# Load the trained model
model_filename = "trained_model.pkl"
with open(model_filename, "rb") as file:
    loaded_model = pickle.load(file)

print(f"Model loaded successfully from {model_filename}")

# Function to get predictions
def get_loaded_model_prediction(user, category):
    return loaded_model.predict(user, category).est

# Get user input
user_input = input("\nEnter a user number (e.g., '1' for User 1): ").strip()
input_user = f"User {user_input}" if user_input.isdigit() else user_input

if input_user not in available_users:
    print("Invalid user ID. Please choose a valid user.")
else:
    user_predictions = {
        cat: get_loaded_model_prediction(input_user, cat) for cat in categories
    }

    # Sort predictions in descending order
    sorted_predictions = sorted(user_predictions.items(), key=lambda x: x[1], reverse=True)

    # Prepare data for tabulation
    table_data = []
    for category, rating in sorted_predictions:
        stars = "★" * int(round(rating))
        table_data.append([category, f"{rating:.2f}", stars])

    # Print table with the Stars column fixed to a max width of 6 characters.
    print(f"\nPredicted Ratings for {input_user} (Sorted High to Low):\n")
    print(tabulate(table_data,
                   headers=["Category", "Rating", "Stars"],
                   tablefmt="fancy_grid",
                   maxcolwidths=[None, None, 6]))


Model loaded successfully from trained_model.pkl

Enter a user number (e.g., '1' for User 1): 44

Predicted Ratings for User 44 (Sorted High to Low):

╒═══════════════════════╤══════════╤═════════╕
│ Category              │   Rating │ Stars   │
╞═══════════════════════╪══════════╪═════════╡
│ parks                 │     3.8  │ ★★★★    │
├───────────────────────┼──────────┼─────────┤
│ theatres              │     3.79 │ ★★★★    │
├───────────────────────┼──────────┼─────────┤
│ beaches               │     3.54 │ ★★★★    │
├───────────────────────┼──────────┼─────────┤
│ local services        │     3.16 │ ★★★     │
├───────────────────────┼──────────┼─────────┤
│ museums               │     2.96 │ ★★★     │
├───────────────────────┼──────────┼─────────┤
│ malls                 │     2.76 │ ★★★     │
├───────────────────────┼──────────┼─────────┤
│ pubs/bars             │     2.51 │ ★★★     │
├───────────────────────┼──────────┼─────────┤
│ resorts               │     2.42 │ ★★      │
├──