# Creating a Hybrid Recommendation System Based on Collaborative and Content-Based Models

In [1]:
pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552450 sha256=617581e07d5bd0f7b4dfafc7d5a2e6ab97b0c55f0b4b3f05dc43e236b8a88d99
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [2]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m122.9/154.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [3]:
!pip install gdown



In [4]:
import gdown
import joblib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score
from annoy import AnnoyIndex
from surprise import Dataset, Reader, SVD
import os
import re

# Load the data

In [5]:
file_id = '11SQ7kKA4MslkNrF6eq7u0sm541i1WH7d'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'sampled_data.csv'

gdown.download(url, output, quiet=False)

sampled_data = pd.read_csv(output)
sampled_data.head()

Downloading...
From (original): https://drive.google.com/uc?id=11SQ7kKA4MslkNrF6eq7u0sm541i1WH7d
From (redirected): https://drive.google.com/uc?id=11SQ7kKA4MslkNrF6eq7u0sm541i1WH7d&confirm=t&uuid=9ee906cc-5859-4fab-b01b-e2cfbddb9a78
To: /content/sampled_data.csv
100%|██████████| 247M/247M [00:03<00:00, 81.9MB/s]


Unnamed: 0,app_id,tags_string,user_id,title,rating_numeric
0,951440,"Open World Survival Craft, Survival, Open Worl...",9074881,Volcanoids,4.5
1,32440,"LEGO, Local Co-Op, Adventure, Co-op, Family Fr...",6006822,LEGO® Star Wars™ - The Complete Saga,5.0
2,615610,"Indie, Casual, Action, Space, 2D, Singleplayer...",1239,Orbt XL,5.0
3,1097840,"Action, Adventure, Multiplayer, Third-Person S...",9255103,Gears 5,3.5
4,464060,"Cyberpunk, Action, Top-Down Shooter, Great Sou...",5324004,RUINER,4.5


In [6]:
# Function to preprocess tags
def preprocess_tags(tags):
    # remove special characters and convert to lower case
    tags = re.sub(r'[^\w\s]', '', tags).lower()
    return tags

In [7]:
# Apply preprocessing to tags
sampled_data['tags_string'] = sampled_data['tags_string'].fillna('').apply(preprocess_tags)

# Load the models

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
svd_model_url = 'https://drive.google.com/uc?id=1AMH6Ob2UeM8JbfKcdEDqB2qfPU5ORFwZ'
tfidf_matrix_url = 'https://drive.google.com/uc?id=1-0VocEr64qAgcmesbr2k66fVoaYH45gN'
tfidf_vectorizer_url = 'https://drive.google.com/uc?id=1-0qfCOLgGUbjQwumjV2RoIQwsg75p66J'

In [10]:
gdown.download(svd_model_url, 'svd_model.pkl', quiet=False)
gdown.download(tfidf_matrix_url, 'tfidf_matrix.pkl', quiet=False)
gdown.download(tfidf_vectorizer_url, 'tfidf_vectorizer.pkl', quiet=False)

Downloading...
From (original): https://drive.google.com/uc?id=1AMH6Ob2UeM8JbfKcdEDqB2qfPU5ORFwZ
From (redirected): https://drive.google.com/uc?id=1AMH6Ob2UeM8JbfKcdEDqB2qfPU5ORFwZ&confirm=t&uuid=ef9b086a-7962-4683-9f69-8bc44da7bab1
To: /content/svd_model.pkl
100%|██████████| 422M/422M [00:05<00:00, 73.8MB/s]
Downloading...
From (original): https://drive.google.com/uc?id=1-0VocEr64qAgcmesbr2k66fVoaYH45gN
From (redirected): https://drive.google.com/uc?id=1-0VocEr64qAgcmesbr2k66fVoaYH45gN&confirm=t&uuid=8977ebb2-b4b6-4a98-b59a-0959a3ba25ee
To: /content/tfidf_matrix.pkl
100%|██████████| 260M/260M [00:03<00:00, 76.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-0qfCOLgGUbjQwumjV2RoIQwsg75p66J
To: /content/tfidf_vectorizer.pkl
100%|██████████| 14.4k/14.4k [00:00<00:00, 23.2MB/s]


'tfidf_vectorizer.pkl'

In [11]:
model_dir = '/content/drive/MyDrive/models'
annoy_index_path = os.path.join(model_dir, 'annoy_index.ann')
assert os.path.exists(annoy_index_path), f"File not found at path: {annoy_index_path}"

In [12]:
svd_model = joblib.load('svd_model.pkl')
tfidf_matrix = joblib.load('tfidf_matrix.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

In [13]:
# determine the dimensionality of the TF-IDF vector
f = tfidf_matrix.shape[1]

In [14]:
annoy_index = AnnoyIndex(f, metric='angular')
annoy_index.load(annoy_index_path)

True

# Recommendations

In [27]:
# function to recommend games based on SVD model
def svd_recommend(user_id, n_recommendations=50):
    app_ids = sampled_data['app_id'].unique()
    user_ratings = [(app_id, svd_model.predict(user_id, app_id).est) for app_id in app_ids]
    user_ratings.sort(key=lambda x: x[1], reverse=True)
    top_items = [app_id for app_id, rating in user_ratings[:n_recommendations]]
    return top_items

In [28]:
# function to recommend games based on TF-IDF model
def tfidf_recommend(app_id, n_recommendations=50):
    item_idx = sampled_data[sampled_data['app_id'] == app_id].index[0]
    item_vector = tfidf_matrix[item_idx].toarray()
    similar_items_idx = annoy_index.get_nns_by_vector(item_vector[0], n_recommendations)
    similar_items = sampled_data.iloc[similar_items_idx]['app_id'].tolist()
    return similar_items

In [39]:
# hybrid recommendation combining SVD and TF-IDF recommendations
def hybrid_recommend(user_id, app_id, n_recommendations=10):
    svd_recs = svd_recommend(user_id, n_recommendations)
    tfidf_recs = tfidf_recommend(app_id, n_recommendations)
    hybrid_recs = list(set(svd_recs).union(set(tfidf_recs)))
    return hybrid_recs[:n_recommendations]

In [40]:
# function to get game titles based on app_ids
def get_titles_from_ids(app_ids):
    return sampled_data[sampled_data['app_id'].isin(app_ids)]['title'].unique().tolist()

In [41]:
# Function to recommend games for a user and get game titles
def recommend_games_for_user(user_id, n_recommendations=10):
    user_apps = sampled_data[sampled_data['user_id'] == user_id]['app_id']
    app_id = user_apps.iloc[0] if not user_apps.empty else sampled_data['app_id'].iloc[0]
    recommendations = hybrid_recommend(user_id, app_id, n_recommendations)
    titles = get_titles_from_ids(recommendations)
    return recommendations, titles

# Input user ID
input_user_id = int(input("Enter user ID: "))
recommended_ids, recommended_titles = recommend_games_for_user(input_user_id, n_recommendations=10)
print(f"Top 10 recommended games for user {input_user_id}:")
for idx, title in enumerate(recommended_titles, 1):
    print(f"{idx}. {title}")

Enter user ID: 9255103
Top 10 recommended games for user 9255103:
1. Gears 5
2. Yuppie Psycho: Executive Edition
3. Gorogoa
4. Placid Plastic Duck Simulator
5. DOOM II
6. Motor Town: Behind The Wheel
7. Senren＊Banka
8. Poly Bridge 2
9. LYNE
10. Neon White


# Evaluate

In [42]:
def evaluate_recommendations(recommendations, true_items):
    recommendations_set = set(recommendations)
    y_true = [1 if app_id in recommendations_set else 0 for app_id in true_items]
    y_pred = [1] * len(y_true)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    return precision, recall, f1

true_items = sampled_data[sampled_data['user_id'] == input_user_id]['app_id'].tolist()
print(f"True Items: {true_items}")
print(f"Recommendations: {recommended_ids}")
precision, recall, f1 = evaluate_recommendations(recommended_ids, true_items)
print(f'Precision: {precision}, Recall: {recall}, F1: {f1}')

True Items: [1097840]
Recommendations: [597760, 1999360, 557600, 1369670, 1533420, 1144400, 1062160, 1097840, 2300, 266010]
Precision: 1.0, Recall: 1.0, F1: 1.0
