In [1]:
import pandas as pd

users = pd.read_csv('ml-100k/u.user', sep='|', encoding='latin-1', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
items = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', names=['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 'war', 'western'])
ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

In [2]:
# Get number of ratings per item
temp = ratings.groupby('item_id').size()

# Filter item_ids with more than 200 ratings
frequent_item_ids = temp[temp > 200].index

# Select the corresponding items from the items DataFrame
movies_with_many_ratings = items[items['item_id'].isin(frequent_item_ids)]

# Create a semicolon-separated string of titles
movies_with_many_ratings['title'] = movies_with_many_ratings['title'].str.replace('|', ';')

titles_string = '; '.join(movies_with_many_ratings['title'].tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_with_many_ratings['title'] = movies_with_many_ratings['title'].str.replace('|', ';')


In [3]:
titles_string

"Toy Story (1995); Get Shorty (1995); Twelve Monkeys (1995); Babe (1995); Dead Man Walking (1995); Seven (Se7en) (1995); Usual Suspects, The (1995); Mr. Holland's Opus (1995); Braveheart (1995); Birdcage, The (1996); Apollo 13 (1995); Star Wars (1977); Pulp Fiction (1994); Shawshank Redemption, The (1994); Forrest Gump (1994); Four Weddings and a Funeral (1994); Lion King, The (1994); Fugitive, The (1993); Jurassic Park (1993); Sleepless in Seattle (1993); Blade Runner (1982); Aladdin (1992); Terminator 2: Judgment Day (1991); Dances with Wolves (1990); Silence of the Lambs, The (1991); Fargo (1996); Truth About Cats & Dogs, The (1996); Rock, The (1996); Twister (1996); Independence Day (ID4) (1996); Phenomenon (1996); Godfather, The (1972); Wizard of Oz, The (1939); 2001: A Space Odyssey (1968); Sound of Music, The (1965); Die Hard (1988); Willy Wonka and the Chocolate Factory (1971); Fish Called Wanda, A (1988); Top Gun (1986); Monty Python and the Holy Grail (1974); Empire Strikes B

In [4]:
random_users = users.sample(20, random_state=42)
random_users_list = random_users[['user_id', 'age', 'gender', 'occupation', 'zip_code']].to_dict(orient='records')


In [5]:
prompt = f"""
You are a movie recommendation assistant tasked with simulating user ratings in a recommendation system. I will provide:

1. A list of movie titles.
2. A list of user profiles, each with the following attributes: age, gender, occupation, and zip code. It will also include a user ID you should use only for identification purposes.

For each user, select 10 movies from the list that you believe match their preferences, based on their profile. The selection should reflect a reasonable balance of relevance and diversity (not overly narrow, but not random either).

For each selected movie, assign a realistic rating from 1 to 5:
- 1 = strongly disliked
- 5 = strongly liked

Return the results in **JSON format** using the structure below:

[
  {{
    "user_id": ...,
    "recommendations": [
      {{"title": "...", "rating": ...}},
      ...
    ]
  }},
  ...
]

The movie titles are as follows:
{titles_string}

The user profiles are as follows:
{random_users_list}

"""

In [6]:
from google import genai

client = genai.Client(api_key="AIzaSyBDHhAXPz7wqXO_JsgP7Zg93U_erbaM0NE")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt,
)

print(response.text)

```json
[
  {
    "user_id": 97,
    "recommendations": [
      {"title": "Usual Suspects, The (1995)", "rating": 5},
      {"title": "Pulp Fiction (1994)", "rating": 5},
      {"title": "Seven (Se7en) (1995)", "rating": 4},
      {"title": "GoodFellas (1990)", "rating": 4},
      {"title": "Blade Runner (1982)", "rating": 4},
      {"title": "Brazil (1985)", "rating": 3},
      {"title": "Fargo (1996)", "rating": 4},
      {"title": "L.A. Confidential (1997)", "rating": 5},
      {"title": "Heat (1995)", "rating": 4},
      {"title": "Apocalypse Now (1979)", "rating": 3}
    ]
  },
  {
    "user_id": 266,
    "recommendations": [
      {"title": "Shawshank Redemption, The (1994)", "rating": 5},
      {"title": "Forrest Gump (1994)", "rating": 4},
      {"title": "Sound of Music, The (1965)", "rating": 5},
      {"title": "Sleepless in Seattle (1993)", "rating": 4},
      {"title": "Sense and Sensibility (1995)", "rating": 4},
      {"title": "English Patient, The (1996)", "rating": 3}

In [7]:
text_response = response.text
text_response

'```json\n[\n  {\n    "user_id": 97,\n    "recommendations": [\n      {"title": "Usual Suspects, The (1995)", "rating": 5},\n      {"title": "Pulp Fiction (1994)", "rating": 5},\n      {"title": "Seven (Se7en) (1995)", "rating": 4},\n      {"title": "GoodFellas (1990)", "rating": 4},\n      {"title": "Blade Runner (1982)", "rating": 4},\n      {"title": "Brazil (1985)", "rating": 3},\n      {"title": "Fargo (1996)", "rating": 4},\n      {"title": "L.A. Confidential (1997)", "rating": 5},\n      {"title": "Heat (1995)", "rating": 4},\n      {"title": "Apocalypse Now (1979)", "rating": 3}\n    ]\n  },\n  {\n    "user_id": 266,\n    "recommendations": [\n      {"title": "Shawshank Redemption, The (1994)", "rating": 5},\n      {"title": "Forrest Gump (1994)", "rating": 4},\n      {"title": "Sound of Music, The (1965)", "rating": 5},\n      {"title": "Sleepless in Seattle (1993)", "rating": 4},\n      {"title": "Sense and Sensibility (1995)", "rating": 4},\n      {"title": "English Patient,

In [8]:
import json
import pandas as pd

item_title_to_id = dict(zip(items['title'], items['item_id']))


# Clean the JSON string if it starts with ``` or ```json
cleaned = text_response.strip()
if cleaned.startswith("```json"):
    cleaned = cleaned[7:]
if cleaned.startswith("```"):
    cleaned = cleaned[3:]
if cleaned.endswith("```"):
    cleaned = cleaned[:-3]
cleaned = cleaned.strip()

# Parse the JSON string
parsed = json.loads(cleaned)

# Flatten into a list of rating entries
ratings_data = []
for user in parsed:
    user_id = user["user_id"]
    for rec in user["recommendations"]:
        ratings_data.append({
            "user_id": user_id,
            "item_id": item_title_to_id.get(rec["title"], None),  # Use title to get item_id
            "rating": rec["rating"]
        })

# Convert to DataFrame
ratings_df = pd.DataFrame(ratings_data)

# Show result
ratings_df


Unnamed: 0,user_id,item_id,rating
0,97,12.0,5
1,97,56.0,5
2,97,11.0,4
3,97,182.0,4
4,97,89.0,4
...,...,...,...
195,335,300.0,3
196,335,79.0,4
197,335,265.0,3
198,335,12.0,4


In [9]:
random_users

Unnamed: 0,user_id,age,gender,occupation,zip_code
96,97,43,M,artist,98006
265,266,62,F,administrator,78756
810,811,40,F,educator,73013
23,24,21,F,artist,94533
30,31,24,M,artist,10003
280,281,15,F,student,6059
568,569,34,M,educator,91903
259,260,40,F,artist,89801
331,332,20,M,student,40504
323,324,21,F,student,2176


In [10]:
# Get all interactions for the random users
user_interactions = ratings[ratings['user_id'].isin(random_users['user_id'])]
user_interactions.groupby('user_id').size().sort_values(ascending=False)

user_id
894    245
500    225
332    183
639    148
468    143
569     70
24      68
324     66
423     64
97      63
287     62
539     56
869     47
31      36
281     26
482     26
260     24
266     23
335     22
811     21
dtype: int64

In [11]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(user_interactions, test_size=0.2, random_state=42)

In [12]:
train

Unnamed: 0,user_id,item_id,rating,timestamp
37500,335,355,3,891567053
30415,500,120,3,883865826
68721,500,727,2,883875041
83800,894,339,4,880415854
28920,500,216,4,883873556
...,...,...,...,...
71418,468,82,5,875292320
81641,894,61,4,882404572
54394,539,962,4,879788195
91596,639,14,5,891239813


In [14]:
from surprise import Dataset, Reader, SVD, accuracy

# Create a Reader with your rating scale
reader = Reader(rating_scale=(1, 5))

# Convert your pandas DataFrames into Surprise train/test sets
trainset = Dataset.load_from_df(train[['user_id', 'item_id', 'rating']], reader).build_full_trainset()

# Fit the model
algo = SVD()
algo.fit(trainset)

# Make predictions
predictions = algo.test(test[['user_id', 'item_id', 'rating']].values)

# Evaluate
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))


RMSE: 0.9718
RMSE: 0.971792534046233
MAE:  0.7769
MAE: 0.7769294285616793


In [15]:
pseudo_train = pd.concat([ratings_df, train], ignore_index=True)
pseudo_train

Unnamed: 0,user_id,item_id,rating,timestamp
0,97,12.0,5,
1,97,56.0,5,
2,97,11.0,4,
3,97,182.0,4,
4,97,89.0,4,
...,...,...,...,...
1489,468,82.0,5,875292320.0
1490,894,61.0,4,882404572.0
1491,539,962.0,4,879788195.0
1492,639,14.0,5,891239813.0


In [17]:

# Convert your pandas DataFrames into Surprise train/test sets
pseudo_trainset = Dataset.load_from_df(pseudo_train[['user_id', 'item_id', 'rating']], reader).build_full_trainset()

# Fit the model
algo = SVD()
algo.fit(trainset)

# Make predictions
predictions = algo.test(test[['user_id', 'item_id', 'rating']].values)

# Evaluate
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))


RMSE: 0.9721
RMSE: 0.9720677784280938
MAE:  0.7817
MAE: 0.7816602521095437


In [18]:
from collections import defaultdict
import numpy as np

# Group predictions by user
user_errors = defaultdict(list)

for pred in predictions:
    uid = pred.uid  # user id
    err = abs(pred.r_ui - pred.est)  # absolute error
    user_errors[uid].append(err)


In [19]:
user_mae = {uid: np.mean(errors) for uid, errors in user_errors.items()}

# Example: print MAE for first 5 users
for uid, mae in list(user_mae.items())[:5]:
    print(f"User {uid}: MAE = {mae:.4f}")


User 24: MAE = 0.7060
User 539: MAE = 0.8215
User 894: MAE = 0.6714
User 468: MAE = 0.7507
User 500: MAE = 0.7653


In [20]:
user_rmse = {
    uid: np.sqrt(np.mean([e**2 for e in errors]))
    for uid, errors in user_errors.items()
}

# Example: print RMSE for first 5 users
for uid, rmse in list(user_rmse.items())[:5]:
    print(f"User {uid}: RMSE = {rmse:.4f}")


User 24: RMSE = 0.8142
User 539: RMSE = 1.0362
User 894: RMSE = 0.7781
User 468: RMSE = 0.9643
User 500: RMSE = 0.9136


In [21]:
train.groupby('user_id').size().sort_values(ascending=False)

user_id
894    199
500    180
332    147
639    116
468    108
324     55
569     55
24      52
287     52
97      51
423     51
539     47
869     37
31      26
260     22
281     21
266     21
482     19
335     18
811     17
dtype: int64