In [157]:
import pandas as pd

users = pd.read_csv('ml-100k/u.user', sep='|', encoding='latin-1', names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
items = pd.read_csv('ml-100k/u.item', sep='|', encoding='latin-1', names=['item_id', 'title', 'release_date', 'video_release_date', 'imdb_url', 'unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 'war', 'western'])
ratings = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

In [174]:
ratings

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [254]:
import pandas as pd

# Get number of ratings per item
temp = ratings.groupby('item_id').size()

# Filter item_ids with more than 200 ratings
frequent_item_ids = temp[temp > 100].index

# Choose popular and less popular movies
popular = items[items['item_id'].isin(frequent_item_ids)]
less_popular = items[~items['item_id'].isin(frequent_item_ids)]

# Combine a balanced set of popular and less popular movies
sampled_movies = pd.concat([
    popular.sample(n=min(len(popular), 500), random_state=42),
    less_popular.sample(n=min(len(less_popular), 500), random_state=42)
])

# Get all genre columns
genre_cols = [
    'unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime',
    'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 'musical', 'mystery',
    'romance', 'sci_fi', 'thriller', 'war', 'western'
]

# For each genre, sample N movies (without replacement across genres)
N = 10  # number of movies per genre
genre_samples = []
used_indices = set()

for genre in genre_cols:
    # Filter out already used movies to avoid duplicates
    available_movies = sampled_movies[~sampled_movies.index.isin(used_indices)]

    genre_movies_popular = available_movies[
        (available_movies[genre] == 1) &
        (available_movies['item_id'].isin(frequent_item_ids))
    ]

    genre_movies_less_popular = available_movies[
        (available_movies[genre] == 1) &
        (~available_movies['item_id'].isin(frequent_item_ids))
    ]

    sample_popular = genre_movies_popular.sample(n=min(N // 2, len(genre_movies_popular)), random_state=42)
    sample_less_popular = genre_movies_less_popular.sample(n=min(N // 2, len(genre_movies_less_popular)), random_state=42)

    sample = pd.concat([sample_popular, sample_less_popular])
    genre_samples.append(sample)
    used_indices.update(sample.index)

# Combine all genre samples for final sampled_movies (ensures both popularity and genre diversity)
sampled_movies = pd.concat(genre_samples).drop_duplicates(subset='item_id')

# Include genre tags in the prompt
sampled_movies['genres'] = sampled_movies[genre_cols].apply(
    lambda row: [g for g, v in row.items() if v == 1],
    axis=1
)

titles_and_genres_string = sampled_movies[['title', 'genres']].to_dict(orient='records')

titles_string = '; '.join([f"{movie['title']} ({', '.join(movie['genres'])})" for movie in movie_dicts])

In [255]:
sampled_movies

Unnamed: 0,item_id,title,release_date,video_release_date,imdb_url,unknown,action,adventure,animation,children,...,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western,genres
116,117,"Rock, The (1996)",07-Jun-1996,,"http://us.imdb.com/M/title-exact?Rock,%20The%2...",0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,"[action, adventure, thriller]"
229,230,Star Trek IV: The Voyage Home (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Star%20Trek%2...,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0,"[action, adventure, sci_fi]"
117,118,Twister (1996),10-May-1996,,http://us.imdb.com/M/title-exact?Twister%20(1996),0,1,1,0,0,...,0,0,0,0,0,0,1,0,0,"[action, adventure, thriller]"
244,245,"Devil's Own, The (1997)",26-Mar-1997,,http://us.imdb.com/M/title-exact?Devil%27s%20O...,0,1,0,0,0,...,0,0,0,0,0,0,1,1,0,"[action, drama, thriller, war]"
545,546,Broken Arrow (1996),09-Feb-1996,,http://us.imdb.com/M/title-exact?Broken%20Arro...,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,"[action, thriller]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,621,"Davy Crockett, King of the Wild Frontier (1955)",01-Jan-1955,,http://us.imdb.com/M/title-exact?Davy%20Crocke...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,[western]
1586,1587,Terror in a Texas Town (1958),01-Jan-1958,,http://us.imdb.com/M/title-exact?Terror%20in%2...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,[western]
574,575,City Slickers II: The Legend of Curly's Gold (...,01-Jan-1994,,http://us.imdb.com/M/title-exact?City%20Slicke...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,"[comedy, western]"
839,840,Last Man Standing (1996),20-Sep-1996,,http://us.imdb.com/M/title-exact?Last%20Man%20...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,"[action, drama, western]"


In [256]:
movies_with_many_ratings

Unnamed: 0,item_id,title,release_date,video_release_date,imdb_url,unknown,action,adventure,animation,children,...,fantasy,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602,603,Rear Window (1954),01-Jan-1954,,http://us.imdb.com/M/title-exact?Rear%20Window...,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
654,655,Stand by Me (1986),01-Jan-1986,,http://us.imdb.com/M/title-exact?Stand%20by%20...,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
677,678,Volcano (1997),25-Apr-1997,,http://us.imdb.com/M/title-exact?Volcano%20%28...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
741,742,Ransom (1996),08-Nov-1996,,http://us.imdb.com/M/title-exact?Ransom%20(1996),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [257]:
titles_string

"Rock, The (1996) (action, adventure, thriller); Star Trek IV: The Voyage Home (1986) (action, adventure, sci_fi); Twister (1996) (action, adventure, thriller); Devil's Own, The (1997) (action, drama, thriller, war); Broken Arrow (1996) (action, thriller); Sudden Death (1995) (action); Men With Guns (1997) (action, drama); Jerky Boys, The (1994) (action, comedy); Mirage (1995) (action, thriller); Bloodsport 2 (1995) (action); Lost World: Jurassic Park, The (1997) (action, adventure, sci_fi, thriller); Ben-Hur (1959) (action, adventure, drama); Wizard of Oz, The (1939) (adventure, children, drama, musical); Edge, The (1997) (adventure, thriller); Clear and Present Danger (1994) (action, adventure, thriller); Far From Home: The Adventures of Yellow Dog (1995) (adventure, children); Golden Earrings (1947) (adventure, romance); Fled (1996) (action, adventure); Kull the Conqueror (1997) (action, adventure); Flipper (1996) (adventure, children); Lion King, The (1994) (animation, children, mu

In [258]:
random_users = users.sample(20, random_state=42)
random_users_list = random_users[['user_id', 'age', 'gender', 'occupation', 'zip_code']].to_dict(orient='records')


In [296]:
prompt = f"""
You are a movie recommendation assistant tasked with simulating user preferences in a movie rating system.

You will be given:
- A list of movie entries: each includes a title and genre tags.
- A list of user profiles: each has a user ID, age, gender, occupation, and zip code.

Your job is to:
1. For each user, select **10 movies** that you believe they are **likely to have watched and rated** based on their demographic.
2. Choose movies that reflect a balance of:
   - Personal relevance (based on age, gender, occupation).
   - Diversity (include a mix of genres, not only favorites).
   - Popularity (some well-known and some lesser-known movies).

3. For each selected movie, assign a **realistic rating (1–5)**:
   - Users should have a mix of strong likes (4–5), dislikes (1–2), and average opinions (3).
   - Don’t make every rating a 5. Include a natural variance.
   - Some ratings should reflect random personal taste or emotional bias.

Return your result in **JSON format**, using this structure:

[
  {{
    "user_id": ...,
    "recommendations": [
      {{"title": "...", "rating": ...}},
      ...
    ]
  }},
  ...
]

NOTE: the movie titles should be exactly in the format provided below, including things like the word "The" at the end of the title after the rest of the title


The movie titles are as follows:
{titles_string}

The user profiles are as follows:
{random_users_list}

"""


In [None]:
from google import genai

client = genai.Client(api_key="")

response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=prompt,
)

print(response.text)

```json
[
  {
    "user_id": 97,
    "recommendations": [
      {"title": "Rock, The (1996)", "rating": 5},
      {"title": "Big Lebowski, The (1998)", "rating": 4},
      {"title": "Citizen Kane (1941)", "rating": 3},
      {"title": "GoodFellas (1990)", "rating": 4},
      {"title": "L.A. Confidential (1997)", "rating": 5},
      {"title": "Singin' in the Rain (1952)", "rating": 3},
      {"title": "North by Northwest (1959)", "rating": 4},
      {"title": "Blade Runner (1982)", "rating": 4},
      {"title": "Third Man, The (1949)", "rating": 3},
      {"title": "Wizard of Oz, The (1939)", "rating": 5}
    ]
  },
  {
    "user_id": 266,
    "recommendations": [
      {"title": "Wizard of Oz, The (1939)", "rating": 5},
      {"title": "Beauty and the Beast (1991)", "rating": 4},
      {"title": "Mr. Holland's Opus (1995)", "rating": 4},
      {"title": "Emma (1996)", "rating": 5},
      {"title": "Casablanca (1942)", "rating": 5},
      {"title": "North by Northwest (1959)", "rating":

In [301]:
text_response = response.text
print(text_response)

```json
[
  {
    "user_id": 97,
    "recommendations": [
      {"title": "Rock, The (1996)", "rating": 5},
      {"title": "Big Lebowski, The (1998)", "rating": 4},
      {"title": "Citizen Kane (1941)", "rating": 3},
      {"title": "GoodFellas (1990)", "rating": 4},
      {"title": "L.A. Confidential (1997)", "rating": 5},
      {"title": "Singin' in the Rain (1952)", "rating": 3},
      {"title": "North by Northwest (1959)", "rating": 4},
      {"title": "Blade Runner (1982)", "rating": 4},
      {"title": "Third Man, The (1949)", "rating": 3},
      {"title": "Wizard of Oz, The (1939)", "rating": 5}
    ]
  },
  {
    "user_id": 266,
    "recommendations": [
      {"title": "Wizard of Oz, The (1939)", "rating": 5},
      {"title": "Beauty and the Beast (1991)", "rating": 4},
      {"title": "Mr. Holland's Opus (1995)", "rating": 4},
      {"title": "Emma (1996)", "rating": 5},
      {"title": "Casablanca (1942)", "rating": 5},
      {"title": "North by Northwest (1959)", "rating":

In [302]:
import json
import pandas as pd
import random

item_title_to_id = dict(zip(items['title'], items['item_id']))


# Clean the JSON string if it starts with ``` or ```json
cleaned = text_response.strip()
if cleaned.startswith("```json"):
    cleaned = cleaned[7:]
if cleaned.startswith("```"):
    cleaned = cleaned[3:]
if cleaned.endswith("```"):
    cleaned = cleaned[:-3]
cleaned = cleaned.strip()

# Parse the JSON string
parsed = json.loads(cleaned)

# Flatten into a list of rating entries
ratings_data = []
for user in parsed:
    user_id = user["user_id"]
    for rec in user["recommendations"]:
        item_id = item_title_to_id.get(rec["title"], None)
        if item_id is None:
            print(f"Warning: Item title '{rec['title']}' not found in items DataFrame.")
            continue
        ratings_data.append({
            "user_id": user_id,
            "item_id": int(item_id),  # Use title to get item_id
            "rating": rec["rating"]
        })

# Convert to DataFrame
ratings_df = pd.DataFrame(ratings_data)


ratings_df


Unnamed: 0,user_id,item_id,rating
0,97,117,5
1,97,902,4
2,97,134,3
3,97,182,4
4,97,302,5
...,...,...,...
195,335,176,4
196,335,597,3
197,335,205,3
198,335,483,4


In [303]:
random_users

Unnamed: 0,user_id,age,gender,occupation,zip_code
96,97,43,M,artist,98006
265,266,62,F,administrator,78756
810,811,40,F,educator,73013
23,24,21,F,artist,94533
30,31,24,M,artist,10003
280,281,15,F,student,6059
568,569,34,M,educator,91903
259,260,40,F,artist,89801
331,332,20,M,student,40504
323,324,21,F,student,2176


In [304]:
# For each user in random_users, leave 5 ratings in interactions_of_other_users
interactions_of_random_users = []
interactions_of_other_users = ratings.copy()

for uid in random_users['user_id']:
    user_ratings = ratings[ratings['user_id'] == uid]
    leave_out = user_ratings.sample(1, random_state=42) if len(user_ratings) >= 5 else user_ratings
    interactions_of_random_users.append(user_ratings[~user_ratings.index.isin(leave_out.index)])
    interactions_of_other_users = interactions_of_other_users.drop(user_ratings[~user_ratings.index.isin(leave_out.index)].index)


interactions_of_random_users = pd.concat(interactions_of_random_users)

In [308]:
from surprise import Dataset, Reader, SVD, accuracy

# Create a Reader with your rating scale
reader = Reader(rating_scale=(1, 5))

# Convert your pandas DataFrames into Surprise train/test sets
trainset = Dataset.load_from_df(interactions_of_other_users[['user_id', 'item_id', 'rating']], reader).build_full_trainset()

# Fit the model
algo = SVD()
algo.fit(trainset)

# Make predictions
predictions = algo.test(interactions_of_random_users[['user_id', 'item_id', 'rating']].values)

# Evaluate
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))


RMSE: 0.9962
RMSE: 0.9961677288983305
MAE:  0.7840
MAE: 0.7840285228442767


In [306]:
pseudo_train = pd.concat([ratings_df, interactions_of_other_users], ignore_index=True)
pseudo_train

Unnamed: 0,user_id,item_id,rating,timestamp
0,97,117,5,
1,97,902,4,
2,97,134,3,
3,97,182,4,
4,97,302,5,
...,...,...,...,...
98597,880,476,3,880175444.0
98598,716,204,5,879795543.0
98599,276,1090,1,874795795.0
98600,13,225,2,882399156.0


In [307]:

# Convert your pandas DataFrames into Surprise train/test sets
pseudo_trainset = Dataset.load_from_df(pseudo_train[['user_id', 'item_id', 'rating']], reader).build_full_trainset()

# Fit the model
algo = SVD()
algo.fit(pseudo_trainset)

# Make predictions
predictions = algo.test(interactions_of_random_users[['user_id', 'item_id', 'rating']].values)

# Evaluate
print("RMSE:", accuracy.rmse(predictions))
print("MAE:", accuracy.mae(predictions))


RMSE: 0.9718
RMSE: 0.9718259728048414
MAE:  0.7595
MAE: 0.7594894791873297
