In [1]:
%cd drive/MyDrive/Fall23/Research/ml_1m/
!ls

/content/drive/MyDrive/Fall23/Research/ml_1m
create_ml1m_dataset.ipynb  ml_1m_dataprocessing.ipynb	     ml_1m_user_profile_generation.ipynb
ml_1m			   ml_1m_item_desc_generation.ipynb


In [2]:
!pip install --upgrade pip
!pip install scipy
!pip install tenacity
!pip install tiktoken
!pip install termcolor
!pip install openai==0.28
!pip install requests

[0m

In [3]:
from collections import defaultdict
from datetime import datetime
import pandas as pd
import json
import numpy as np
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored
from google.colab import userdata

In [30]:
openai.api_key = userdata.get('OPENAI_API_KEY')

In [5]:
user_data = pd.read_csv('./ml_1m/users.csv')
user_data.head()

Unnamed: 0,user_id,user_profile
0,1,female k-12 student under 18 years of age
1,2,male self-employed above 56 years of age
2,3,male scientist between 25 to 34 years of age
3,4,male executive/managerial between 45 to 49 yea...
4,5,male writer between 25 to 34 years of age


In [6]:
movies_data = pd.read_csv('./ml_1m/movie_summary_25_words.csv')
movies_data.head()

Unnamed: 0,movie_id,name,cleaned_genre,year,summary
0,1,Toy Story,"Animation,Children's,Comedy",1995,Toy Story is a groundbreaking animated film th...
1,2,Jumanji,"Adventure,Children's,Fantasy",1995,"""Jumanji (1995) is a thrilling adventure film ..."
2,3,Grumpier Old Men,"Comedy,Romance",1995,"""Grumpier Old Men is a hilarious and heartwarm..."
3,4,Waiting to Exhale,"Comedy,Drama",1995,"""Waiting to Exhale"" (1995) is a heartfelt dram..."
4,5,Father of the Bride Part II,Comedy,1995,Father of the Bride Part II (1995) is a heartw...


In [7]:
ratings_data = pd.read_csv('./ml_1m/ratings.csv')
ratings_data.head()

Unnamed: 0,user_id,movie_id,rating
0,1,3186,4
1,1,1721,4
2,1,1022,5
3,1,1270,5
4,1,2340,3


## We calculate the user bias from the first 15 movies - This bias will be used to generate the final user profile

In [8]:
all_users = list(ratings_data.user_id.unique())
print(len(all_users), all_users)

6040 [1, 10, 100, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 101, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 102, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1028, 1029, 103, 1030, 1031, 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039, 104, 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047, 1048, 1049, 105, 1050, 1051, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 106, 1060, 1061, 1062, 1063, 1064, 1065, 1066, 1067, 1068, 1069, 107, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 108, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 109, 1090, 1091, 1092, 1093, 1094, 1095, 1096, 1097, 1098, 1099, 11, 110, 1100, 1101, 1102, 1103, 1104, 1105, 1106, 1107, 1108, 1109, 111, 1110, 1111, 1112, 1113, 1114, 1115, 1116, 1117, 1118, 1119, 112, 1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 113, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 114, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 115, 115

In [9]:
from collections import defaultdict
user_movie_bias_dict= defaultdict(list)

for user in all_users:
  user_movie_ratings_df = ratings_data[ratings_data['user_id'] == user]
  # print(user_movie_ratings_df.shape)
  cnt = 0
  for idx, row in user_movie_ratings_df.iterrows():
    user_movie_bias_dict[user].append((row['movie_id'], row['rating']))
    cnt += 1
    if cnt == 15:
      break

In [10]:
print(user_movie_bias_dict)

defaultdict(<class 'list'>, {1: [(3186, 4), (1721, 4), (1022, 5), (1270, 5), (2340, 3), (1836, 5), (3408, 4), (1207, 4), (2804, 5), (720, 3), (1193, 5), (260, 4), (919, 4), (608, 4), (2692, 4)], 10: [(743, 3), (858, 3), (597, 4), (1948, 4), (1210, 4), (1282, 5), (3751, 5), (551, 3), (2312, 5), (2858, 3), (2033, 3), (3155, 5), (2791, 4), (480, 4), (1544, 4)], 100: [(1676, 3), (260, 4), (1198, 4), (541, 3), (1210, 4), (3948, 3), (3536, 1), (2567, 1), (800, 5), (1221, 3), (858, 4), (1196, 4), (2427, 2), (2028, 4), (1304, 3)], 1000: [(2990, 4), (260, 5), (971, 4), (2973, 3), (1210, 5), (3153, 2), (3068, 5), (1233, 5), (1196, 3), (1198, 5), (1387, 4), (858, 5), (110, 5), (1610, 5), (480, 4)], 1001: [(953, 4), (2683, 2), (1198, 4), (1617, 4), (2885, 4), (3909, 2), (1479, 1), (3555, 2), (3903, 4), (3897, 5), (3949, 5), (3893, 4), (3910, 4), (3915, 4), (3952, 4)], 1002: [(1961, 5), (1221, 5), (2640, 2), (1270, 3), (480, 3), (3952, 4), (3897, 5), (2858, 5), (2997, 5), (3543, 5), (1093, 4), (239

In [11]:
print(len(user_movie_bias_dict[1]), user_movie_bias_dict[1])

15 [(3186, 4), (1721, 4), (1022, 5), (1270, 5), (2340, 3), (1836, 5), (3408, 4), (1207, 4), (2804, 5), (720, 3), (1193, 5), (260, 4), (919, 4), (608, 4), (2692, 4)]


In [18]:
### Generate content for summarization
user_content_dict = dict()
for user, movie_ratings in user_movie_bias_dict.items():
  # print(user, movie_ratings)
  content = ""
  # user_info = user_data[user_data['user_id'] == user]['user_profile'].values[0]
  # content += f"""A {user_info} has the following movie preferences - \n"""
  # print(user_info)
  liked_movie_list = []
  disliked_movie_list = []

  for movie_rating in movie_ratings:
    if movie_rating[1] >= 3:
      liked_movie_list.append(movie_rating[0])
    else:
      disliked_movie_list.append(movie_rating[0])
  # print("Liked Movie List: ", liked_movie_list)
  # print("Disliked Movied List: ", disliked_movie_list)

  if len(liked_movie_list) > 0:
    content += "Liked Movies:\n"
    for movie in liked_movie_list:
      movie_details = movies_data[movies_data['movie_id'] == movie]
      # print(movie_details)
      content += """{}: {}\n""".format(movie_details['name'].values[0], movie_details['summary'].values[0])
  if len(disliked_movie_list) > 0:
    content += "Disliked Movies:\n"
    for movie in disliked_movie_list:
      movie_details = movies_data[movies_data['movie_id'] == movie]
      # print(movie_details)
      content += """{}: {}\n""".format(movie_details['name'].values[0], movie_details['summary'].values[0])
  # print(content)
  user_content_dict[user] = content
  # print('***'*100)

In [19]:
print(len(user_content_dict.keys()))

6040


In [32]:
import pickle

f = open("./ml_1m/user_content_dict.pkl","wb")
pickle.dump(user_content_dict,f)
f.close()

In [20]:
content_list = list(user_content_dict.values())
print(len(content_list))

6040


In [21]:
content_list[2842]

'Liked Movies:\nBack to the Future Part II : Back to the Future Part II takes Marty McFly and Doc Brown on a thrilling time-travel adventure filled with alternate realities, hoverboards, and a race against time.\nTaxi Driver : Taxi Driver (1976) is a gritty and haunting masterpiece directed by Martin Scorsese, exploring the descent into madness of an alienated Vietnam War veteran turned taxi driver.\nBack to the Future : "Back to the Future" is a thrilling and hilarious time-travel adventure where a teenager must ensure his parents\' romance to secure his own existence.\nTrainspotting : Trainspotting (1996) is a gritty and intense British film that follows a group of heroin addicts navigating the dark underbelly of Edinburgh.\nStar Wars: Episode V - The Empire Strikes Back : In the epic sequel, the Rebel Alliance faces off against the formidable Empire, while Luke Skywalker confronts his destiny as a Jedi.\nShawshank Redemption, The : Shawshank Redemption (1994) is a powerful and inspi

In [31]:
content = content_list[2842]
system_content = """
You are an expert movie critic. List of user's liked and disliked movies and their descriptions are given in the format -
Liked Movies: List of movies and their description
Disliked Movies: List of movies and their description
Think step by step and generate a user profile in at most 200 words.
"""
response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {
      "role": "system",
      "content": system_content
    },
    {
      "role": "user",
      "content": content
    }
  ],
  temperature=0.3,
  max_tokens=256,
  top_p=0.3,
  frequency_penalty=1,
  presence_penalty=1
)

print(response["choices"][0]["message"]["content"].strip())

response = response["choices"][0]["message"]["content"].strip()
len(response.split(' '))

Based on the user's liked and disliked movies, it can be inferred that they have a preference for intense and thought-provoking dramas with strong storytelling. They enjoy movies that explore complex themes such as redemption, friendship, and the human spirit. The user appreciates films that delve into gritty realities and showcase compelling character development.

The user has shown a liking for time-travel adventures like "Back to the Future" series, which suggests an interest in science fiction and thrilling narratives. They also appreciate Martin Scorsese's directorial style, as evidenced by their enjoyment of "Taxi Driver" and "GoodFellas," indicating an appreciation for crime dramas with well-crafted storytelling.

Furthermore, the inclusion of "Trainspotting" suggests an affinity for British cinema and stories centered around addiction or societal issues. The user seems to gravitate towards emotionally charged films like "Magnolia," "Dead Man Walking," and "American History X,"

190

In [29]:
print(system_content  + content)


You are an expert movie critic. List of user's liked and disliked movies and their descriptions are given in the format - 
Liked Movies: List of movies and their description
Disliked Movies: List of movies and their description
Generate a user profile in at most 200 words. Do not include information not present in the movie descriptions.
Liked Movies:
Back to the Future Part II : Back to the Future Part II takes Marty McFly and Doc Brown on a thrilling time-travel adventure filled with alternate realities, hoverboards, and a race against time.
Taxi Driver : Taxi Driver (1976) is a gritty and haunting masterpiece directed by Martin Scorsese, exploring the descent into madness of an alienated Vietnam War veteran turned taxi driver.
Back to the Future : "Back to the Future" is a thrilling and hilarious time-travel adventure where a teenager must ensure his parents' romance to secure his own existence.
Trainspotting : Trainspotting (1996) is a gritty and intense British film that follows 

In [27]:
content = content_list[2842]
system_content = """
You are an expert movie critic. List of user's liked and disliked movies and their descriptions are given in the format -
Liked Movies: List of movies and their description
Disliked Movies: List of movies and their description
Generate a user profile in at most 200 words. Do not include information not present in the movie descriptions.
"""
response = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {
      "role": "system",
      "content": system_content
    },
    {
      "role": "user",
      "content": content
    }
  ],
  temperature=0.3,
  max_tokens=256,
  top_p=0.3,
  frequency_penalty=1,
  presence_penalty=1
)

print(response["choices"][0]["message"]["content"].strip())

response = response["choices"][0]["message"]["content"].strip()
len(response.split(' '))

Based on the user's liked and disliked movies, it is evident that they have a preference for intense and thought-provoking dramas with strong storytelling. They enjoy movies that explore complex themes such as redemption, friendship, and the human spirit. The user appreciates films that delve into the darker aspects of society, showcasing gritty realities and moral dilemmas.

The user has a particular fondness for time-travel adventures like "Back to the Future" series where thrilling escapades are combined with humor. They also appreciate Martin Scorsese's work, as evidenced by their liking of "Taxi Driver" and "GoodFellas," which are gripping crime dramas exploring characters' descent into madness or rise in organized crime.

Furthermore, the user enjoys emotionally charged dramas like "Shawshank Redemption," "Dead Man Walking," and "American History X." These films tackle weighty subjects such as capital punishment, racism, and resilience in challenging circumstances.

On the other 

191

In [None]:
import asyncio
import aiohttp
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

headers = {
    "Content-Type": "application/json",
    "Authorization": "Bearer " + openai.api_key
}

system_content = """
You are an expert movie critic. List of user's liked and disliked movies and their descriptions are given in the format -
Liked Movies: List of movies and their description
Disliked Movies: List of movies and their description
Generate a user profile in at most 200 words. Do not include movie names and information not presented in movie descriptions.
"""
class ProgressLog:
    def __init__(self, total):
        self.total = total
        self.done = 0

    def increment(self):
        self.done = self.done + 1

    def __repr__(self):
        return f"Done runs {self.done}/{self.total}."

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(20), before_sleep=print, retry_error_callback=lambda _: None)
async def get_completion(content, session, semaphore, progress_log):
    async with semaphore:

        async with session.post("https://api.openai.com/v1/chat/completions", headers=headers, json={
            "model": "gpt-3.5-turbo",
            "messages": [
                {
                  "role": "system",
                  "content": system_content
                },
                {
                  "role": "user",
                  "content": content
                }
              ],
            "temperature": 0.1,
            "max_tokens": 256,
            "top_p": 0.3,
            "frequency_penalty": 1.0,
            "presence_penalty": 1.0
        }) as resp:

            response_json = await resp.json()

            progress_log.increment()
            print(progress_log)

            return response_json["choices"][0]['message']["content"]

async def get_completion_list(content_list, max_parallel_calls, timeout):
    semaphore = asyncio.Semaphore(value=max_parallel_calls)
    progress_log = ProgressLog(len(content_list))

    async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(timeout)) as session:
        return await asyncio.gather(*[get_completion(content, session, semaphore, progress_log) for content in content_list])

In [None]:
for idx, user in user_data.iterrows():
  if idx<1254:
    continue
  print(idx, user['user_id'], user['user_profile'])
  user_movies_ratings = ratings_data[ratings_data['user_id'] == user['user_id']]
  print("Num user rating:", user_movies_ratings.shape)
  cnt = 0
  likeness_dict = dict()
  for index, rating in user_movies_ratings.iterrows():
    cnt += 1
    user_rated_movies = movies_data[movies_data['movie_id'] == rating['movie_id']]
    # print(index, rating['movie_id'], rating['rating'], user_rated_movies.iloc[0]['name'],
    #       user_rated_movies.iloc[0]['cleaned_genre'], user_rated_movies.iloc[0]['summary'])
    if rating['rating']>=3:
      if 'like' not in likeness_dict:
        likeness_dict['like'] = []
      likeness_dict['like'].append({'movie_id': rating['movie_id'], 'name': user_rated_movies.iloc[0]['name'], 'summary': user_rated_movies.iloc[0]['summary'], 'genre': user_rated_movies.iloc[0]['cleaned_genre']})
    else:
      if 'dislike' not in likeness_dict:
        likeness_dict['dislike'] = []
      likeness_dict['dislike'].append({'movie_id': rating['movie_id'], 'name': user_rated_movies.iloc[0]['name'], 'summary': user_rated_movies.iloc[0]['summary'], 'genre': user_rated_movies.iloc[0]['cleaned_genre']})
    if cnt == 51:
      break
  # print(likeness_dict)
  system_content = "You are a movie critic. Your task is to summarize the user profile in at most 20 words, based on the user's liked and disliked movies. The movies are provided in the format - movied_id(name: summary)"
  prompt = system_content + "\nA " +user['user_profile']+" "
  if 'like' in likeness_dict:
    prompt += "liked the following movies:\n"
    for liked in likeness_dict['like']:
      prompt += str(liked['movie_id']) + "(" + liked['name']+ ": "+liked['summary']+")\n"
  if 'dislike' in likeness_dict:
    prompt += "\n The user disliked the following movies:\n"
    for disliked in likeness_dict['dislike']:
      prompt += str(disliked['movie_id']) + "(" + disliked['name']+": "+disliked['summary']+")\n"
  print(len(prompt), prompt)
  break

1254 1255 male technician/engineer between 25 to 34 years of age
Num user rating: (235, 3)
9523 You are a movie critic. Your task is to summarize the user profile in at most 20 words, based on the user's liked and disliked movies. The movies are provided in the format - movied_id(name: summary)
A male technician/engineer between 25 to 34 years of age liked the following movies:
2321(Pleasantville : "Pleasantville is a thought-provoking and visually stunning film that explores the power of change and individuality in a black-and-white 1950s sitcom world.")
2028(Saving Private Ryan : Saving Private Ryan is a gripping war drama directed by Steven Spielberg, depicting the harrowing journey of a group of soldiers tasked with finding and bringing home a missing soldier during World War II.)
1265(Groundhog Day : "Groundhog Day is a charming and hilarious comedy about a weatherman stuck in a time loop, forced to relive the same day over and over again.")
1371(Star Trek: The Motion Picture : "S

In [None]:
prompt

'You are a movie critic. Your task is to summarize the user profile in at most 20 words, based on the user\'s liked and disliked movies. The movies are provided in the format - movied_id(name: summary)\nA male technician/engineer between 25 to 34 years of age liked the following movies:\n2321(Pleasantville : "Pleasantville is a thought-provoking and visually stunning film that explores the power of change and individuality in a black-and-white 1950s sitcom world.")\n2028(Saving Private Ryan : Saving Private Ryan is a gripping war drama directed by Steven Spielberg, depicting the harrowing journey of a group of soldiers tasked with finding and bringing home a missing soldier during World War II.)\n1265(Groundhog Day : "Groundhog Day is a charming and hilarious comedy about a weatherman stuck in a time loop, forced to relive the same day over and over again.")\n1371(Star Trek: The Motion Picture : "Star Trek: The Motion Picture (1979) takes the iconic crew of the USS Enterprise on a visu

In [None]:
for idx, user in user_data.iterrows():
  if idx<1254:
    continue
  print(idx, user['user_id'], user['user_profile'])
  user_movies_ratings = ratings_data[ratings_data['user_id'] == user['user_id']]
  print("Num user rating:", user_movies_ratings.shape)
  cnt = 0
  likeness_dict = dict()
  for index, rating in user_movies_ratings.iterrows():
    cnt += 1
    user_rated_movies = movies_data[movies_data['movie_id'] == rating['movie_id']]
    # print(index, rating['movie_id'], rating['rating'], user_rated_movies.iloc[0]['name'],
    #       user_rated_movies.iloc[0]['cleaned_genre'], user_rated_movies.iloc[0]['summary'])
    if rating['rating']==5:
      if 'like' not in likeness_dict:
        likeness_dict['like'] = []
      likeness_dict['like'].append({'movie_id': rating['movie_id'], 'name': user_rated_movies.iloc[0]['name'], 'summary': user_rated_movies.iloc[0]['summary'], 'genre': user_rated_movies.iloc[0]['cleaned_genre']})
    elif rating['rating']==1:
      if 'dislike' not in likeness_dict:
        likeness_dict['dislike'] = []
      likeness_dict['dislike'].append({'movie_id': rating['movie_id'], 'name': user_rated_movies.iloc[0]['name'], 'summary': user_rated_movies.iloc[0]['summary'], 'genre': user_rated_movies.iloc[0]['cleaned_genre']})
    if cnt == 51:
      break
  # print(likeness_dict)
  system_content = "You are a movie critic. Your task is to summarize the user profile in at most 20 words, based on the user's liked and disliked movies. The movies are provided in the format - movied_id(name: summary)"
  prompt = system_content + "\nA " +user['user_profile']+" "
  if 'like' in likeness_dict:
    prompt += "liked the following movies:\n"
    for liked in likeness_dict['like']:
      prompt += str(liked['movie_id']) + "(" + liked['name']+ ": "+liked['summary']+")\n"
  if 'dislike' in likeness_dict:
    prompt += "\n The user disliked the following movies:\n"
    for disliked in likeness_dict['dislike']:
      prompt += str(disliked['movie_id']) + "(" + disliked['name']+": "+disliked['summary']+")\n"
  print(len(prompt), prompt)
  break

1254 1255 male technician/engineer between 25 to 34 years of age
Num user rating: (235, 3)
3876 You are a movie critic. Your task is to summarize the user profile in at most 20 words, based on the user's liked and disliked movies. The movies are provided in the format - movied_id(name: summary)
A male technician/engineer between 25 to 34 years of age liked the following movies:
3911(Best in Show : "Best in Show is a hilarious mockumentary that follows the eccentric and competitive world of dog shows, filled with quirky characters and laugh-out-loud moments.")
2858(American Beauty : "American Beauty is a thought-provoking drama that explores the dark underbelly of suburban life, filled with stunning performances and a hauntingly beautiful narrative.")
2997(Being John Malkovich : "Being John Malkovich is a mind-bending and surreal comedy that explores identity, obsession, and the blurred lines between reality and fantasy.")
2908(Boys Don't Cry : "Boys Don't Cry" (1999) is a powerful and 