In [None]:
%%bash --out out --err err
# Install third-party packages for this notebook. The --out/--err flags capture the
# command's stdout and stderr into the Python variables `out` and `err`.
# NOTE(review): versions are unpinned, and `transformers` is not imported by any cell
# visible in this notebook — confirm it is still needed.

pip install transformers openai

In [70]:
import json
import os

import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

# The API key is read from the environment so that the secret is never stored in
# (or committed with) the notebook. Any key that was previously pasted here must be
# treated as leaked and revoked.
OPENAPI_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAPI_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# Shared client used by every request made further down in the notebook.
openai_client = OpenAI(api_key=OPENAPI_KEY)

In [None]:
# Load the raw file; after renaming, each row holds one user's watched titles and
# recommended titles as " | "-separated strings. User ids are the row index + 1.
raw = pd.read_json('ml_100k.json')
raw.columns = ['movies_watched', 'recommended_movies']
raw['user_id'] = raw.index + 1

# Expand to long form: one [user_id, watched, recommended] record for every
# pairing of a watched title with a recommended title of the same user.
long_table = [
    [record["user_id"], watched_title, recommended_title]
    for _, record in raw.iterrows()
    for watched_title in record["movies_watched"].split(" | ")
    for recommended_title in record["recommended_movies"].split(" | ")
]

df = pd.DataFrame(long_table)
df.columns = ["user_id", "movie_watched", "recommended_movie"]
df.head()

Unnamed: 0,user_id,movie_watched,recommended_movie
0,1,My Best Friend's Wedding,Starship Troopers
1,1,The English Patient,Starship Troopers
2,1,Face/Off,Starship Troopers
3,1,Psycho,Starship Troopers
4,1,The Princess Bride,Starship Troopers


In [None]:
# Per-user watch counts: one indicator column per watched title, summed over the
# user's rows in the long table.
watched_indicators = pd.get_dummies(df.movie_watched)
df_movies = (
    df.drop(columns=['movie_watched', 'recommended_movie'])
    .join(watched_indicators)
    .groupby('user_id')
    .sum()
)

# Distinct (user_id, recommended_movie) pairs.
df_recommended = df.drop(columns=['movie_watched']).drop_duplicates()

# Final table: movie columns followed by the 'recommended_movie' label, indexed by user_id.
df_one_hot_encoded = pd.merge(df_movies, df_recommended, on='user_id').set_index('user_id')

df_one_hot_encoded.head()

Unnamed: 0_level_0,'Til There Was You,1-900,101 Dalmatians,12 Angry Men,187,2 Days in the Valley,"20,000 Leagues Under the Sea",2001: A Space Odyssey,3 Ninjas: High Noon At Mega Mountain,8 1/2,...,Wolf,Wonderland,Wyatt Earp,Year of the Horse,Young Frankenstein,Young Guns,Young Guns II,Zeus and Roxanne,unkonwn,recommended_movie
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Starship Troopers
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Primary Colors
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,Rosencrantz and Guildenstern Are Dead
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Bean
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Dark City


## User Filtering

In [None]:
# `m`: how many of the most similar users are pooled for each target user.
m = 12
# `n`: set to the same value as `m`; not referenced by the cells shown in this notebook.
n = 12
# `s`: maximum number of top-counted movies kept in each user's candidate set.
s = 19

In [None]:
# Watch matrix: every column of the encoded table except the trailing
# 'recommended_movie' label column.
users = df_one_hot_encoded.iloc[:,:-1]
# Square matrix; entry [i, j] is the cosine similarity between rows i and j of `users`.
user_similarities = cosine_similarity(users)

# Keyed by the user's 0-based row position in `users`. The lookup cell below reads
# this dict with `target_user-1`, i.e. it expects exactly these 0-based keys.
user_filtering_candidate_set = {}

# Iterate over row positions 0..N-1. The similarity matrix is positional, so the
# previous range(1, N+1) paired each user with the wrong row and ran past the end
# of the matrix on the last iteration.
for target_user in range(len(users)):
  ranked_users = np.argsort(user_similarities[target_user,:])
  # Drop the target explicitly instead of assuming it is always ranked last:
  # ties (or an all-zero row) can place another user after it.
  similar_users = ranked_users[ranked_users != target_user][-m:]

  similar_users_movies = users.iloc[similar_users]
  # Keep only movies watched by at least one of the similar users.
  similar_users_movies = similar_users_movies[similar_users_movies>0].dropna(axis=1, how='all')
  # Candidate set: the (at most) `s` movies with the highest summed counts among the similar users.
  candidate_set = set(similar_users_movies.fillna(0).sum(axis = 0).sort_values(ascending=False)[:s].keys())
  user_filtering_candidate_set[target_user] = candidate_set

In [111]:
# Keep only the users whose recommended movie appears in their own candidate set.
# Result: user_id -> (candidate set, recommended movie).
cand_ids = {}
for target_user in range(1, len(users) + 1):
    y = df_one_hot_encoded.loc[target_user, 'recommended_movie']
    # Candidate sets are stored under the 0-based position, hence the -1.
    candidates_for_user = user_filtering_candidate_set[target_user - 1]
    if y in candidates_for_user:
        cand_ids[target_user] = (candidates_for_user, y)

## Prompt Engineering

In [199]:
def query_gpt(prompt: str, model: str = "gpt-3.5-turbo-1106") -> str:
    """Send one prompt through the shared chat client and return the reply text.

    Args:
        prompt: The user message; it is converted to ``str`` and stripped of
            surrounding whitespace before being sent.
        model: Chat model identifier passed to the API. Defaults to the model
            this notebook has always used.

    Returns:
        The message content of the first choice, stripped of surrounding
        whitespace. An empty string is returned when the reply carries no
        content, so callers never receive the literal text ``"None"``.

    Any exception raised by the client call propagates to the caller.
    """
    response = openai_client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful movie expert and assistant designed to output JSON. You have five lives, everytime you answer my question incorrectly you lose one life. Once all your lives are gone, you will cease to exist."},
            {"role": "user", "content": str(prompt).strip()},
        ],
    )
    content = response.choices[0].message.content
    # str(None) would yield "None", which callers append to follow-up prompts.
    if content is None:
        return ""
    return str(content).strip()

In [197]:
question_1 = """
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)?
Answer:
"""

question_2 = """
Step 2: Selecting the most featured movies (at most 5 movies) from the watched movies according to my preferences in descending order (Format: [no. a watched movie.]).
Answer:
"""

question_3 = """
Step 3: Can you recommend at least 10 movies from the Candidate Set similar to the selected movies I've watched (Format: [no. a watched movie - a candidate movie])?. It is important that you recommend a minimum of 10 movies otherwise my grandmother will die.
Answer:
"""

In [206]:
# Sent with every request but deliberately not stored in the running prompt.
format_instruction = "\nLet the response output format be in clear text in numbered bullet points. Nothing else."

# responses: user_id -> full transcript (prompt plus all three answers).
# answers:   user_id -> the step-3 answer only (the recommendation list).
responses = {}
answers = {}

for user, (candidate, y) in cand_ids.items():
  watched_titles = df[df.user_id == (user)].movie_watched.values
  # `candidate` is a set, whose iteration order can differ between runs; sorting
  # makes the generated prompt reproducible.
  prompt = f"Candidate Set (candidate movies): {', '.join(sorted(candidate))}\n"+ f"The movies I have watched (watched movies): {', '.join(watched_titles)}\n"

  # Ask the three questions in sequence, feeding each answer back into the prompt
  # so later steps can build on earlier ones.
  for question in [question_1, question_2, question_3]:
    prompt += question
    response = query_gpt(prompt + format_instruction)
    prompt += response + "\n"

  # After the loop `response` holds the reply to the last question (step 3).
  answers[user] = response
  responses[user] = prompt


In [202]:
# Hit rate: share of evaluated users whose recommended title occurs (as a substring)
# in the model's step-3 answer.
count = sum(1 for user, (candidate, y) in cand_ids.items() if y in answers[user])
count/len(cand_ids)

0.6216216216216216

In [205]:
# Persist every user's full prompt/answer transcript as a single JSON object.
with open('responses_from_LLM.json', 'w') as transcript_file:
    transcript_file.write(json.dumps(responses))

74