In [1]:
import pandas as pd

In [2]:
train = pd.read_csv('../data/train.csv')

In [3]:
train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,genres,avg_rating
0,259,255,4,874724710,My Best Friend's Wedding (1997),Romance,4.0
1,259,286,4,874724727,"English Patient, The (1996)","Romance, War",4.0
2,259,298,4,874724754,Face/Off (1997),"Action, Sci-Fi, Thriller",4.0
3,259,185,4,874724781,Psycho (1960),"Horror, Romance, Thriller",4.0
4,259,173,4,874724843,"Princess Bride, The (1987)","Action, Adventure, Romance",4.0


In [318]:
user_movies = train.groupby('user_id').apply(lambda x: x['movie_title'].tolist())

In [562]:
def get_user_recent_movies(dataframe, n = 10):
    return dataframe.groupby('user_id').apply(lambda x: x['movie_title'].tail(n).to_list())

user_recent_movies = get_user_recent_movies(train)

In [493]:
def get_user_top_movies(dataframe, n = 10):
    return dataframe.groupby('user_id').apply(lambda x: x.sort_values('rating', ascending=False).head(n)['movie_title'].to_list())

user_top_movies = get_user_top_movies(train)

In [652]:
movie_genres = train.groupby('movie_title').apply(lambda x: x['genres'].unique().tolist()).apply(lambda x: x[0].split(", "))

In [613]:
user_top_movies

user_id
1      [Empire Strikes Back, The (1980), Usual Suspec...
5      [Wrong Trousers, The (1993), This Is Spinal Ta...
6      [Down by Law (1986), Graduate, The (1967), Lon...
8      [Contact (1997), Die Hard (1988), Braveheart (...
10     [Amadeus (1984), Taxi Driver (1976), All About...
                             ...                        
937    [Boot, Das (1981), Star Wars (1977), Dead Man ...
939    [Jackal, The (1997), My Best Friend's Wedding ...
940    [Titanic (1997), Air Force One (1997), Contact...
941    [Toy Story (1995), Lone Star (1996), Close Sha...
943    [Shawshank Redemption, The (1994), Courage Und...
Length: 621, dtype: object

In [663]:
user_top_movies[1]

['Empire Strikes Back, The (1980)',
 'Usual Suspects, The (1995)',
 'Maya Lin: A Strong Clear Vision (1994)',
 'Brazil (1985)',
 'Hoop Dreams (1994)',
 'Blade Runner (1982)',
 'Groundhog Day (1993)',
 'Full Monty, The (1997)',
 'Haunted World of Edward D. Wood Jr., The (1995)',
 'Princess Bride, The (1987)']

In [686]:
user_top_movie_genres = user_top_movies.apply(lambda x: pd.Series(movie_genres.loc[x].sum()).unique().tolist())

In [454]:
user_ratings = train.drop_duplicates(['user_id', 'movie_title']).pivot(index='user_id', columns='movie_title', values='avg_rating').fillna(0)

In [455]:
from scipy.sparse import csr_matrix
sparse_user_ratings = csr_matrix(user_ratings)

In [302]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [459]:
def get_similar_users(sparse_user_ratings, index, n = 10):
    return pd.DataFrame(cosine_similarity(sparse_user_ratings) - np.identity(sparse_user_ratings.shape[0]), index = index, columns = index).apply(lambda x: list(x.sort_values(ascending = False).head(n).index), axis=1)

similar_users = get_similar_users(sparse_user_ratings, user_ratings.index)

In [460]:
similar_users.head()

user_id
1     [916, 457, 268, 435, 429, 823, 301, 276, 889, ...
5      [648, 407, 307, 497, 268, 276, 22, 622, 804, 70]
6       [18, 194, 716, 10, 666, 151, 321, 85, 389, 854]
8      [746, 158, 37, 352, 638, 425, 22, 627, 671, 538]
10      [6, 406, 666, 389, 321, 398, 524, 716, 194, 18]
dtype: object

In [501]:
def get_candidate_movies(similar_users, dataframe, n = 10):
    return similar_users.apply(lambda x: dataframe.loc[dataframe['user_id'].isin(x), 'movie_title'].unique()[:10])

candidate_movies = get_candidate_movies(similar_users, train)

In [687]:
user_prompt_data = pd.concat((user_movies, user_top_movies, user_recent_movies, similar_users, candidate_movies, user_top_movie_genres), axis = 1, keys = ['user_movies', 'user_top_movies', 'user_recent_movies', 'similar_users', 'candidate_movies', 'user_top_movie_genres'])

In [570]:
new_line = '\n'

def createPrompt(each):
    return f"""I am user {each}.
The most recent ten movies I have seen are:
{", ".join(user_prompt_data.loc[each, 'user_recent_movies'])}.
My top rated movies are:
{", ".join(user_prompt_data.loc[each, 'user_top_movies'])}.
The users who are most like me are {", ".join([str(each) for each in user_prompt_data.loc[each, 'similar_users']])}.
The top movies for each of these users are:
{new_line.join([f"{each}: {', '.join(user_prompt_data.loc[each, 'user_top_movies'])}" for each in user_prompt_data.loc[each, 'similar_users']])}.
Please recommend ten movies for me to watch that I have not seen. Provide brackets around your recommendations so I can easily parse them.
For example (['Midnight Cowboy (1969)', 'Lost in Translation (2003)', etc.])"""
    

In [571]:
prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: createPrompt(x))

In [572]:
from openai import OpenAI
from key import OPENAI_API_KEY
client = OpenAI(api_key=OPENAI_API_KEY)

In [593]:
response = prompts.iloc[:50].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [594]:
recommendations = response.apply(lambda x: x.choices[0].message.content.split("[")[1].split("]")[0].split(", "))

In [597]:
response[8]

ChatCompletion(id='chatcmpl-8Tfbc3XUkdfOBkz6zLZYYrtVW6F9j', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content="Here are ten movie recommendations for you:\n\n1. ['The Shawshank Redemption (1994)']\n2. ['Inception (2010)']\n3. ['The Godfather (1972)']\n4. ['Schindler's List (1993)']\n5. ['Fight Club (1999)']\n6. ['The Matrix (1999)']\n7. ['The Dark Knight (2008)']\n8. ['The Lord of the Rings: The Fellowship of the Ring (2001)']\n9. ['Pulp Fiction (1994)']\n10. ['The Silence of the Lambs (1991)']", role='assistant', function_call=None, tool_calls=None))], created=1702082136, model='gpt-3.5-turbo-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=130, prompt_tokens=1057, total_tokens=1187))

In [595]:
recommendations.head()

user_id
1     ['Shawshank Redemption, The (1994)', 'Godfathe...
5     ['Pulp Fiction (1994)', 'The Shawshank Redempt...
6     ['Pulp Fiction (1994)', 'Fight Club (1999)', '...
8                   ['The Shawshank Redemption (1994)']
10                           ['Midnight Cowboy (1969)']
Name: user_id, dtype: object

In [337]:
test = pd.read_csv('../data/test.csv')

In [338]:
user_movies_test = test.groupby('user_id').apply(lambda x: x['movie_title'].tolist())

In [598]:
hits = pd.concat((user_movies_test, recommendations), axis=1).dropna().apply(lambda x: len(set(x.iloc[0]).intersection(x.iloc[1])), axis = 1)

In [599]:
hits

user_id
1     0
13    0
14    0
23    0
30    0
43    0
54    0
58    0
60    0
64    0
dtype: int64

In [734]:
def promptTemplate(each):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.])
Answer: 
"""

In [733]:
def promptTemplate1(each):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: 
"""

def promptTemplate2(each):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: {}.
Step 2: Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line} etc.])
Answer: 
"""

SyntaxError: f-string: empty expression not allowed (1132140648.py, line 20)

In [722]:
prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: promptTemplate(x))

In [723]:
prompts.head()

user_id
1     \nCandidate Set (candidate movies): ['Contact ...
5     \nCandidate Set (candidate movies): ['Contact ...
6     \nCandidate Set (candidate movies): ['Full Mon...
8     \nCandidate Set (candidate movies): ['Contact ...
10    \nCandidate Set (candidate movies): ['Keys to ...
Name: user_id, dtype: object

In [724]:
response = prompts.iloc[:10].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [725]:
response[1]

ChatCompletion(id='chatcmpl-8TgkpqWgLPKUh1MRp0o3sQHSk6BSR', choices=[Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Here are 10 movies recommended for you: [\'Contact (1997)\', \'Air Force One (1997)\', \'Conspiracy Theory (1997)\', \'Liar Liar (1997)\', "Dante\'s Peak (1997)", \'Murder at 1600 (1997)\', \'Scream (1996)\', \'Spawn (1997)\', \'Crash (1996)\', \'Lost Highway (1997)\']', role='assistant', function_call=None, tool_calls=None))], created=1702086551, model='gpt-3.5-turbo-0613', object='chat.completion', system_fingerprint=None, usage=CompletionUsage(completion_tokens=88, prompt_tokens=299, total_tokens=387))

In [737]:
recommendations = response.apply(lambda x: x.choices[0].message.content.split("[")[-1].split("]")[0].split("\n"))

In [739]:
recommendations.head()

user_id
1     ['Contact (1997)', 'Air Force One (1997)', 'Co...
5     ['Contact (1997)', 'Air Force One (1997)', 'Co...
6     ['L.A. Confidential (1997)', 'English Patient,...
8     ['Saint, The (1997)', 'Cop Land (1997)', 'Retu...
10    ['Keys to Tulsa (1997)', 'First Wives Club, Th...
Name: user_id, dtype: object

In [712]:
hits = pd.concat((user_movies_test, recommendations), axis=1).dropna().apply(lambda x: len(set(x.iloc[0]).intersection(x.iloc[1])), axis = 1)

In [713]:
hits

user_id
1     0
13    0
14    0
dtype: int64