In [880]:
import pandas as pd

In [881]:
train = pd.read_csv('../data/train.csv')

In [882]:
train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,genres,avg_rating
0,259,255,4,874724710,My Best Friend's Wedding (1997),Romance,4.0
1,259,286,4,874724727,"English Patient, The (1996)","Romance, War",4.0
2,259,298,4,874724754,Face/Off (1997),"Action, Sci-Fi, Thriller",4.0
3,259,185,4,874724781,Psycho (1960),"Horror, Romance, Thriller",4.0
4,259,173,4,874724843,"Princess Bride, The (1987)","Action, Adventure, Romance",4.0


In [883]:
user_movies = train.groupby('user_id').apply(lambda x: x['movie_title'].tolist())

In [884]:
def get_user_recent_movies(dataframe, n = 10):
    return dataframe.groupby('user_id').apply(lambda x: x['movie_title'].tail(n).to_list())

user_recent_movies = get_user_recent_movies(train)

In [885]:
def get_user_top_movies(dataframe, n = 10):
    return dataframe.groupby('user_id').apply(lambda x: x.sort_values('rating', ascending=False).head(n)['movie_title'].to_list())

user_top_movies = get_user_top_movies(train)

In [886]:
movie_genres = train.groupby('movie_title').apply(lambda x: x['genres'].unique().tolist()).apply(lambda x: x[0].split(", "))

In [887]:
user_top_movies.head()

user_id
1     [Empire Strikes Back, The (1980), Usual Suspec...
5     [Wrong Trousers, The (1993), This Is Spinal Ta...
6     [Down by Law (1986), Graduate, The (1967), Lon...
8     [Contact (1997), Die Hard (1988), Braveheart (...
10    [Amadeus (1984), Taxi Driver (1976), All About...
dtype: object

In [888]:
user_top_movie_genres = user_top_movies.apply(lambda x: pd.Series(movie_genres.loc[x].sum()).unique().tolist())

In [889]:
user_ratings = train.drop_duplicates(['user_id', 'movie_title']).pivot(index='user_id', columns='movie_title', values='avg_rating').fillna(0)

In [890]:
from scipy.sparse import csr_matrix
sparse_user_ratings = csr_matrix(user_ratings)

In [891]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [892]:
def get_similar_users(sparse_user_ratings, index, n = 10):
    return pd.DataFrame(cosine_similarity(sparse_user_ratings) - np.identity(sparse_user_ratings.shape[0]), index = index, columns = index).apply(lambda x: list(x.sort_values(ascending = False).head(n).index), axis=1)

similar_users = get_similar_users(sparse_user_ratings, user_ratings.index)

In [893]:
similar_users.head()

user_id
1     [916, 457, 268, 435, 429, 823, 301, 276, 889, ...
5      [648, 407, 307, 497, 268, 276, 22, 622, 804, 70]
6       [18, 194, 716, 10, 666, 151, 321, 85, 389, 854]
8      [746, 158, 37, 352, 638, 425, 22, 627, 671, 538]
10      [6, 406, 666, 389, 321, 398, 524, 716, 194, 18]
dtype: object

In [894]:
def get_candidate_movies(similar_users, dataframe, n = 20):
    return similar_users.apply(lambda x: list(dataframe.loc[dataframe['user_id'].isin(x)].groupby('movie_title').sum().sort_values('rating', ascending = False).index[:n]))

candidate_movies = get_candidate_movies(similar_users, train)

In [895]:
user_prompt_data = pd.concat((user_movies, user_top_movies, user_recent_movies, similar_users, candidate_movies, user_top_movie_genres), axis = 1, keys = ['user_movies', 'user_top_movies', 'user_recent_movies', 'similar_users', 'candidate_movies', 'user_top_movie_genres'])

Collaborative Filtering Prompt

In [896]:
new_line = '\n'

def collabPrompt(each, user_prompt_data, new_line = '\n'):
    return f"""I am user {each}.
The most recent ten movies I have seen are:
{", ".join(user_prompt_data.loc[each, 'user_recent_movies'])}.
My top rated movies are:
{", ".join(user_prompt_data.loc[each, 'user_top_movies'])}.
The users who are most like me are {", ".join([str(each) for each in user_prompt_data.loc[each, 'similar_users']])}.
The top movies for each of these users are:
{new_line.join([f"{each}: {', '.join(user_prompt_data.loc[each, 'user_top_movies'])}" for each in user_prompt_data.loc[each, 'similar_users']])}.
Please recommend ten movies for me to watch that I have not seen. Provide brackets around your recommendations so I can easily parse them.
For example (['Midnight Cowboy (1969)', 'Lost in Translation (2003)', etc.])"""
    

In [897]:
collab_prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: collabPrompt(x, user_prompt_data))

In [898]:
from openai import OpenAI
from key import OPENAI_API_KEY
client = OpenAI(api_key=OPENAI_API_KEY)

In [899]:
collab_response = collab_prompts.iloc[:10].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [900]:
collab_recommendations = collab_response.apply(lambda x: x.choices[0].message.content.split("[")[1].split("]")[0].split(", "))

In [901]:
collab_recommendations.head()

user_id
1                  ['Shawshank Redemption, The (1994)']
5                               ['Pulp Fiction (1994)']
6     ['Pulp Fiction (1994)', 'Fight Club (1999)', '...
8     ['The Shawshank Redemption (1994)', 'The Godfa...
10           ['One Flew Over the Cuckoo's Nest (1975)']
Name: user_id, dtype: object

In [902]:
test = pd.read_csv('../data/test.csv')

In [903]:
user_movies_test = test.groupby('user_id').apply(lambda x: x['movie_title'].tolist())

In [904]:
collab_hits = pd.concat((user_movies_test, collab_recommendations), axis=1).dropna().apply(lambda x: len(set(x.iloc[0]).intersection(x.iloc[1])), axis = 1)

Prompt Similar to Zero-Shot Paper (reduced to one prompt, added genres)

In [905]:
def genrePrompts(each, user_prompt_data, new_line = '\n'):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?.
Please use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.])
Answer: 
"""

In [906]:
genre_prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: genrePrompts(x, user_prompt_data))

In [907]:
genre_prompts.head()

user_id
1     \nCandidate Set (candidate movies): ['Chasing ...
5     \nCandidate Set (candidate movies): ['Raiders ...
6     \nCandidate Set (candidate movies): ['Casablan...
8     \nCandidate Set (candidate movies): ['Blade Ru...
10    \nCandidate Set (candidate movies): ['Casablan...
Name: user_id, dtype: object

In [908]:
genre_response = genre_prompts.iloc[:10].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [909]:
genre_recommendations = genre_response.apply(lambda x: x.choices[0].message.content.split("[")[-1].split("]")[0].split("\n"))

In [910]:
genre_recommendations.head()

user_id
1     [Chasing Amy (1997), Raiders of the Lost Ark (...
5     [Monty Python and the Holy Grail (1974), Princ...
6     [Casablanca (1942), Wizard of Oz, The (1939), ...
8     [Blade Runner (1982), Empire Strikes Back, The...
10    [Bonnie and Clyde (1967), To Catch a Thief (19...
Name: user_id, dtype: object

In [911]:
genre_hits = pd.concat((user_movies_test, genre_recommendations), axis=1).dropna().apply(lambda x: len(set(x.iloc[0]).intersection(x.iloc[1])), axis = 1)

Prompt Similar to Zero-Shot Paper (uses two prompts, adds genres)

In [912]:
def twoStepPrompt1(each, user_prompt_data):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: 
"""

def twoStepPrompt2(each, user_prompt_data, response1):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: {response1.loc[each]}.
Step 2: Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?
Please use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.])
Answer: 
"""

In [913]:
prompt1 = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: twoStepPrompt1(x, user_prompt_data))

In [914]:
response1 = prompt1.iloc[:10].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [915]:
response1.head()

user_id
1     ChatCompletion(id='chatcmpl-8TvrYt4GM67j4WWPqJ...
5     ChatCompletion(id='chatcmpl-8TvradtDvl8WMi185l...
6     ChatCompletion(id='chatcmpl-8TvreKL9UP4B3XDbja...
8     ChatCompletion(id='chatcmpl-8TvrgJ9bEnU1Rh5393...
10    ChatCompletion(id='chatcmpl-8TvrkbhFOnX8wffxUB...
Name: user_id, dtype: object

In [936]:
prompt2 = pd.Series(user_prompt_data.index, index = user_prompt_data.index).iloc[:10].apply(lambda x: twoStepPrompt2(x, user_prompt_data, response1))

In [937]:
response2 = prompt2.apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [938]:
twoStep_recommendations = response2.apply(lambda x: x.choices[0].message.content.split("[")[-1].split("]")[0].split("\n"))

In [939]:
twoStep_recommendations.head()

user_id
1     [Chasing Amy (1997), Raiders of the Lost Ark (...
5     [Monty Python and the Holy Grail (1974), Princ...
6     [African Queen, The (1951), Casablanca (1942),...
8     [Blade Runner (1982), Empire Strikes Back, The...
10    [Casablanca (1942), Wizard of Oz, The (1939), ...
Name: user_id, dtype: object

In [940]:
twoStep_hits = pd.concat((user_movies_test, twoStep_recommendations), axis=1).dropna().apply(lambda x: len(set(x.iloc[0]).intersection(x.iloc[1])), axis = 1)

In [942]:
twoStep_hits.head()

user_id
1     0
13    0
14    1
dtype: int64

Prompt using wikipedia movie summaries

In [922]:
movie_wiki = pd.read_csv('../data/movie_wiki.csv')
movie_wiki.head()

Unnamed: 0,movie_title,wiki_summary
0,'Til There Was You (1997),'Til There Was You is a 1997 American romantic...
1,1-900 (1994),1-900 or 06 is a 1994 Dutch erotic romantic dr...
2,101 Dalmatians (1996),101 Dalmatians is a 1996 American adventure co...
3,12 Angry Men (1957),12 Angry Men is a 1957 American legal drama fi...
4,187 (1997),One Eight Seven (also known as 187) is a 1997 ...


In [923]:
def wikiPrompt(each, user_prompt_data, movie_wiki, new_line = '\n'):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Summary of the movies I have watched: {new_line.join([f"{eachMovie}: {movie_wiki.loc[movie_wiki['movie_title'] == eachMovie, 'wiki_summary'].iloc[0]}" for eachMovie in user_prompt_data.loc[1, 'user_top_movies']])}
Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?.
Please use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.])
Answer: 
"""

In [924]:
wiki_prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: wikiPrompt(x, user_prompt_data, movie_wiki))

In [925]:
wiki_prompts.head()

user_id
1     \nCandidate Set (candidate movies): ['Chasing ...
5     \nCandidate Set (candidate movies): ['Raiders ...
6     \nCandidate Set (candidate movies): ['Casablan...
8     \nCandidate Set (candidate movies): ['Blade Ru...
10    \nCandidate Set (candidate movies): ['Casablan...
Name: user_id, dtype: object

In [926]:
wiki_response = wiki_prompts.iloc[:10].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [927]:
wiki_recommendations = wiki_response.apply(lambda x: x.choices[0].message.content.split("[")[-1].split("]")[0].split("\n"))

In [928]:
wiki_recommendations.head()

user_id
1     [Chasing Amy (1997), Raiders of the Lost Ark (...
5     [Monty Python and the Holy Grail (1974), Princ...
6     [Gone with the Wind (1939), Godfather, The (19...
8     [Blade Runner (1982), Empire Strikes Back, The...
10    [Casablanca (1942), Wizard of Oz, The (1939), ...
Name: user_id, dtype: object

In [932]:
wiki_hits = pd.concat((user_movies_test, wiki_recommendations), axis=1).dropna().apply(lambda x: len(set(x.iloc[0]).intersection(x.iloc[1])), axis = 1)

Comparison of Prompts

In [945]:
comparison = pd.concat((collab_hits, genre_hits, twoStep_hits, wiki_hits), axis = 1, keys = ['collab_hits', 'genre_hits', 'twoStep_hits', 'wiki_hits'])

In [946]:
comparison.describe()

Unnamed: 0,collab_hits,genre_hits,twoStep_hits,wiki_hits
count,3.0,3.0,3.0,3.0
mean,0.0,0.333333,0.333333,0.0
std,0.0,0.57735,0.57735,0.0
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.5,0.5,0.0
max,0.0,1.0,1.0,0.0
