This Notebook compares recommendations made by ChatGPT based on several prompts.

In [1]:
import pandas as pd

In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

### Building the Training Data

In [3]:
train = pd.read_csv('../data/train.csv')

In [4]:
train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,movie_title,genres,avg_rating
0,259,255,4,874724710,My Best Friend's Wedding (1997),Romance,4.0
1,259,286,4,874724727,"English Patient, The (1996)","Romance, War",4.0
2,259,298,4,874724754,Face/Off (1997),"Action, Sci-Fi, Thriller",4.0
3,259,185,4,874724781,Psycho (1960),"Horror, Romance, Thriller",4.0
4,259,173,4,874724843,"Princess Bride, The (1987)","Action, Adventure, Romance",4.0


The movies each user has watched in the training data

In [5]:
user_movies = train.groupby('user_id').apply(lambda x: x['movie_title'].tolist())
user_movies.head()

user_id
1     [Empire Strikes Back, The (1980), Monty Python...
5     [unknown, Star Trek: First Contact (1996), Jac...
6     [Kolya (1996), English Patient, The (1996), L....
8     [Contact (1997), Liar Liar (1997), In & Out (1...
10    [Full Monty, The (1997), L.A. Confidential (19...
dtype: object

In [6]:
""" Number of users to run the recommender on
    (Use a small sample for faster results)
    If you want to run the recommender on all users, uncomment the last line
"""
# num_users = 5

# Uncomment below to run on all users
num_users = user_movies.shape[0]

The most recent movies each user has watched.

In [7]:
def get_user_recent_movies(dataframe, n = 10):
    return dataframe.groupby('user_id').apply(lambda x: x['movie_title'].tail(n).to_list())

user_recent_movies = get_user_recent_movies(train)
user_recent_movies.head()

user_id
1     [Dolores Claiborne (1994), French Twist (Gazon...
5     [Radioland Murders (1994), Houseguest (1994), ...
6     [Monty Python and the Holy Grail (1974), Bob R...
8     [Star Trek: First Contact (1996), Jurassic Par...
10    [To Wong Foo, Thanks for Everything! Julie New...
dtype: object

The movies each user rated highly.

In [8]:
def get_user_top_movies(dataframe, n = 10):
    return dataframe.groupby('user_id').apply(lambda x: x.sort_values('rating', ascending=False).head(n)\
                                              ['movie_title'].to_list())

user_top_movies = get_user_top_movies(train)
user_top_movies.head()

user_id
1     [Empire Strikes Back, The (1980), Usual Suspec...
5     [Wrong Trousers, The (1993), This Is Spinal Ta...
6     [Down by Law (1986), Graduate, The (1967), Lon...
8     [Contact (1997), Die Hard (1988), Braveheart (...
10    [Amadeus (1984), Taxi Driver (1976), All About...
dtype: object

The genres of each movie.

In [9]:
movie_genres = train.groupby('movie_title').apply(lambda x: x['genres'].unique().tolist()).\
                                            apply(lambda x: x[0].split(", "))
movie_genres.head()

movie_title
'Til There Was You (1997)     [Romance]
1-900 (1994)                  [Romance]
101 Dalmatians (1996)        [Children]
12 Angry Men (1957)           [unknown]
187 (1997)                    [unknown]
dtype: object

The genres for each users top rated movies

In [10]:
user_top_movie_genres = user_top_movies.apply(lambda x: pd.Series(movie_genres.loc[x].sum()).unique().tolist())
user_top_movie_genres.head()

user_id
1     [Action, Adventure, Romance, Sci-Fi, War, Thri...
5     [Animation, Musical, unknown, Action, Adventur...
6     [unknown, Romance, Mystery, Adventure, Sci-Fi,...
8     [Sci-Fi, Action, Thriller, War, Adventure, unk...
10    [Mystery, Thriller, unknown, War, Adventure, W...
dtype: object

The ratings provided by each user to every movie they watched in the training data.

In [11]:
user_ratings = train.drop_duplicates(['user_id', 'movie_title']).pivot(index='user_id', columns='movie_title',\
                                                                        values='avg_rating').fillna(0)
user_ratings.head()

movie_title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,"Wrong Trousers, The (1993)",Wyatt Earp (1994),Yankee Zulu (1994),Year of the Horse (1997),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,5.0,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,5.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0
6,0.0,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,...,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Identifying similar users based on cosine similarity

In [12]:
from scipy.sparse import csr_matrix
sparse_user_ratings = csr_matrix(user_ratings)

In [13]:
def get_similar_users(sparse_user_ratings, index, n = 10):
    df = pd.DataFrame(cosine_similarity(sparse_user_ratings) - np.identity(sparse_user_ratings.shape[0]), \
                        index = index, columns = index)
    return df.apply(lambda x: list(x.sort_values(ascending = False).head(n).index), axis=1)
    # return pd.DataFrame(cosine_similarity(sparse_user_ratings) - np.identity(sparse_user_ratings.shape[0]), \
    #                     index = index, columns = index).apply(lambda x: list(x.sort_values(ascending = False)\
    #                                                                          .head(n).index), axis=1)

similar_users = get_similar_users(sparse_user_ratings, user_ratings.index)

In [14]:
similar_users.head()

user_id
1     [916, 457, 268, 435, 429, 823, 301, 276, 889, ...
5      [648, 407, 307, 497, 268, 276, 22, 622, 804, 70]
6       [18, 194, 716, 10, 666, 151, 321, 85, 389, 854]
8      [746, 158, 37, 352, 638, 425, 22, 627, 671, 538]
10      [6, 406, 666, 389, 321, 398, 524, 716, 194, 18]
dtype: object

Identifying Candidate movies which are the movies each user's similar users rated most highly overall.

In [15]:
def get_candidate_movies(similar_users, dataframe, n=20):
    """
    This function gets the top-n candidate movies for each user based on the movies 
    rated by similar users.
    
    Parameters:
    similar_users (Series): A pandas Series where each entry is a list of user_ids that 
                            are similar to a specific user.
    dataframe (DataFrame): The pandas DataFrame containing user ratings with columns 'user_id', 
                           'movie_title', and 'rating'.
    n (int): The number of top-rated movies to return for each user.
    
    Returns:
    Series: A pandas Series where each entry is a list of n movie titles recommended for each user.
    """
    # Define a helper function to get top-n movies for a single user
    def get_top_movies(user_ids):
        # Filter the dataframe for movies rated by similar users
        filtered_movies = dataframe[dataframe['user_id'].isin(user_ids)]
        # Group by movie title, sum the ratings, and get the top-n movies
        top_movies = (
            filtered_movies.groupby('movie_title')['rating']
            .sum()
            .sort_values(ascending=False)
            .head(n)
            .index
            .tolist()
        )
        return top_movies

    # Apply the helper function to each entry in similar_users
    return similar_users.apply(get_top_movies)

candidate_movies = get_candidate_movies(similar_users, train)
candidate_movies.head()

user_id
1     [Chasing Amy (1997), Star Wars (1977), Empire ...
5     [Raiders of the Lost Ark (1981), Monty Python ...
6     [Casablanca (1942), Wizard of Oz, The (1939), ...
8     [Blade Runner (1982), Raiders of the Lost Ark ...
10    [Casablanca (1942), Wizard of Oz, The (1939), ...
dtype: object

In [16]:
user_prompt_data = pd.concat((user_movies, user_top_movies, user_recent_movies, \
                              similar_users, candidate_movies, user_top_movie_genres), \
                            axis = 1, \
                            keys = ['user_movies', 'user_top_movies', 'user_recent_movies', \
                                    'similar_users', 'candidate_movies', 'user_top_movie_genres'])
user_prompt_data.head()

Unnamed: 0_level_0,user_movies,user_top_movies,user_recent_movies,similar_users,candidate_movies,user_top_movie_genres
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"[Empire Strikes Back, The (1980), Monty Python...","[Empire Strikes Back, The (1980), Usual Suspec...","[Dolores Claiborne (1994), French Twist (Gazon...","[916, 457, 268, 435, 429, 823, 301, 276, 889, ...","[Chasing Amy (1997), Star Wars (1977), Empire ...","[Action, Adventure, Romance, Sci-Fi, War, Thri..."
5,"[unknown, Star Trek: First Contact (1996), Jac...","[Wrong Trousers, The (1993), This Is Spinal Ta...","[Radioland Murders (1994), Houseguest (1994), ...","[648, 407, 307, 497, 268, 276, 22, 622, 804, 70]","[Raiders of the Lost Ark (1981), Monty Python ...","[Animation, Musical, unknown, Action, Adventur..."
6,"[Kolya (1996), English Patient, The (1996), L....","[Down by Law (1986), Graduate, The (1967), Lon...","[Monty Python and the Holy Grail (1974), Bob R...","[18, 194, 716, 10, 666, 151, 321, 85, 389, 854]","[Casablanca (1942), Wizard of Oz, The (1939), ...","[unknown, Romance, Mystery, Adventure, Sci-Fi,..."
8,"[Contact (1997), Liar Liar (1997), In & Out (1...","[Contact (1997), Die Hard (1988), Braveheart (...","[Star Trek: First Contact (1996), Jurassic Par...","[746, 158, 37, 352, 638, 425, 22, 627, 671, 538]","[Blade Runner (1982), Raiders of the Lost Ark ...","[Sci-Fi, Action, Thriller, War, Adventure, unk..."
10,"[Full Monty, The (1997), L.A. Confidential (19...","[Amadeus (1984), Taxi Driver (1976), All About...","[To Wong Foo, Thanks for Everything! Julie New...","[6, 406, 666, 389, 321, 398, 524, 716, 194, 18]","[Casablanca (1942), Wizard of Oz, The (1939), ...","[Mystery, Thriller, unknown, War, Adventure, W..."


### Prompt Engineering

In [17]:
from openai import OpenAI
from key import OPENAI_API_KEY
client = OpenAI(api_key=OPENAI_API_KEY)

Collaborative Filtering Prompt

In [18]:
new_line = '\n'

def collabPrompt(each, user_prompt_data, new_line = '\n'):
    return f"""I am user {each}.
The most recent ten movies I have seen are:
{", ".join(user_prompt_data.loc[each, 'user_recent_movies'])}.
My top rated movies are:
{", ".join(user_prompt_data.loc[each, 'user_top_movies'])}.
The users who are most like me are {", ".join([str(each) for each in user_prompt_data.loc[each, 'similar_users']])}.
The top movies for each of these users are:
{new_line.join([f"{each}: {', '.join(user_prompt_data.loc[each, 'user_top_movies'])}" for each in user_prompt_data.loc[each, 'similar_users']])}.
Please recommend ten movies for me to watch that I have not seen. 
Please use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.]) 
Answer: 
"""   

In [19]:
collab_prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: collabPrompt(x, user_prompt_data))
collab_prompts.head()

user_id
1     I am user 1.\nThe most recent ten movies I hav...
5     I am user 5.\nThe most recent ten movies I hav...
6     I am user 6.\nThe most recent ten movies I hav...
8     I am user 8.\nThe most recent ten movies I hav...
10    I am user 10.\nThe most recent ten movies I ha...
Name: user_id, dtype: object

In [20]:
collab_response = collab_prompts.iloc[:num_users].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [21]:
collab_recommendations = collab_response.apply(lambda x: x.choices[0].message.content\
                                               .split("[")[1].split("]")[0].split("\n"))

In [22]:
collab_recommendations.head()

user_id
1        [Eternal Sunshine of the Spotless Mind (2004)]
5     [Shawshank Redemption, The (1994), Forrest Gum...
6     [Midnight Cowboy (1969), Lost in Translation (...
8     [The Shawshank Redemption (1994), The Godfathe...
10                          [A Clockwork Orange (1971)]
Name: user_id, dtype: object

In [23]:
test = pd.read_csv('../data/test.csv')

In [24]:
user_movies_test = test.groupby('user_id').apply(lambda x: x['movie_title'].tolist())

In [25]:
collab_data = pd.concat(
    [user_movies, user_movies_test, collab_recommendations], 
    axis=1, 
    keys=['user_movies', 'user_movies_test', 'collab_recommendations']
)

In [26]:
# Function to calculate hits
def calculate_hits(row):
    user_movies_set = set(row['user_movies'])
    user_movies_test_set = set(row['user_movies_test']) if row['user_movies_test'] is not np.nan else set()
    collab_recommendations_set = set(row.iloc[2])
    
    # Calculate the union of user_movies and user_movies_test, then find its intersection with collab_recommendations
    return len(user_movies_set.union(user_movies_test_set).intersection(collab_recommendations_set))

In [27]:
collab_hits = collab_data.dropna(subset='collab_recommendations').apply(calculate_hits, axis=1)

Prompt Similar to Zero-Shot Paper (reduced to one prompt, added genres)

In [28]:
def genrePrompts(each, user_prompt_data, new_line = '\n'):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?.
Please use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.])
Answer: 
"""

In [29]:
genre_prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: genrePrompts(x, user_prompt_data))

In [30]:
genre_prompts.head()

user_id
1     \nCandidate Set (candidate movies): ['Chasing ...
5     \nCandidate Set (candidate movies): ['Raiders ...
6     \nCandidate Set (candidate movies): ['Casablan...
8     \nCandidate Set (candidate movies): ['Blade Ru...
10    \nCandidate Set (candidate movies): ['Casablan...
Name: user_id, dtype: object

In [31]:
genre_response = genre_prompts.iloc[:num_users].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [None]:
genre_recommendations = genre_response.apply(lambda x: x.choices[0].message.content.split("[")[-1].split("]")[0].split("\n"))

In [None]:
genre_recommendations.head()

user_id
1     [Chasing Amy (1997), Raiders of the Lost Ark (...
5     [Monty Python and the Holy Grail (1974), Princ...
6     [Raiders of the Lost Ark (1981), Star Wars (19...
8     [Blade Runner (1982), Empire Strikes Back, The...
10    [Wizard of Oz, The (1939), African Queen, The ...
Name: user_id, dtype: object

In [None]:
genre_data = pd.concat(
    [user_movies, user_movies_test, genre_recommendations], 
    axis=1, 
    keys=['user_movies', 'user_movies_test', 'genre_recommendations']
)

In [None]:
genre_hits = genre_data.dropna(subset='genre_recommendations').apply(calculate_hits, axis=1)

Prompt Similar to Zero-Shot Paper (uses two prompts, adds genres)

In [None]:
def twoStepPrompt1(each, user_prompt_data):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: 
"""

def twoStepPrompt2(each, user_prompt_data, response1):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Their genres are: {user_prompt_data.loc[each, 'user_top_movie_genres']}.
Step 1: What features are most important to me when selecting movies (Summarize my preferences briefly)? 
Answer: {response1.loc[each]}.
Step 2: Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?
Please use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.])
Answer: 
"""

In [None]:
prompt1 = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: twoStepPrompt1(x, user_prompt_data))

In [None]:
response1 = prompt1.iloc[:num_users].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [None]:
response1.head()

user_id
1     ChatCompletion(id='chatcmpl-8UggmzDWTfxLrmBGqf...
5     ChatCompletion(id='chatcmpl-8UggoE4vq2jGPbny9p...
6     ChatCompletion(id='chatcmpl-8UggrQP6RIPVzbTATv...
8     ChatCompletion(id='chatcmpl-8Uggtm4Qjx5mgwT3PZ...
10    ChatCompletion(id='chatcmpl-8Uggxkqz62MENUliHR...
Name: user_id, dtype: object

In [None]:
prompt2 = pd.Series(user_prompt_data.index, index = user_prompt_data.index).iloc[:num_users].apply(lambda x: twoStepPrompt2(x, user_prompt_data, response1))

In [None]:
response2 = prompt2.apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [None]:
twoStep_recommendations = response2.apply(lambda x: x.choices[0].message.content.split("[")[-1].split("]")[0].split("\n"))

In [None]:
twoStep_recommendations.head()

user_id
1     [Chasing Amy (1997), Raiders of the Lost Ark (...
5     [Monty Python and the Holy Grail (1974), Princ...
6     [Raiders of the Lost Ark (1981), Shawshank Red...
8     [Blade Runner (1982), Empire Strikes Back, The...
10    [Casablanca (1942), Wizard of Oz, The (1939), ...
Name: user_id, dtype: object

In [None]:
twoStep_data = pd.concat(
    [user_movies, user_movies_test, twoStep_recommendations], 
    axis=1, 
    keys=['user_movies', 'user_movies_test', 'twoStep_recommendations']
)

In [None]:
twoStep_hits = twoStep_data.dropna(subset='twoStep_recommendations').apply(calculate_hits, axis=1)

Prompt using wikipedia movie summaries

In [None]:
movie_wiki = pd.read_csv('../data/movie_wiki.csv')
movie_wiki.head()

Unnamed: 0,movie_title,wiki_summary
0,'Til There Was You (1997),'Til There Was You is a 1997 American romantic...
1,1-900 (1994),1-900 or 06 is a 1994 Dutch erotic romantic dr...
2,101 Dalmatians (1996),101 Dalmatians is a 1996 American adventure co...
3,12 Angry Men (1957),12 Angry Men is a 1957 American legal drama fi...
4,187 (1997),One Eight Seven (also known as 187) is a 1997 ...


In [None]:
def wikiPrompt(each, user_prompt_data, movie_wiki, new_line = '\n'):
    return f"""
Candidate Set (candidate movies): {user_prompt_data.loc[each, 'candidate_movies']}.
The movies I have rated highly (watched movies): {user_prompt_data.loc[each, 'user_top_movies']}.
Summary of the movies I have watched: {new_line.join([f"{eachMovie}: {movie_wiki.loc[movie_wiki['movie_title'] == eachMovie, 'wiki_summary'].iloc[0]}" for eachMovie in user_prompt_data.loc[1, 'user_top_movies']])}
Can you recommend 10 movies from the Candidate Set similar to but not in the selected movies I've watched?.
Please use brackets around the movies you recommend and separate the titles by new lines so I can easily parse them.
(Format Example: Here are the 10 movies recommended for you: [Midnight Cowboy (1969){new_line}Lost in Translation (2003){new_line}etc.])
Answer: 
"""

In [None]:
wiki_prompts = pd.Series(user_prompt_data.index, index = user_prompt_data.index).apply(lambda x: wikiPrompt(x, user_prompt_data, movie_wiki))

In [None]:
wiki_prompts.head()

user_id
1     \nCandidate Set (candidate movies): ['Chasing ...
5     \nCandidate Set (candidate movies): ['Raiders ...
6     \nCandidate Set (candidate movies): ['Casablan...
8     \nCandidate Set (candidate movies): ['Blade Ru...
10    \nCandidate Set (candidate movies): ['Casablan...
Name: user_id, dtype: object

In [None]:
wiki_response = wiki_prompts.iloc[:num_users].apply(lambda x: client.chat.completions.create(
    model="gpt-3.5-turbo",
        messages= [{ 'role':'user','content' : x}],
        # temperature=0,
        # max_tokens=512,
        # top_p=1,
        # frequency_penalty=0,
        # presence_penalty=0,
        ))

In [None]:
wiki_recommendations = wiki_response.apply(lambda x: x.choices[0].message.content.split("[")[-1].split("]")[0].split("\n"))

In [None]:
wiki_recommendations.head()

user_id
1     [Chasing Amy (1997), Raiders of the Lost Ark (...
5     [Monty Python and the Holy Grail (1974), Princ...
6     [Shawshank Redemption, The (1994), African Que...
8     [Empire Strikes Back, The (1980), Terminator 2...
10    [Citizen Kane (1941), Casablanca (1942), Wizar...
Name: user_id, dtype: object

In [None]:
wiki_data = pd.concat(
    [user_movies, user_movies_test, wiki_recommendations], 
    axis=1, 
    keys=['user_movies', 'user_movies_test', 'wiki_recommendations']
)

In [None]:
wiki_hits = wiki_data.dropna(subset='wiki_recommendations').apply(calculate_hits, axis=1)

Baseline Recommender: Recommend top 10 most popular movies to every user

In [None]:
viewings = train.groupby('movie_title').count().sort_values('user_id', ascending=False)

In [None]:
viewings.head()

Unnamed: 0_level_0,user_id,movie_id,rating,timestamp,genres,avg_rating
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Star Wars (1977),393,393,393,393,393,393
Fargo (1996),355,355,355,355,355,355
Return of the Jedi (1983),350,350,350,350,350,350
Liar Liar (1997),317,317,317,317,317,317
Toy Story (1995),315,315,315,315,315,315


In [None]:
top_10_movies = viewings.head(10).index.tolist()
top_10_movies

['Star Wars (1977)',
 'Fargo (1996)',
 'Return of the Jedi (1983)',
 'Liar Liar (1997)',
 'Toy Story (1995)',
 'English Patient, The (1996)',
 'Independence Day (ID4) (1996)',
 'Contact (1997)',
 'Scream (1996)',
 'Raiders of the Lost Ark (1981)']

In [None]:
baseline_data = pd.concat((user_movies, user_movies_test), axis=1).dropna().iloc[:num_users]
baseline_hits = baseline_data.apply(lambda x: len(set(x.iloc[0]).union(x.iloc[1]).intersection(top_10_movies)), axis = 1)

Comparison of Prompts

In [None]:
comparison = pd.concat((collab_hits, genre_hits, twoStep_hits, wiki_hits, baseline_hits), axis = 1, keys = ['collab_hits', 'genre_hits', 'twoStep_hits', 'wiki_hits', 'baseline_hits'])

Below values are Hit Rate in % (i.e. 100 is 100%)

In [None]:
comparison.describe().iloc[1:] / 10 * 100

Unnamed: 0,collab_hits,genre_hits,twoStep_hits,wiki_hits,baseline_hits
mean,5.2657,69.259259,66.553945,68.196457,67.744361
std,8.879222,25.446315,28.517834,26.802143,23.277527
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,50.0,50.0,50.0,50.0
50%,0.0,70.0,70.0,70.0,70.0
75%,10.0,90.0,90.0,90.0,90.0
max,80.0,100.0,100.0,100.0,100.0
