In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Load the ratings dataframe
ratings_df = pd.read_csv('../Data/movies_dataset/ratings.csv')

columns_to_keep = ['userId', 'movieId', 'rating']
ratings_df = ratings_df[columns_to_keep]


# Also load the movied dataframe to link the movieId's
movies_df = pd.read_csv('../Data/movies_dataset/movies_metadata.csv')
movies_df = movies_df.dropna(subset=['vote_count', 'id']) # remove na
movies_df[['id']] = movies_df[['id']].astype(int)

movies_df = movies_df.drop_duplicates(subset=['title', 'original_title'], keep='last')

# Keep only movies with english as original language
movies_df = movies_df.loc[movies_df.original_language=='en']

  movies_df = pd.read_csv('../Data/movies_dataset/movies_metadata.csv')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df[['id']] = movies_df[['id']].astype(int)


Keep the movies in the ratings_df whose ids also exist in the movies_df:

In [3]:
movies_df_ids = movies_df.id.to_list()

ratings_df =  ratings_df.loc[ratings_df.movieId.isin(movies_df_ids)]

Now, to apply a priori, we need the list of items for each user. So for each `userId`, we need to get the movies he/she has seen. A threshold is also set so that the movies that have not been rated well by the user are not included. Later, the users with only a few ratings are removed.

In [4]:
movie_rating_counts =  movies_df.vote_count.to_numpy()

q = 70 # percentile
counts_perc = np.percentile(movie_rating_counts, q=q).astype(int)
print(f"The {q}'th percentile of the movie counts rating is {counts_perc} movies rated")

The 70'th percentile of the movie counts rating is 28 movies rated


In [5]:
rating_threshold = 3.5 # Min movie rating to be considered liked

In [6]:
# Collect all the movies a user has seen in a list

movies_per_user = []
for el in tqdm(list(ratings_df.groupby('userId'))):
    
    # Get all the movies a user has seen
    if len(el[1]['movieId'].to_list()) >= counts_perc:
        good_movies_by_user = [movie for (movie, rating) in zip(el[1]['movieId'].to_list(),el[1]['rating'].to_list()) if rating >= rating_threshold]
    
        movies_per_user.append(np.array(good_movies_by_user).astype(int))

  0%|          | 0/260554 [00:00<?, ?it/s]

In [7]:
# Find unique movies id's list, to use for one-hot-encoding
all_movies_ids = np.concatenate(movies_per_user)
all_movies_ids = sorted( set(list(all_movies_ids.flat))) # ascending order, not essential

In [8]:
# We can create a dataframe with the following columns: movie_id, movie_title
movie_titles = []
movie_original_titles = []
movie_ids = []

for id_ in tqdm(all_movies_ids):
    if not movies_df.loc[movies_df['id']==id_].empty:
        movie_titles.append(movies_df.loc[movies_df['id']==id_].title.item())
        movie_original_titles.append(movies_df.loc[movies_df['id']==id_].original_title.item()) 
        movie_ids.append(id_)

  0%|          | 0/4507 [00:00<?, ?it/s]

In [9]:
movie_titles_id_df = pd.DataFrame.from_dict({'title':movie_titles, 'original_title':movie_original_titles, 'id':movie_ids})
movie_titles_id_df

Unnamed: 0,title,original_title,id
0,Four Rooms,Four Rooms,5
1,Judgment Night,Judgment Night,6
2,Star Wars,Star Wars,11
3,Finding Nemo,Finding Nemo,12
4,Forrest Gump,Forrest Gump,13
...,...,...,...
4502,Cheap Thrills,Cheap Thrills,175291
4503,Fratricide,Brudermord,175331
4504,These Birds Walk,These Birds Walk,175427
4505,Enter the Dangerous Mind,Enter the Dangerous Mind,176077


Now, all the unique ids are in the column `id`, and the index can be used as a hashcode. But first, a list of baskets needs to be created. This is the list `movies_per_user_fitlered`, however not all movies in each basket exist. As a result, the baskets list is created by taking only the movies from each basket that exist in the `movie_titles_df` data frame:

In [10]:
try:
    baskets_df = pd.read_csv('baskets.csv')
except FileNotFoundError as e:
    print(e)
    print('Generating baskets...')
    baskets = []
    unique_movies_id = movie_titles_id_df.id.to_list()
    for user in tqdm(movies_per_user):
        basket = []
        for movie in user:
            if movie in unique_movies_id:
                basket.append(movie)
        baskets.append(basket)

    baskets_df = pd.DataFrame.from_dict({'basket_no':np.arange(1, len(baskets)+1), 'baskets':baskets})
    baskets_df.to_csv('baskets.csv', index=False)

Generating baskets...


  0%|          | 0/69837 [00:00<?, ?it/s]

This took some time to be created, better save it in a csv file:

In [11]:
baskets_df

Unnamed: 0,basket_no,baskets
0,1,"[318, 912, 968, 1266, 1405, 2019, 3175, 3527, ..."
1,2,"[165, 296, 318, 500, 527, 1073, 1370, 1573, 19..."
2,3,"[16, 82, 97, 123, 150, 162, 223, 232, 235, 247..."
3,4,"[6, 107, 293, 296, 541, 599, 608, 745, 750, 12..."
4,5,"[296, 318, 380, 541, 968, 1580, 1676, 1690, 17..."
...,...,...
69832,69833,"[65, 69, 150, 293, 296, 441, 588, 592, 858, 12..."
69833,69834,"[6, 11, 16, 22, 71, 89, 95, 105, 150, 161, 165..."
69834,69835,"[70, 165, 318, 431, 527, 750, 968, 1073, 1090,..."
69835,69836,"[318, 541, 608, 858, 1090, 1387, 1997, 2028, 2..."


We can now use the a priori algorithm to find 'transcations' above a given support. Use `mlxtend` library.

In [12]:
te = TransactionEncoder()
te_ary = te.fit_transform(baskets)

In [13]:
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)
df_one_hot

Unnamed: 0,5,6,11,12,13,14,15,16,18,20,...,174645,174671,174675,175245,175287,175291,175331,175427,176077,176143
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69832,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
69833,False,True,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
69834,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
69835,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [126]:
min_support = 0.09 # Not lower unless you have 64G ram and linux, or filter out more movies, using higher percentile for counts_perc


frq_items = apriori(df_one_hot, min_support = min_support, use_colnames = True)
frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))
frq_items

MemoryError: Unable to allocate 611. MiB for an array with shape (69837, 9180) and data type bool

In [15]:
a = association_rules(frq_items, metric="confidence", min_threshold=0.6)
a

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(6),(296),0.206724,0.598766,0.164326,0.794902,1.327568,0.040546,1.956305
1,(6),(318),0.206724,0.602503,0.154560,0.747662,1.240927,0.030008,1.575258
2,(11),(318),0.132165,0.602503,0.093231,0.705417,1.170811,0.013602,1.349355
3,(16),(296),0.167862,0.598766,0.142117,0.846626,1.413953,0.041606,2.616057
4,(16),(318),0.167862,0.602503,0.131735,0.784782,1.302536,0.030598,1.846951
...,...,...,...,...,...,...,...,...,...
8825,"(296, 2762, 4226, 2959)","(4993, 318)",0.153457,0.280324,0.104658,0.682001,2.432899,0.061640,2.263135
8826,"(296, 2762, 4226, 318)","(4993, 2959)",0.150622,0.274482,0.104658,0.694838,2.531452,0.063315,2.377484
8827,"(2762, 4226, 318, 2959)","(296, 4993)",0.148073,0.277346,0.104658,0.706798,2.548436,0.063590,2.464699
8828,"(296, 2762, 318, 2959)","(4993, 4226)",0.173647,0.235405,0.104658,0.602705,2.560285,0.063781,1.924500


In [121]:
movies = movies_df.title.to_list()

# user_movies = np.random.choice(movies,3)
user_movies = ['Saw']
user_movies_ids = [ movies_df.loc[movies_df.title==i].id.item() for i in user_movies ]

print(f"User movies: {user_movies}")
print(f"User movies id's: {user_movies_ids}")

User movies: ['Saw']
User movies id's: [246355]


In [83]:
consequents = a["consequents"].apply(lambda x:  list(x) ).astype("unicode")

In [84]:
antecedents = a["antecedents"].apply(lambda x:  list(x) ).astype("unicode")

In [85]:
def frozen_set_to_list(frznset):
    l = frznset[1:-1].split(',')
    l = [int(i.strip()) for i in l]
    return l

In [86]:
consequents = consequents.apply(lambda x: frozen_set_to_list(x))

In [87]:
antecedents = antecedents.apply(lambda x: frozen_set_to_list(x))

In [91]:
antecedents, consequents

(0                           [6]
 1                           [6]
 2                          [11]
 3                          [16]
 4                          [16]
                  ...           
 8825    [296, 2762, 4226, 2959]
 8826     [296, 2762, 4226, 318]
 8827    [2762, 4226, 318, 2959]
 8828     [296, 2762, 318, 2959]
 8829         [2762, 4993, 4226]
 Name: antecedents, Length: 8830, dtype: object,
 0                  [296]
 1                  [318]
 2                  [318]
 3                  [296]
 4                  [318]
               ...       
 8825         [4993, 318]
 8826        [4993, 2959]
 8827         [296, 4993]
 8828        [4993, 4226]
 8829    [296, 318, 2959]
 Name: consequents, Length: 8830, dtype: object)

In [95]:
movies_df.loc[movies_df.id==6].title.item()

'Judgment Night'

In [97]:
def get_title_from_index(movies_df_in, idx):
    return movies_df_in.loc[movies_df_in.id==idx].title.item()

In [119]:
def get_containing_lists(list_of_antecedents, user_movie_list):
    matches = []
    for antecedent in list_of_antecedents:
        print(antecedent)
        for item in user_movie_list:
            if item in antecedent:
                matches.append(antecedent)
    return matches