In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [2]:
# Load the ratings dataframe
ratings_df = pd.read_csv('../Data/movies_dataset/ratings.csv')

columns_to_keep = ['userId', 'movieId', 'rating']
ratings_df = ratings_df[columns_to_keep]


# Also load the movied dataframe to link the movieId's
movies_df = pd.read_csv('../Data/movies_dataset/movies_metadata.csv')
movies_df = movies_df.dropna(subset=['vote_count', 'id']) # remove na
movies_df[['id']] = movies_df[['id']].astype(int)

movies_df = movies_df.drop_duplicates(subset=['title', 'original_title'], keep='last')

# Keep only movies with english as original language
movies_df = movies_df.loc[movies_df.original_language=='en']

  movies_df = pd.read_csv('../Data/movies_dataset/movies_metadata.csv')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df[['id']] = movies_df[['id']].astype(int)


Keep the movies in the ratings_df whose ids also exist in the movies_df:

In [3]:
movies_df_ids = movies_df.id.to_list()

ratings_df =  ratings_df.loc[ratings_df.movieId.isin(movies_df_ids)]

Now, to apply a priori, we need the list of items for each user. So for each `userId`, we need to get the movies he/she has seen. A threshold is also set so that the movies that have not been rated well by the user are not included. Later, the users with only a few ratings are removed.

In [4]:
movie_rating_counts =  movies_df.vote_count.to_numpy()

q = 70 # percentile
counts_perc = np.percentile(movie_rating_counts, q=q).astype(int)
print(f"The {q}'th percentile of the movie counts rating is {counts_perc} movies rated")

The 70'th percentile of the movie counts rating is 28 movies rated


In [5]:
rating_threshold = 3.5 # Min movie rating to be considered liked

In [6]:
# Collect all the movies a user has seen in a list

movies_per_user = []
for el in tqdm(list(ratings_df.groupby('userId'))):
    
    # Get all the movies a user has seen
    if len(el[1]['movieId'].to_list()) >= counts_perc:
        good_movies_by_user = [movie for (movie, rating) in zip(el[1]['movieId'].to_list(),el[1]['rating'].to_list()) if rating >= rating_threshold]
    
        movies_per_user.append(np.array(good_movies_by_user).astype(int))

  0%|          | 0/260554 [00:00<?, ?it/s]

In [7]:
# Find unique movies id's list, to use for one-hot-encoding
all_movies_ids = np.concatenate(movies_per_user)
all_movies_ids = sorted( set(list(all_movies_ids.flat))) # ascending order, not essential

In [8]:
# We can create a dataframe with the following columns: movie_id, movie_title
movie_titles = []
movie_original_titles = []
movie_ids = []

for id_ in tqdm(all_movies_ids):
    if not movies_df.loc[movies_df['id']==id_].empty:
        movie_titles.append(movies_df.loc[movies_df['id']==id_].title.item())
        movie_original_titles.append(movies_df.loc[movies_df['id']==id_].original_title.item()) 
        movie_ids.append(id_)

  0%|          | 0/4507 [00:00<?, ?it/s]

In [9]:
movie_titles_id_df = pd.DataFrame.from_dict({'title':movie_titles, 'original_title':movie_original_titles, 'id':movie_ids})
movie_titles_id_df

Unnamed: 0,title,original_title,id
0,Four Rooms,Four Rooms,5
1,Judgment Night,Judgment Night,6
2,Star Wars,Star Wars,11
3,Finding Nemo,Finding Nemo,12
4,Forrest Gump,Forrest Gump,13
...,...,...,...
4502,Cheap Thrills,Cheap Thrills,175291
4503,Fratricide,Brudermord,175331
4504,These Birds Walk,These Birds Walk,175427
4505,Enter the Dangerous Mind,Enter the Dangerous Mind,176077


Now, all the unique ids are in the column `id`, and the index can be used as a hashcode. But first, a list of baskets needs to be created. This is the list `movies_per_user_fitlered`, however not all movies in each basket exist. As a result, the baskets list is created by taking only the movies from each basket that exist in the `movie_titles_df` data frame:

In [10]:
try:
    baskets_df = pd.read_csv('baskets.csv')
except FileNotFoundError as e:
    print(e)
    print('Generating baskets...')
    baskets = []
    unique_movies_id = movie_titles_id_df.id.to_list()
    for user in tqdm(movies_per_user):
        basket = []
        for movie in user:
            if movie in unique_movies_id:
                basket.append(movie)
        baskets.append(basket)

    baskets_df = pd.DataFrame.from_dict({'basket_no':np.arange(1, len(baskets)+1), 'baskets':baskets})
    baskets_df.to_csv('baskets.csv', index=False)

This took some time to be created, better save it in a csv file:

In [12]:
def frozen_set_to_list(frznset):
    l = frznset[1:-1].split(',')
    l = [int(i.strip()) for i in l]
    return l

We can now use the a priori algorithm to find 'transcations' above a given support. Use `mlxtend` library.

In [14]:
basketes_list_str = baskets_df.baskets.to_list()
baskets = [ frozen_set_to_list(basketes_list_str[i]) for i in range(len(baskets_df)) if len(basketes_list_str[i])>2 ]

In [15]:
te = TransactionEncoder()
te_ary = te.fit_transform(baskets)

In [16]:
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)
df_one_hot

Unnamed: 0,5,6,11,12,13,14,15,16,18,20,...,174645,174671,174675,175245,175287,175291,175331,175427,176077,176143
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69814,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
69815,False,True,True,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
69816,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
69817,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [50]:
min_support = 0.08
nm = str(min_support).replace('.', '_')

try:
    frq_items = pd.read_csv(f"freq_items_{nm}.csv")
except FileNotFoundError as e:
    print(e)
    frq_items = apriori(df_one_hot, min_support = min_support, use_colnames = True)
    frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))    
    frq_items.to_csv(f'freq_items_{nm}.csv', index=0)

In [51]:
frq_items

Unnamed: 0,support,itemsets,length
0,0.206778,(6),1
1,0.132199,(11),1
2,0.167906,(16),1
3,0.163867,(21),1
4,0.153397,(25),1
...,...,...,...
6058,0.084877,"(4993, 4226, 296, 2762, 2959, 858)",6
6059,0.086681,"(4993, 4226, 296, 2762, 2028, 2959)",6
6060,0.080308,"(4993, 4226, 2762, 527, 2959, 318)",6
6061,0.081926,"(4993, 4226, 2762, 2959, 858, 318)",6


In [52]:
try:
    assoc_rules = pd.read_csv('association_rules.csv')
except:
    assoc_rules = association_rules(frq_items, metric="confidence", min_threshold=0.6)
    assoc_rules.to_csv('association_rules.csv', index=False)

In [54]:
consequents = assoc_rules["consequents"].apply(lambda x:  list(x) ).astype("unicode")

In [55]:
antecedents = assoc_rules["antecedents"].apply(lambda x:  list(x) ).astype("unicode")

In [70]:
consequents = consequents.apply(frozen_set_to_list)

In [73]:
antecedents = antecedents.apply(frozen_set_to_list)

In [93]:
assoc_rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(6),(296),0.206778,0.598920,0.164368,0.794902,1.327226,0.040525,1.955553
1,(6),(318),0.206778,0.602658,0.154600,0.747662,1.240607,0.029984,1.574642
2,(11),(150),0.132199,0.360561,0.083616,0.632503,1.754219,0.035950,1.739983
3,(11),(318),0.132199,0.602658,0.093255,0.705417,1.170509,0.013585,1.348828
4,(16),(296),0.167906,0.598920,0.142153,0.846626,1.413588,0.041591,2.615050
...,...,...,...,...,...,...,...,...,...
17157,"(2762, 4226, 2028, 2959)","(4993, 318)",0.117074,0.280396,0.086037,0.734891,2.620900,0.053210,2.714370
17158,"(2762, 4226, 2028, 318)","(4993, 2959)",0.122015,0.274553,0.086037,0.705130,2.568285,0.052537,2.460225
17159,"(4226, 2028, 318, 2959)","(4993, 2762)",0.135106,0.246323,0.086037,0.636807,2.585256,0.052757,2.075143
17160,"(2762, 2028, 318, 2959)","(4993, 4226)",0.140420,0.235466,0.086037,0.612709,2.602113,0.052973,1.974056


In [109]:
ax = assoc_rules['antecedents'][1500]
list(ax)[0]

296

In [110]:
assoc_rules['antecedents'].apply(lambda x: list(x))

0                             [6]
1                             [6]
2                            [11]
3                            [11]
4                            [16]
                   ...           
17157    [2762, 4226, 2028, 2959]
17158     [2762, 4226, 2028, 318]
17159     [4226, 2028, 318, 2959]
17160     [2762, 2028, 318, 2959]
17161          [2762, 4226, 2028]
Name: antecedents, Length: 17162, dtype: object

In [142]:
movie_titles_id_df.head(50)

Unnamed: 0,title,original_title,id
0,Four Rooms,Four Rooms,5
1,Judgment Night,Judgment Night,6
2,Star Wars,Star Wars,11
3,Finding Nemo,Finding Nemo,12
4,Forrest Gump,Forrest Gump,13
5,American Beauty,American Beauty,14
6,Citizen Kane,Citizen Kane,15
7,Dancer in the Dark,Dancer in the Dark,16
8,The Fifth Element,The Fifth Element,18
9,My Life Without Me,My Life Without Me,20


Select a movie:

In [282]:
movies = movies_df.title.to_list()

# user_movies = np.random.choice(movies,3)
user_movies = ['Miami Vice']
user_movies_ids = [ movies_df.loc[movies_df.title==i].id.item() for i in user_movies ]

print(f"User movies: {user_movies}")
print(f"User movies id's: {user_movies_ids}")

User movies: ['Miami Vice']
User movies id's: [82]


In [283]:
def get_title_from_index(movies_df_in, idx):
    return movies_df_in.loc[movies_df_in.id==idx].title.item()

In [284]:
assoc_rules = assoc_rules.sort_values(by=['lift'], ascending=False)

In [285]:
user_movies, user_movies_ids

(['Miami Vice'], [82])

In [286]:
a = assoc_rules.loc[assoc_rules.antecedents.apply(lambda x: (user_movies_ids[0] in list(x)) and (len(list(x)))>=len(user_movies_ids)    )]

In [287]:
a.consequents.apply(lambda x:  [get_title_from_index(movies_df,list(x)[i]) for i in range(len(list(x)))] )

Series([], Name: consequents, dtype: object)

In [296]:
b = assoc_rules.consequents.apply(lambda x: list(x)).to_list()
print(f"Number of different movies that are recommended: {len(set([item for sublist in b for item in sublist ]))}")

Number of different movies that are recommended: 30
