In [143]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

In [105]:
# Load the ratings dataframe
ratings_df = pd.read_csv('../Data/movies_dataset/ratings.csv')

# Also load the movied dataframe to link the movieId's
movies_df = pd.read_csv('../Data/movies_dataset/movies_metadata.csv')

print(len(movies_df))

movies_df = movies_df.drop_duplicates(subset=['title', 'original_title'], keep='last')

print(len(movies_df))

45466
43441


  movies_df = pd.read_csv('../Data/movies_dataset/movies_metadata.csv')


In [3]:
columns_to_keep = ['id', 'imdb_id', 'original_language', 'original_title']

en_movies_df = movies_df[columns_to_keep]

# Remove na entries
en_movies_df=en_movies_df[columns_to_keep].dropna(axis=0)
en_movies_df.reset_index(drop=True, inplace=True)

In [4]:
engl_idx = []
for i in range(len(en_movies_df)):
    langs_str = en_movies_df.original_language[i]
    if langs_str=='en':
        engl_idx.append(i)

In [5]:
en_movies_df = en_movies_df.iloc[np.array(engl_idx)]
en_movies_df.reset_index(drop=True, inplace=True)

The `id` column is an str. Change it to integer.

In [7]:
en_movies_df.dtypes

id                   object
imdb_id              object
original_language    object
original_title       object
dtype: object

In [8]:
en_movies_df[['id']] = en_movies_df[['id']].astype(int)

Now, to apply a priori, we need the list of items for each user. So for each `userId`, we need to get the movies he/she has seen. A threshold is also set so that the movies that have not been rated well by the user are not included. Later, the users with only a few ratings are removed.

In [116]:
rating_threshold = 3 # Min movie rating to be considered liked

In [117]:
# Collect all the movies a user has seen in a list
# movies_per_user = [el[1]['movieId'].to_list() for el in list(ratings_df.groupby(['userId']))]

movies_per_user = []
for el in tqdm(list(ratings_df.groupby('userId'))):
    
    # Get all the movies a user has seen
    good_movies_by_user = [movie for (movie, rating) in zip(el[1]['movieId'].to_list(),el[1]['rating'].to_list()) if rating >= rating_threshold]
    
    movies_per_user.append(np.array(good_movies_by_user).astype(int))

  0%|          | 0/270896 [00:00<?, ?it/s]

In [119]:
# Find unique movies id's list, to use for one-hot-encoding
all_movies_ids = np.concatenate(movies_per_user)
all_movies_ids = sorted( set(list(all_movies_ids.flat))) # ascending order, not essential

In [120]:
# We can create a dataframe with the following columns: movie_id, movie_title
movie_titles = []
movie_original_titles = []
movie_ids = []

for id_ in tqdm(all_movies_ids):
    if not movies_df.loc[movies_df['id']==f"{id_}"].empty:
        movie_titles.append(movies_df.loc[movies_df['id']==f"{id_}"].title.item())
        movie_original_titles.append(movies_df.loc[movies_df['id']==f"{id_}"].original_title.item()) 
        movie_ids.append(id_)

  0%|          | 0/40799 [00:00<?, ?it/s]

In [121]:
movie_titles_id_df = pd.DataFrame.from_dict({'title':movie_titles, 'original_title':movie_original_titles, 'id':movie_ids})
movie_titles_id_df

Unnamed: 0,title,original_title,id
0,Ariel,Ariel,2
1,Shadows in Paradise,Varjoja paratiisissa,3
2,Four Rooms,Four Rooms,5
3,Judgment Night,Judgment Night,6
4,Star Wars,Star Wars,11
...,...,...,...
6815,Foreign Letters,Foreign Letters,176069
6816,Enter the Dangerous Mind,Enter the Dangerous Mind,176077
6817,White Reindeer,White Reindeer,176085
6818,Behind the Rising Sun,Behind the Rising Sun,176143


Now, all the unique ids are in the column `id`, and the index can be used as a hashcode. But first, a list of baskets needs to be created. This is the list `movies_per_user_fitlered`, however not all movies in each basket exist. As a result, the baskets list is created by taking only the movies from each basket that exist in the `movie_titles_df` data frame:

In [123]:
baskets = []
unique_movies_id = movie_titles_id_df.id.to_list()
for user in tqdm(movies_per_user):
    basket = []
    for movie in user:
        if movie in unique_movies_id:
            basket.append(movie)
    baskets.append(basket)

  0%|          | 0/270896 [00:00<?, ?it/s]

This took some time to be created, better save it in a csv file:

In [124]:
baskets_df = pd.DataFrame.from_dict({'basket_no':np.arange(1, len(baskets)+1), 'baskets':baskets})

In [135]:
# Now is the time to remove the users who have rated only a few movies, under a certain threshold
# Keep users who have rated at least min_movies_rated movies above rating_threshold
min_movies_rated = 15

baskets_df_reduced = baskets_df.loc[ np.array( list( map( len, baskets_df.baskets ) ) ) > min_movies_rated ]

In [140]:
baskets_ = baskets_df_reduced.baskets.to_list()

We can now use the a priori algorithm to find 'transcations' above a given support. Use `mlxtend` library.

In [149]:
te = TransactionEncoder()
te_ary = te.fit_transform(baskets_)

In [150]:
df_one_hot = pd.DataFrame(te_ary, columns=te.columns_)
df_one_hot

Unnamed: 0,2,3,5,6,11,12,13,14,15,16,...,175457,175503,175587,175791,175869,176069,176077,176085,176143,176167
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
120150,False,True,False,True,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
120151,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
120152,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
120153,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [151]:
len(df_one_hot)

120155

In [152]:
min_support = 0.02

frq_items = apriori(df_one_hot, min_support = min_support, use_colnames = True)
frq_items['length'] = frq_items['itemsets'].apply(lambda x: len(x))
frq_items

MemoryError: Unable to allocate 51.8 GiB for an array with shape (231540, 2, 120155) and data type bool

In [52]:
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [86]:
if movies_df.loc[movies_df['id']== '32'].empty:
    print('a')

a
