In [None]:
# Importing the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlxtend.preprocessing
import mlxtend.frequent_patterns

In [None]:
# In the following, you need to go through the steps of extracting association rules
# from a dataset that contains information on users and which movies they have watched and liked.
# The final goal is to create some movie recommendations in the form of rules.

# Load the data from the Movie_subset.csv file (the path is relative to the
# notebook's working directory)

movie_data = pd.read_csv('Movie_subset.csv')

# Look at the data (using .head()): we have one row for each user-movie pair
# (a structure similar to the retail data).
# The important columns are userId and title; they are the only ones used below.

movie_data.head()

In [None]:
# Build the input for the transactional format as in the lecture example:
# a list of lists, where each inner list holds the titles of one userId group.

movie_list = [
    list(user_titles)
    for _, user_titles in movie_data.groupby(['userId'])['title']
]

# Show the first two users' lists as a sanity check
print(movie_list[:2])

In [None]:
# Create the transactional format

# Set up the TransactionEncoder, then fit it and encode the lists in one step
encoder = mlxtend.preprocessing.TransactionEncoder()
encoded_data = encoder.fit_transform(movie_list)

# Wrap the encoded array in a dataframe, labelling the columns with the
# items the encoder learned during fitting
movie_trans = pd.DataFrame(
    encoded_data,
    columns = encoder.columns_,
)

print(movie_trans.head())

In [None]:
# Check size of data: .shape is (rows, columns). Rows correspond to the per-user
# lists in movie_list, columns to the items (movie titles) known to the encoder.
# Answer: 100 users and 4508 movies

movie_trans.shape

In [None]:
# The most frequent movies: sum each movie column and sort the totals in
# descending order, so the most frequent titles come first
# (assumes each cell marks whether the user's list contains the movie — TODO confirm)
# Top 3: The Matrix, American Beauty, Fight Club

movie_trans.sum().sort_values(ascending = False)

In [None]:
# Start with frequent itemsets, and specify min_support 0.3 and max_len 3
# How many itemsets do we obtain?
# Answer: We have 170 itemsets

frequent_itemsets = mlxtend.frequent_patterns.apriori(movie_trans, min_support = 0.3, max_len = 3, use_colnames = True)

# Display the number of itemsets so the answer above is backed by the cell output
len(frequent_itemsets)

In [None]:
# Extra task (not covered in the course videos):
# - add a column holding the number of items in each row's itemset
# - show the itemsets that contain exactly 3 items (18 itemsets)

frequent_itemsets['item_len'] = frequent_itemsets['itemsets'].map(len)

# Boolean mask on the new column keeps only the 3-item rows
frequent_itemsets.loc[frequent_itemsets['item_len'].eq(3)]

In [None]:
# Different combinations of min_support and max_len, and their impact on the
# number of frequent itemsets
# min_support 0.3, max_len 4: 171 itemsets
# min_support 0.2, max_len 3: 5251 itemsets
# min_support 0.2, max_len 4: 9348 itemsets
# min_support 0.4, max_len 4: 16 itemsets
# min_support 0.35, max_len 4: 57 itemsets

# One (min_support, max_len) pair per summary line above, evaluated in the same
# order; each iteration prints the number of itemsets found for that pair.
for support_threshold, itemset_max_len in [(0.3, 4), (0.2, 3), (0.2, 4), (0.4, 4), (0.35, 4)]:
    print(len(mlxtend.frequent_patterns.apriori(
        movie_trans,
        min_support = support_threshold,
        max_len = itemset_max_len,
        use_colnames = True,
    )))

In [None]:
# Let's go with min_support 0.3 and max_len 4 and create the frequent itemsets with .apriori
# Note: this reassigns frequent_itemsets, replacing the earlier max_len 3 result;
# the rule-extraction cells below read this version.

frequent_itemsets = mlxtend.frequent_patterns.apriori(movie_trans, min_support = 0.3, max_len = 4, use_colnames = True)

In [None]:
# Let's start with a strong condition for confidence when using the association_rules function.
# NOTE(review): the exercise text asks for min_threshold 0.9, but the call below
# passes the stricter 0.95 — confirm which value is intended.
# Do you get anything useful?


rules = mlxtend.frequent_patterns.association_rules(frequent_itemsets, metric = "confidence", min_threshold = 0.95)
rules

In [None]:
# Try with confidence 0.8
# How many rules are extracted? (Answer: 90)
# After this, you can experiment with lift and also different thresholds
# Note: this reassigns rules; the recommendation cell below filters this version.

rules = mlxtend.frequent_patterns.association_rules(frequent_itemsets, metric = "confidence", min_threshold = 0.8)
rules

In [None]:
# Extra task: which movie would you recommend to somebody who has seen
# Pulp Fiction and Fight Club, based on the rules extracted with min_confidence 0.8?
# Answer: 3 rules, recommend American Beauty, The Matrix or The Silence of the Lambs

# Flag the rules whose antecedents contain both titles, then show only those rules
selection = rules['antecedents'].apply(
    lambda antecedent: {'Pulp Fiction', 'Fight Club'}.issubset(antecedent)
)
rules[selection]