In [2]:
# Importing the important libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import mlxtend.preprocessing
import mlxtend.frequent_patterns

In [8]:
# In the following, you go through the steps of extracting association rules
# from a dataset that records which movies each user has watched and liked.
# The final goal is to create some movie recommendations in the form of rules.
# Where appropriate, the answer to a question is given after the task instruction,
# so you can check whether you are correct.

# Load the data from the Movie_subset.csv file
movie_data = pd.read_csv(
    'Movie_subset.csv',
    encoding='ISO-8859-1',
)

# Look at the first rows: there is one row for each user-movie pair
# (similar to the structure we had with the retail data).
# The important columns are userId and title.
movie_data.head()

Unnamed: 0.1,Unnamed: 0,userId,movieId,title,year,genres
0,1,1323,1,Toy Story,1995,Adventure|Animation|Children|Comedy|Fantasy
1,2,1323,3,Grumpier Old Men,1995,Comedy|Romance
2,3,1323,5,Father of the Bride Part II,1995,Comedy
3,4,1323,10,GoldenEye,1995,Action|Adventure|Thriller
4,5,1323,11,"American President, The",1995,Comedy|Drama|Romance


In [9]:
# Let chardet guess the encoding of the CSV from its first 40,000 bytes;
# the result reports the guessed encoding together with a confidence value.
import chardet
file = 'Movie_subset.csv'
with open(file, mode='rb') as rawdata:
    result = chardet.detect(rawdata.read(40_000))
result

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}

In [9]:
# Build the transactional format as in the lecture example:
# collect the titles of each user into one list, then gather
# those per-user lists into a single list of lists.

movie_list = movie_data.groupby('userId')['title'].apply(list).tolist()

In [10]:
# Create the transactional (one column per movie title) format

# Set up the TransactionEncoder and fit it on the per-user movie lists
encoder = mlxtend.preprocessing.TransactionEncoder()
encoder.fit(movie_list)

# Encode every user's list with the fitted encoder
encoded_data = encoder.transform(movie_list)

# Wrap the encoded array in a dataframe, labelling the columns with the
# item names the encoder collected during fitting
movie_trans = pd.DataFrame(data=encoded_data, columns=encoder.columns_)

print(movie_trans.head())


   'burbs, The  (500) Days of Summer  (Untitled)  *batteries not included  \
0        False                 False       False                    False   
1        False                 False       False                    False   
2        False                 False       False                    False   
3        False                 False       False                    False   
4        False                 False       False                    False   

   ...All the Marbles (California Dolls, The)  ...And Justice for All  \
0                                       False                   False   
1                                       False                   False   
2                                       False                   False   
3                                       False                   False   
4                                       False                   False   

   10 Items or Less  10 Things I Hate About You  10,000 BC  101 Dalmatians  \
0             False 

In [11]:
# Check the size of the encoded data: .shape is (number of rows, number of columns)
# Answer: 100 users (rows) and 4508 movies (columns)

movie_trans.shape

(100, 4508)

In [12]:
# The most frequent movies: count how many rows of the raw data carry each title
# (value_counts lists the titles from most to least frequent)
# Top 3: The Matrix, American Beauty, Fight Club
movie_data['title'].value_counts()


Matrix, The                       60
American Beauty                   57
Fight Club                        54
Silence of the Lambs, The         50
Shawshank Redemption, The         48
                                  ..
Goon                               1
Sorority House Massacre            1
Bugsy Malone                       1
Dawn of the Planet of the Apes     1
EuroTrip                           1
Name: title, Length: 4508, dtype: int64

In [14]:
# Start with the frequent itemsets, using min_support 0.3 and max_len 3
# How many itemsets do we obtain?
# Answer: We have 170 itemsets (use len())

frequent_itemsets = mlxtend.frequent_patterns.apriori(
    movie_trans,
    min_support=0.3,
    max_len=3,
    use_colnames=True,
)

len(frequent_itemsets)

170

In [15]:
# Show the frequent itemsets with their support (long tables are abbreviated in the display)
frequent_itemsets

Unnamed: 0,support,itemsets
0,0.38,(Alien)
1,0.57,(American Beauty)
2,0.33,(American Pie)
3,0.44,(Back to the Future)
4,0.32,(Being John Malkovich)
...,...,...
165,0.33,(Lord of the Rings: The Fellowship of the Ring...
166,0.31,"(Lord of the Rings: The Return of the King, Th..."
167,0.32,"(Pulp Fiction, Matrix, The, Silence of the Lam..."
168,0.34,"(Star Wars: Episode IV - A New Hope, Matrix, T..."


In [16]:
# Extra task not done in the course videos:
# - add a column holding the number of items in each row's itemset
# - check how many itemsets contain 3 items (18 itemsets)
frequent_itemsets['item_len'] = frequent_itemsets['itemsets'].map(len)
frequent_itemsets


Unnamed: 0,support,itemsets,item_len
0,0.38,(Alien),1
1,0.57,(American Beauty),1
2,0.33,(American Pie),1
3,0.44,(Back to the Future),1
4,0.32,(Being John Malkovich),1
...,...,...,...
165,0.33,(Lord of the Rings: The Fellowship of the Ring...,3
166,0.31,"(Lord of the Rings: The Return of the King, Th...",3
167,0.32,"(Pulp Fiction, Matrix, The, Silence of the Lam...",3
168,0.34,"(Star Wars: Episode IV - A New Hope, Matrix, T...",3


In [17]:
# Keep only the itemsets that contain exactly three movies
frequent_itemsets.loc[frequent_itemsets['item_len'].eq(3)]

Unnamed: 0,support,itemsets,item_len
152,0.3,"(Fight Club, Pulp Fiction, American Beauty)",3
153,0.3,"(Matrix, The, Pulp Fiction, American Beauty)",3
154,0.31,"(Pulp Fiction, American Beauty, Silence of the...",3
155,0.3,"(Back to the Future, Raiders of the Lost Ark (...",3
156,0.31,"(Star Wars: Episode IV - A New Hope, Back to t...",3
157,0.3,"(Fight Club, Lord of the Rings: The Fellowship...",3
158,0.3,"(Fight Club, Pulp Fiction, Matrix, The)",3
159,0.3,"(Fight Club, Matrix, The, Silence of the Lambs...",3
160,0.34,"(Fight Club, Pulp Fiction, Silence of the Lamb...",3
161,0.31,"(Matrix, The, Silence of the Lambs, The, Forre...",3


In [18]:
# Different combinations of min_support and max_len, and their impact on the
# number of frequent itemsets:
# min_support 0.3, max_len 4: 171 itemsets
# min_support 0.2, max_len 3: 5251 itemsets
# min_support 0.2, max_len 4: 9348 itemsets
# min_support 0.4, max_len 4: 16 itemsets
# min_support 0.35, max_len 4: 57 itemsets

# (min_support, max_len) pairs, in the same order as the answers listed above
param_grid = [(0.3, 4), (0.2, 3), (0.2, 4), (0.4, 4), (0.35, 4)]

# Run apriori once per combination and print only the number of itemsets found
for support_threshold, length_cap in param_grid:
    candidate_itemsets = mlxtend.frequent_patterns.apriori(
        movie_trans,
        min_support=support_threshold,
        max_len=length_cap,
        use_colnames=True,
    )
    print(len(candidate_itemsets))

171
5251
9348
16
57


In [19]:
# Continue with min_support 0.3 and max_len 4: rebuild the frequent itemsets with apriori
# (this replaces the earlier frequent_itemsets dataframe)

frequent_itemsets = mlxtend.frequent_patterns.apriori(
    movie_trans,
    min_support=0.3,
    max_len=4,
    use_colnames=True,
)

In [20]:
# Let's start with a strong condition on confidence when calling association_rules.
# The exercise text suggests min_threshold 0.9; this cell uses the stricter 0.95.
# Do you get anything useful?


rules = mlxtend.frequent_patterns.association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.95,
)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Lord of the Rings: The Return of the King, The)",(Lord of the Rings: The Fellowship of the Ring...,0.36,0.45,0.35,0.972222,2.160494,0.188,19.8
1,"(Lord of the Rings: The Two Towers, The)",(Lord of the Rings: The Fellowship of the Ring...,0.38,0.45,0.38,1.0,2.222222,0.209,inf
2,"(Lord of the Rings: The Return of the King, The)","(Lord of the Rings: The Two Towers, The)",0.36,0.38,0.35,0.972222,2.55848,0.2132,22.32
3,(Lord of the Rings: The Fellowship of the Ring...,"(Lord of the Rings: The Two Towers, The)",0.35,0.38,0.35,1.0,2.631579,0.217,inf
4,"(Lord of the Rings: The Return of the King, Th...",(Lord of the Rings: The Fellowship of the Ring...,0.35,0.45,0.35,1.0,2.222222,0.1925,inf
5,"(Lord of the Rings: The Return of the King, The)",(Lord of the Rings: The Fellowship of the Ring...,0.36,0.38,0.35,0.972222,2.55848,0.2132,22.32
6,"(Lord of the Rings: The Return of the King, Th...",(Lord of the Rings: The Fellowship of the Ring...,0.31,0.45,0.31,1.0,2.222222,0.1705,inf
7,"(Matrix, The, Lord of the Rings: The Two Tower...",(Lord of the Rings: The Fellowship of the Ring...,0.33,0.45,0.33,1.0,2.222222,0.1815,inf
8,"(Lord of the Rings: The Return of the King, Th...","(Lord of the Rings: The Two Towers, The)",0.31,0.38,0.31,1.0,2.631579,0.1922,inf
9,(Lord of the Rings: The Fellowship of the Ring...,"(Lord of the Rings: The Two Towers, The)",0.31,0.38,0.31,1.0,2.631579,0.1922,inf


In [23]:
# Try again with a confidence threshold of 0.8
# How many rules are extracted? (Answer: 102)
# After this, you can experiment with lift and with different thresholds
rules = mlxtend.frequent_patterns.association_rules(
    frequent_itemsets,
    metric="confidence",
    min_threshold=0.8,
)
rules


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Alien),"(Matrix, The)",0.38,0.60,0.31,0.815789,1.359649,0.0820,2.171429
1,(Pulp Fiction),(American Beauty),0.47,0.57,0.38,0.808511,1.418440,0.1121,2.245556
2,(Back to the Future),"(Matrix, The)",0.44,0.60,0.38,0.863636,1.439394,0.1160,2.933333
3,(Men in Black (a.k.a. MIB)),(Back to the Future),0.38,0.44,0.31,0.815789,1.854067,0.1428,3.040000
4,(Braveheart),"(Matrix, The)",0.38,0.60,0.35,0.921053,1.535088,0.1220,5.066667
...,...,...,...,...,...,...,...,...,...
97,"(Lord of the Rings: The Return of the King, Th...",(Lord of the Rings: The Fellowship of the Ring...,0.31,0.38,0.31,1.000000,2.631579,0.1922,inf
98,"(Lord of the Rings: The Return of the King, Th...",(Lord of the Rings: The Fellowship of the Ring...,0.35,0.37,0.31,0.885714,2.393822,0.1805,5.512500
99,"(Matrix, The, Lord of the Rings: The Two Tower...",(Lord of the Rings: The Fellowship of the Ring...,0.33,0.35,0.31,0.939394,2.683983,0.1945,10.725000
100,"(Lord of the Rings: The Return of the King, The)",(Lord of the Rings: The Fellowship of the Ring...,0.36,0.33,0.31,0.861111,2.609428,0.1912,4.824000


In [24]:
# Extra task: what movie would you recommend to somebody who has seen Pulp Fiction and Fight Club,
# based on the rules extracted with min_confidence 0.8?
# Answer: 3 rules, recommend American Beauty, The Matrix or The Silence of the Lambs
# The mask is True for every rule whose antecedents contain both movies.
selection = rules['antecedents'].map(
    lambda antecedent: {'Pulp Fiction', 'Fight Club'} <= antecedent
)

In [26]:
# Show only the rules picked out by the boolean mask
rules.loc[selection]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
37,"(Fight Club, Pulp Fiction)",(American Beauty),0.37,0.57,0.3,0.810811,1.422475,0.0891,2.272857
51,"(Fight Club, Pulp Fiction)","(Matrix, The)",0.37,0.6,0.3,0.810811,1.351351,0.078,2.114286
56,"(Fight Club, Pulp Fiction)","(Silence of the Lambs, The)",0.37,0.5,0.34,0.918919,1.837838,0.155,6.166667
