## Hw01 Frequent Pattern Mining
### Authors: Matt Turconi, Matt McLaughlin
### Data Mining Spring 2020

In [53]:
import re
from datetime import datetime

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

In [54]:
start_notebook_time = datetime.now()

# Phase 1

Begin by loading in the ratings and movies csv files to dataframes and displaying some info about them

In [55]:
# Load the movie catalogue and key it by movieId so titles and genres can be looked up by id.
df_movies = (
    pd.read_csv("../data/ml-latest-small/movies.csv")
    .set_index("movieId")
)
display(df_movies.head())
df_movies.info()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 2 columns):
title     9742 non-null object
genres    9742 non-null object
dtypes: object(2)
memory usage: 228.3+ KB


In [56]:
# Load the ratings and attach each movie's title.
df_ratings = pd.read_csv("../data/ml-latest-small/ratings.csv")
# Vectorized lookup of the title by movieId (df_movies is indexed by movieId),
# instead of one Python-level `.at` call per rating row.
df_ratings['movieName'] = df_ratings['movieId'].map(df_movies['title'])
# `map` yields NaN for unknown ids where `.at` would have raised, so check explicitly.
assert df_ratings['movieName'].notna().all(), "ratings reference a movieId missing from movies.csv"
display(df_ratings.head(5))
df_ratings.info()

Unnamed: 0,userId,movieId,rating,timestamp,movieName
0,1,1,4.0,964982703,Toy Story (1995)
1,1,3,4.0,964981247,Grumpier Old Men (1995)
2,1,6,4.0,964982224,Heat (1995)
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,964982931,"Usual Suspects, The (1995)"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 5 columns):
userId       100836 non-null int64
movieId      100836 non-null int64
rating       100836 non-null float64
timestamp    100836 non-null int64
movieName    100836 non-null object
dtypes: float64(1), int64(3), object(1)
memory usage: 3.8+ MB


Use the transaction encoder to create a binarized dataframe for each user id

In [57]:
# One "transaction" per user: the list of movie titles that user rated.
movie_lst = df_ratings.groupby("userId")["movieName"].apply(list)

# One-hot encode the transactions: one row per user, one boolean column per title.
te = TransactionEncoder()
te.fit(movie_lst)
movie_trans = te.transform(movie_lst)
df_movie_trans = pd.DataFrame(movie_trans, index=movie_lst.index, columns=te.columns_)
df_movie_trans

Unnamed: 0_level_0,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
607,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
608,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,False
609,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


Now we need to figure out what a good minsup would be.

In [58]:
# Count how many users rated each movie, then express that as a fraction of all users.
n_users = len(df_movie_trans.index)
df_movie_count = (
    df_movie_trans.sum()
    .to_frame("movie_count")
    .sort_values(by="movie_count", ascending=False)
)
df_movie_count["relative support"] = df_movie_count["movie_count"] / n_users
df_movie_count.head(10)

Unnamed: 0,movie_count,relative support
Forrest Gump (1994),329,0.539344
"Shawshank Redemption, The (1994)",317,0.519672
Pulp Fiction (1994),307,0.503279
"Silence of the Lambs, The (1991)",279,0.457377
"Matrix, The (1999)",278,0.455738
Star Wars: Episode IV - A New Hope (1977),251,0.411475
Jurassic Park (1993),238,0.390164
Braveheart (1995),237,0.388525
Terminator 2: Judgment Day (1991),224,0.367213
Schindler's List (1993),220,0.360656


In [59]:
df_movie_count.mean()

movie_count         10.374730
relative support     0.017008
dtype: float64

It looks like the highest support is 0.5 so we probably won't want to go higher than 0.3. Settled with 0.15 as it gives us a good number of frequent movies.

In [60]:
df_movie_freq = apriori(df_movie_trans, min_support=0.15, use_colnames=True).sort_values(by="support", ascending=False)
df_movie_freq

Unnamed: 0,support,itemsets
56,0.539344,(Forrest Gump (1994))
124,0.519672,"(Shawshank Redemption, The (1994))"
114,0.503279,(Pulp Fiction (1994))
128,0.457377,"(Silence of the Lambs, The (1991))"
95,0.455738,"(Matrix, The (1999))"
...,...,...
2141,0.150820,"(Pulp Fiction (1994), Seven (a.k.a. Se7en) (19..."
2739,0.150820,(Star Wars: Episode V - The Empire Strikes Bac...
1288,0.150820,"(Independence Day (a.k.a. ID4) (1996), Back to..."
1278,0.150820,"(Speed (1994), Apollo 13 (1995), Terminator 2:..."


Now we need to create the rules. We decided to use a min threshold of 0.2 as it is slightly higher than our previous threshold and will allow us to eliminate more rules. 

In [61]:
# Build rules from the frequent itemsets, thresholding on rule support (not confidence).
df_movie_rules = association_rules(df_movie_freq, min_threshold=0.2, metric='support')
# Strongest rules first when ranked by support.
rules_by_support = df_movie_rules.sort_values('support', ascending=False)
rules_by_support.head(5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(Shawshank Redemption, The (1994))",(Forrest Gump (1994)),0.519672,0.539344,0.378689,0.728707,1.351097,0.098406,1.697998
1,(Forrest Gump (1994)),"(Shawshank Redemption, The (1994))",0.539344,0.519672,0.378689,0.702128,1.351097,0.098406,1.612529
2,(Pulp Fiction (1994)),(Forrest Gump (1994)),0.503279,0.539344,0.377049,0.749186,1.389068,0.105609,1.83664
3,(Forrest Gump (1994)),(Pulp Fiction (1994)),0.539344,0.503279,0.377049,0.699088,1.389068,0.105609,1.65072
4,(Pulp Fiction (1994)),"(Shawshank Redemption, The (1994))",0.503279,0.519672,0.363934,0.723127,1.391506,0.102395,1.734831


This function will allow us to output a nicely formatted rule from a table of rules given a dataframe containing association rules and a number that corresponds to the rule that is to be formatted.

In [62]:
## Matt M's version. Different formatting.
def format_rule(df, rule_num):
    """
    Render one association rule as a multi-line, human-readable string.

    df: dataframe of rules with `support`, `confidence`, `lift`,
        `antecedents` and `consequents` columns.
    rule_num: positional (iloc) row number of the rule to format.

    Returns a string with the three metrics on separate lines, followed by the
    antecedent items joined with " & ", an arrow line, and the consequent items
    joined the same way. Items are emitted in the iteration order of the stored
    collection.
    """
    rl = df.iloc[rule_num]
    header = "Support: %0.4f \nConfidence: %0.4f \nLift: %0.4f\n" % (rl.support, rl.confidence, rl.lift)
    # join() copes with empty sides and non-string items; the previous
    # "append ' & ' then slice off three characters" approach ate the end of the
    # preceding text when a side was empty and raised on non-string items.
    antecedent = " & ".join(str(item) for item in rl.antecedents)
    consequent = " & ".join(str(item) for item in rl.consequents)
    return header + antecedent + "\n  -->  \n" + consequent + "\n"

In [63]:
for i in range(5):
    print(format_rule(rules_by_support, i))

Support: 0.3787 
Confidence: 0.7287 
Lift: 1.3511
Shawshank Redemption, The (1994)
  -->  
Forrest Gump (1994)

Support: 0.3787 
Confidence: 0.7021 
Lift: 1.3511
Forrest Gump (1994)
  -->  
Shawshank Redemption, The (1994)

Support: 0.3770 
Confidence: 0.7492 
Lift: 1.3891
Pulp Fiction (1994)
  -->  
Forrest Gump (1994)

Support: 0.3770 
Confidence: 0.6991 
Lift: 1.3891
Forrest Gump (1994)
  -->  
Pulp Fiction (1994)

Support: 0.3639 
Confidence: 0.7231 
Lift: 1.3915
Pulp Fiction (1994)
  -->  
Shawshank Redemption, The (1994)



#### Rules with *High Support*
* (Shawshank Redemption, The (1994)) -> (Forrest Gump (1994)) support=0.378689
* (Forrest Gump (1994)) -> (Shawshank Redemption, The (1994)) support=0.378689
* (Pulp Fiction (1994)) -> (Forrest Gump (1994)) support=0.377049

Above you can see the five rules with the highest support. We also see that each of these has a lift greater than one, so we know that they are all likely dependent on one another. However, we do not believe that these rules are interesting. If you go on IMDB, each of these movies is rated in the top 20 best movies of the modern era. So we venture on to find some interesting rules. 

In [64]:
rules_by_confidence = df_movie_rules.sort_values(by='confidence', ascending=False)
rules_by_lift = df_movie_rules.sort_values(by='lift', ascending=False)

for i in range(5):
    print(format_rule(rules_by_confidence, i))

Support: 0.2049 
Confidence: 0.9843 
Lift: 2.3920
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) & Star Wars: Episode VI - Return of the Jedi (1983)
  -->  
Star Wars: Episode IV - A New Hope (1977)

Support: 0.2000 
Confidence: 0.9839 
Lift: 2.3911
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) & Star Wars: Episode V - The Empire Strikes Back (1980) & Star Wars: Episode VI - Return of the Jedi (1983)
  -->  
Star Wars: Episode IV - A New Hope (1977)

Support: 0.2000 
Confidence: 0.9839 
Lift: 2.3911
Matrix, The (1999) & Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981) & Star Wars: Episode V - The Empire Strikes Back (1980)
  -->  
Star Wars: Episode IV - A New Hope (1977)

Support: 0.2131 
Confidence: 0.9774 
Lift: 3.0113
Matrix, The (1999) & Lord of the Rings: The Return of the King, The (2003) & Lord of the Rings: The Two Towers, The (2002)
  -->  
Lord of the Rings: The Fellowship of the R

From sorting by Confidence we obtain the following interesting rules:
* If you watched the Godfather part 2, you are very likely to have watched the first Godfather as well. This just supports the idea that if someone has watched a later movie in a series, they are likely to have watched earlier movies in the series. The reverse is not shown to be as true.
* We also notice that there are rules for Star Wars 4 -> Star Wars 6, and a rule for Star Wars 5 -> Star Wars 6 but there is no rule that says you are likely to watch Star Wars 5 if you've watched Star Wars 4 with high confidence. We have no doubt that the rule does in fact exist, It is just quite interesting that it is not a rule with very high confidence. 
* Looking at the few rules that have a Lord of the Rings movie -> another Lord of the Rings movie, we see that each of these rules has a super high support. An explanation as to why they are so dependent on one another could be that the movies have a huge following because the book has been around for a very long time. This would mean that there is a large number of people that want to watch the movies after reading the book. 

In [65]:
for i in range(5):
    print(format_rule(rules_by_lift, i))

Support: 0.2131 
Confidence: 0.8784 
Lift: 3.2278
Matrix, The (1999) & Lord of the Rings: The Return of the King, The (2003)
  -->  
Lord of the Rings: The Fellowship of the Ring, The (2001) & Lord of the Rings: The Two Towers, The (2002)

Support: 0.2131 
Confidence: 0.7831 
Lift: 3.2278
Lord of the Rings: The Fellowship of the Ring, The (2001) & Lord of the Rings: The Two Towers, The (2002)
  -->  
Matrix, The (1999) & Lord of the Rings: The Return of the King, The (2003)

Support: 0.2131 
Confidence: 0.8725 
Lift: 3.2255
Matrix, The (1999) & Lord of the Rings: The Two Towers, The (2002)
  -->  
Lord of the Rings: The Fellowship of the Ring, The (2001) & Lord of the Rings: The Return of the King, The (2003)

Support: 0.2131 
Confidence: 0.7879 
Lift: 3.2255
Lord of the Rings: The Fellowship of the Ring, The (2001) & Lord of the Rings: The Return of the King, The (2003)
  -->  
Matrix, The (1999) & Lord of the Rings: The Two Towers, The (2002)

Support: 0.2131 
Confidence: 0.7027 
Lif

From sorting by High Lift
* We see for the first time the appearance of rules that involve more than one movie. It's also very interesting to see how much Lord of the Rings pops up in these rules!! It seems like these rules are pretty darn strong and all fairly dependent on each other. So I would say that you could predict with pretty good certainty the habits of a viewer if they are watching a Lord of the Rings movie. 

# Phase 2

#### Write the above in a function, dependent on variable genre. Run function on 3 different genres.

In [66]:
def make_genre_df(genres, df_ratings2=df_ratings, min_support=0.15, min_rule_support=0.2):
    '''
    Make an association rules dataframe only for certain genres.

    genres: a genre name, or a list of genre names. A movie is kept when its
        `genres` entry in the notebook-level `df_movies` contains at least one
        of them.
    df_ratings2: ratings dataframe with `userId`, `movieId` and `movieName` columns.
    min_support: minimum support passed to `apriori`.
    min_rule_support: minimum rule support passed to `association_rules`.

    Returns the dataframe produced by `association_rules`.
    '''
    # A bare string would otherwise be iterated character by character below.
    if isinstance(genres, str):
        genres = [genres]

    # Decide once per movie (rather than once per rating) whether it matches a requested genre.
    genre_hit = df_movies['genres'].apply(
        lambda movie_genres: any(el in movie_genres for el in genres)
    )
    matching_ids = genre_hit[genre_hit].index
    # reset_index returns a new frame; the result has to be kept for it to take effect.
    df_ratings2 = df_ratings2[df_ratings2.movieId.isin(matching_ids)].reset_index(drop=True)

    # One transaction (list of rated titles) per user, then one-hot encode.
    te = TransactionEncoder()
    movie_lst = df_ratings2.groupby("userId")["movieName"].apply(list)
    movie_trans = te.fit(movie_lst).transform(movie_lst)
    df_movie_trans = pd.DataFrame(movie_trans, columns=te.columns_, index=movie_lst.index)

    df_movie_freq = apriori(df_movie_trans, min_support=min_support, use_colnames=True).sort_values(by="support", ascending=False)
    df_movie_rules = association_rules(df_movie_freq, metric='support', min_threshold=min_rule_support)

    return df_movie_rules

In [67]:
def rules_by_genre(genres=None, top_n=5):
    '''
    Print the strongest association rules for movies in the given genres.

    genres: list of genre-name strings. If it is None or empty, an error
        message is printed and None is returned.
    top_n: maximum number of rules printed per ranking (default 5).

    Prints up to `top_n` rules ranked by support, then by lift, then by
    confidence. Returns None.
    '''
    if genres is None or len(genres) == 0:
        print("Error- No genre(s) inputted")
        return None

    # Generate the rules dataframe for the requested genre(s).
    genre_movie_rules = make_genre_df(genres)

    # (heading, ranking metric) for each section, in print order.
    sections = [
        ("High Support Rules", "support"),
        ("\nHigh Lift Rules", "lift"),
        ("\nHigh Confidence Rules", "confidence"),
    ]
    for heading, metric in sections:
        ranked = genre_movie_rules.sort_values(by=metric, ascending=False)
        print(heading)
        # Never index past the end when fewer than top_n rules exist.
        for i in range(min(top_n, ranked.shape[0])):
            print(format_rule(ranked, i))

In [68]:
rules_by_genre(["Comedy"])

High Support Rules
Support: 0.3777 
Confidence: 0.7492 
Lift: 1.3868
Pulp Fiction (1994)
  -->  
Forrest Gump (1994)

Support: 0.3777 
Confidence: 0.6991 
Lift: 1.3868
Forrest Gump (1994)
  -->  
Pulp Fiction (1994)

Support: 0.2529 
Confidence: 0.7163 
Lift: 1.3259
Toy Story (1995)
  -->  
Forrest Gump (1994)

Support: 0.2529 
Confidence: 0.4681 
Lift: 1.3259
Forrest Gump (1994)
  -->  
Toy Story (1995)

Support: 0.2365 
Confidence: 0.4691 
Lift: 1.5782
Pulp Fiction (1994)
  -->  
Fargo (1996)


High Lift Rules
Support: 0.2217 
Confidence: 0.4103 
Lift: 1.7354
Forrest Gump (1994)
  -->  
Mrs. Doubtfire (1993)

Support: 0.2217 
Confidence: 0.9375 
Lift: 1.7354
Mrs. Doubtfire (1993)
  -->  
Forrest Gump (1994)

Support: 0.2053 
Confidence: 0.3799 
Lift: 1.7139
Forrest Gump (1994)
  -->  
Pretty Woman (1990)

Support: 0.2053 
Confidence: 0.9259 
Lift: 1.7139
Pretty Woman (1990)
  -->  
Forrest Gump (1994)

Support: 0.2365 
Confidence: 0.4691 
Lift: 1.5782
Pulp Fiction (1994)
  -->  
Farg

#### Comedy Comments
* One thing that we notice from the high support rules is that it recommends watching Forrest Gump if you've watched Toy Story. This is a pretty bad rule in reality, because Toy Story is a kids movie and Forrest Gump is a little more adult.
* Mrs. Doubtfire and Forrest Gump is a pretty strong rule on both the lift side and its actual measure of confidence. We also believe this rule to be pretty darn true as well.
* Also, is Pulp Fiction really a comedy? It's pretty dark. The same could be said about a lot of the other movies in this genre as well.

In [69]:
rules_by_genre(["Drama"])

High Support Rules
Support: 0.3787 
Confidence: 0.7287 
Lift: 1.3511
Shawshank Redemption, The (1994)
  -->  
Forrest Gump (1994)

Support: 0.3787 
Confidence: 0.7021 
Lift: 1.3511
Forrest Gump (1994)
  -->  
Shawshank Redemption, The (1994)

Support: 0.3770 
Confidence: 0.7492 
Lift: 1.3891
Pulp Fiction (1994)
  -->  
Forrest Gump (1994)

Support: 0.3770 
Confidence: 0.6991 
Lift: 1.3891
Forrest Gump (1994)
  -->  
Pulp Fiction (1994)

Support: 0.3639 
Confidence: 0.7231 
Lift: 1.3915
Pulp Fiction (1994)
  -->  
Shawshank Redemption, The (1994)


High Lift Rules
Support: 0.2049 
Confidence: 0.6510 
Lift: 3.0786
Godfather, The (1972)
  -->  
Godfather: Part II, The (1974)

Support: 0.2049 
Confidence: 0.9690 
Lift: 3.0786
Godfather: Part II, The (1974)
  -->  
Godfather, The (1972)

Support: 0.2098 
Confidence: 0.5872 
Lift: 2.0009
Fight Club (1999)
  -->  
Sixth Sense, The (1999)

Support: 0.2098 
Confidence: 0.7151 
Lift: 2.0009
Sixth Sense, The (1999)
  -->  
Fight Club (1999)

Supp

### Drama Comments
* Nothing really that interesting came out of the highest support rules
* From our rules with High lift we get an interesting rule about watching Pulp Fiction and Forrest Gump would mean that person is likely to watch both Braveheart and Shawshank Redemption. This is a pretty interesting rule because we have seen that both Pulp Fiction and Forrest Gump are watched by a lot of people, so we could make reasonable recommendations to many others as well as these two movies seem to be very popular.
* From our confidence rules we see that we can be fairly certain there is a relation between the following 4 movies, Forrest Gump, Pulp Fiction, Shawshank Redemption, and Braveheart

In [70]:
rules_by_genre(["Horror", "Thriller"])

High Support Rules
Support: 0.3399 
Confidence: 0.7419 
Lift: 1.4718
Silence of the Lambs, The (1991)
  -->  
Pulp Fiction (1994)

Support: 0.3399 
Confidence: 0.6743 
Lift: 1.4718
Pulp Fiction (1994)
  -->  
Silence of the Lambs, The (1991)

Support: 0.2956 
Confidence: 0.6475 
Lift: 1.8088
Matrix, The (1999)
  -->  
Fight Club (1999)

Support: 0.2956 
Confidence: 0.8257 
Lift: 1.8088
Fight Club (1999)
  -->  
Matrix, The (1999)

Support: 0.2923 
Confidence: 0.5798 
Lift: 1.2701
Pulp Fiction (1994)
  -->  
Matrix, The (1999)


High Lift Rules
Support: 0.2233 
Confidence: 0.7640 
Lift: 2.4619
True Lies (1994)
  -->  
Batman (1989)

Support: 0.2233 
Confidence: 0.7196 
Lift: 2.4619
Batman (1989)
  -->  
True Lies (1994)

Support: 0.2118 
Confidence: 0.6386 
Lift: 2.4007
Independence Day (a.k.a. ID4) (1996)
  -->  
Mission: Impossible (1996)

Support: 0.2118 
Confidence: 0.7963 
Lift: 2.4007
Mission: Impossible (1996)
  -->  
Independence Day (a.k.a. ID4) (1996)

Support: 0.2118 
Confide

### Horror and Thriller Comments
* Finally we're getting some seriously interesting rules.
* First from the Support rules, we don't see anything that interesting, but when we take a look at the rules by Lift we see Batman and True Lies, which is a pretty neat recommendation because it doesn't involve some of the movies that are cropping up over and over again.
* We also see that if you watched The Fugitive, True Lies would also be a good recommendation.
* Looking at our confidence, we see that the first two rules both involve Silence of the Lambs and Pulp Fiction as well as two other movies that would point toward someone wanting to watch Pulp Fiction.
* Lastly, we think a person watching The Sixth Sense and The Matrix is an interesting rule as well.

# Phase 3

#### Functions to clean genre data

In [71]:
def split_list(string):
    """Return the pieces of `string` (coerced to str) separated by '|'."""
    return str(string).split('|')

In [72]:
def flatten_list(lst):
    """
    Flatten a collection of sequences into one list of unique items,
    keeping the order in which each item is first seen.
    """
    unique_items = []
    for sequence in lst:
        for position in range(len(sequence)):
            item = sequence[position]
            if item not in unique_items:
                unique_items.append(item)
    return unique_items

#### Group by customer, make list of all genres that customer watched

In [73]:
# Turn each movie's "A|B|C" genre string into a list of genre names.
# The isinstance guard keeps this cell idempotent: re-running it would otherwise
# stringify the already-split lists and split that text again.
df_movies['genres'] = df_movies['genres'].apply(
    lambda g: g if isinstance(g, list) else split_list(g)
)
# Attach the genre list of the rated movie to every rating row.
df_ratings['movieGenres'] = df_ratings.movieId.apply(lambda x: df_movies.at[x, 'genres'])
# One row per user: the distinct genres across everything that user rated.
user_genres = df_ratings.groupby(by="userId")["movieGenres"].apply(flatten_list)
df_users_genres = user_genres.to_frame("Genre")

#### Apply transaction encoder

In [74]:
# One-hot encode each user's genre list: one boolean column per genre.
genre_matrix = te.fit(df_users_genres.Genre).transform(df_users_genres.Genre)
# Keep the userId index so rows stay traceable to users, as in the other encoded frames.
df_genre_bin = pd.DataFrame(genre_matrix, columns=te.columns_, index=df_users_genres.index)

#### Now feed the transaction lists into an Apriori algorithm to generate rules

In [75]:
genre_rules = apriori(df_genre_bin, min_support=0.65, use_colnames=True).sort_values(by="support", ascending=False)

In [76]:
genre_rules.head()

Unnamed: 0,support,itemsets
6,1.0,(Drama)
103,0.998361,"(Drama, Thriller)"
74,0.998361,"(Drama, Comedy)"
14,0.998361,(Thriller)
4,0.998361,(Comedy)


In [77]:
# Derive association rules from the frequent genre itemsets, thresholding on confidence.
# NOTE(review): this rebinds `genre_rules` from the apriori itemsets to the rules table,
# so re-running this cell on its own passes rules (not itemsets) back into
# association_rules - presumably that fails; consider a separate name for the itemsets.
genre_rules = association_rules(genre_rules, metric='confidence', min_threshold=0.7)

In [78]:
rules_by_confidence = genre_rules.sort_values(by="confidence", ascending=False)

In [79]:
# Bucket the rules by how many genres appear on the antecedent side.
antecedent_sizes = genre_rules.antecedents.apply(len)
ant_3_rules = genre_rules[antecedent_sizes == 3]
ant_2_rules = genre_rules[antecedent_sizes == 2]
ant_1_rules = genre_rules[antecedent_sizes == 1]

In [80]:
ant_3_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
48,"(Drama, Thriller, Comedy)",(Action),0.996721,0.996721,0.993443,0.996711,0.999989,-1.1e-05,0.996721
49,"(Drama, Thriller, Action)",(Comedy),0.995082,0.998361,0.993443,0.998353,0.999992,-8e-06,0.995082
50,"(Drama, Comedy, Action)",(Thriller),0.995082,0.998361,0.993443,0.998353,0.999992,-8e-06,0.995082
51,"(Thriller, Comedy, Action)",(Drama),0.993443,1.0,0.993443,1.0,1.0,0.0,inf
74,"(Drama, Romance, Thriller)",(Comedy),0.991803,0.998361,0.991803,1.0,1.001642,0.001626,inf


In [81]:
# Show the first two rules with three antecedents, then the first two with two antecedents.
for rule_table in (ant_3_rules, ant_2_rules):
    for position in range(2):
        print(format_rule(rule_table, position))
    print('\n')

# And the rule in second position when ranked by confidence.
print(format_rule(rules_by_confidence, 1))

Support: 0.9934 
Confidence: 0.9967 
Lift: 1.0000
Drama & Thriller & Comedy
  -->  
Action

Support: 0.9934 
Confidence: 0.9984 
Lift: 1.0000
Drama & Thriller & Action
  -->  
Comedy



Support: 0.9967 
Confidence: 0.9984 
Lift: 1.0000
Drama & Thriller
  -->  
Comedy

Support: 0.9967 
Confidence: 0.9984 
Lift: 1.0000
Drama & Comedy
  -->  
Thriller



Support: 0.6918 
Confidence: 1.0000 
Lift: 1.0016
Romance & Adventure & Horror & Action & Musical
  -->  
Thriller



In [82]:
ant_1_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(Drama),(Thriller),1.0,0.998361,0.998361,0.998361,1.0,0.0,1.0
1,(Thriller),(Drama),0.998361,1.0,0.998361,1.0,1.0,0.0,inf
2,(Drama),(Comedy),1.0,0.998361,0.998361,0.998361,1.0,0.0,1.0
3,(Comedy),(Drama),0.998361,1.0,0.998361,1.0,1.0,0.0,inf
7,(Drama),"(Thriller, Comedy)",1.0,0.996721,0.996721,0.996721,1.0,0.0,1.0


In [83]:
rules_by_confidence.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
269085,"(Animation, Romance, Fantasy, Crime)","(Drama, Adventure)",0.837705,0.993443,0.837705,1.0,1.006601,0.005493,inf
4541600,"(Romance, Adventure, Horror, Action, Musical)",(Thriller),0.691803,0.998361,0.691803,1.0,1.001642,0.001134,inf
4541384,"(War, Horror, Sci-Fi, Crime, Fantasy, Animation)","(Drama, Comedy)",0.691803,0.998361,0.691803,1.0,1.001642,0.001134,inf
4541358,"(War, Horror, Comedy, Sci-Fi, Crime, Fantasy, ...",(Drama),0.691803,1.0,0.691803,1.0,1.0,0.0,inf
4541355,"(Drama, War, Horror, Sci-Fi, Crime, Fantasy, A...",(Comedy),0.691803,0.998361,0.691803,1.0,1.001642,0.001134,inf


#### Comments on Genre Rules
* In the first two blocks we see an enormous correlation between "Comedy & Drama & Thriller --> Action" and "Comedy & Drama & Action --> Thriller". We went back to look at the top 4 rules with 3 antecedents, and they are all of the variations of Comedy, Drama, Thriller, and Action. This allows us to conclude not that these genres have a high dependence on each other, but that they are all so popular that most people will have watched all four of these genres.
* Examining the rules with 2 antecedents reinforces the trend above, as "Comedy & Drama --> Thriller" and "Comedy & Thriller --> Drama". The support and confidence for these rules are slightly stronger than those of the 3-antecedent rules, suggesting it is no accident that Action is left out, and we can say it is slightly less popular as an individual genre. We output the first 10 items in ant_1_rules just to look at this more, and Action doesn't show up at all until 8th place.
* Our final print statement (and the rules_by_confidence.head()) suggests that it is not hard to find perfect confidence in this dataset. This is possibly a flaw of the dataset.

# Phase 4

## Phase 4 Idea 1

In [84]:
df_tags = pd.read_csv("../data/ml-latest-small/tags.csv")

In [85]:
def is_proper_noun(string):
    '''
    Heuristic check for a two-word proper noun.

    Returns True only when `string` consists of exactly two words
    (separated by a single space) and each word begins with an
    uppercase letter; otherwise returns False.
    '''
    words = string.split(" ")
    return len(words) == 2 and all(word[0:1].isupper() for word in words)

#### Get only the tags with Actor Names

In [86]:
# Keep only tags that look like two-word proper nouns; the timestamp is not needed.
looks_like_name = df_tags.tag.apply(is_proper_noun)
df_actors = df_tags[looks_like_name].drop(columns="timestamp")

#### Reset col names

In [87]:
df_actors.columns = ["userId", "movieId", "actor"]
df_actors.head()

Unnamed: 0,userId,movieId,actor
5,2,89774,Tom Hardy
7,2,106782,Leonardo DiCaprio
8,2,106782,Martin Scorsese
10,18,431,Al Pacino
13,18,1221,Al Pacino


In [88]:
def remove_dupes(lst):
    """Return the items of `lst` with duplicates dropped, in first-seen order."""
    first_seen = {}
    for item in lst:
        first_seen.setdefault(item, None)
    return list(first_seen)

#### Group by the user ID and create the transaction list

In [89]:
# One transaction per user: the distinct names that user tagged.
actor_list = df_actors.groupby("userId")["actor"].apply(
    lambda names: remove_dupes(list(names))
)
# One-hot encode: one row per user, one boolean column per name.
te.fit(actor_list)
actors_trans = te.transform(actor_list)
df_actor_trans = pd.DataFrame(actors_trans, index=actor_list.index, columns=te.columns_)
df_actor_trans.head()

Unnamed: 0_level_0,AS Byatt,Adam Sandler,Adrien Brody,Agatha Christie,Al Pacino,Alfred Hitchcock,Alicia Vikander,Amazing Cinematography,American Indians,Amy Adams,...,Van Gogh,Viggo Mortensen,Visually Striking,Wall Street,Well Done,Wesley Snipes,Will Ferrell,Will Smith,Woody Harrelson,Zooey Deschanel
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
18,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
62,False,True,True,False,True,False,True,False,False,True,...,False,True,False,False,False,False,True,True,True,False
125,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
132,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


#### Run the apriori alg

In [90]:
df_actor_freq = apriori(df_actor_trans, min_support=0.05, use_colnames=True)
df_actor_freq.head()

Unnamed: 0,support,itemsets
0,0.130435,(Adam Sandler)
1,0.130435,(Al Pacino)
2,0.086957,(Alfred Hitchcock)
3,0.086957,(Anne Hathaway)
4,0.086957,(Arnold Schwarzenegger)


#### Generate the association rules

In [91]:
# Build rules thresholded on support, then rank them with the highest support first.
df_actor_rules = association_rules(df_actor_freq, min_threshold=0.08, metric='support')
df_actor_rules = df_actor_rules.sort_values("support", ascending=False)

#### Interesting Actor Rules

In [92]:
# Print a hand-picked subset of rules, addressed by row position in the
# support-sorted df_actor_rules table (format_rule looks rows up by position).
for i in [0,2,4,5,6]:
    print(format_rule(df_actor_rules, i))

Support: 0.1739 
Confidence: 1.0000 
Lift: 5.7500
Quentin Tarantino
  -->  
Brad Pitt

Support: 0.1304 
Confidence: 1.0000 
Lift: 4.6000
Tom Hanks
  -->  
Leonardo DiCaprio

Support: 0.0870 
Confidence: 1.0000 
Lift: 11.5000
Ben Stiller & Jude Law
  -->  
Brad Pitt & Quentin Tarantino & Tim Burton & Will Ferrell & Leonardo DiCaprio

Support: 0.0870 
Confidence: 1.0000 
Lift: 11.5000
Jude Law & Leonardo DiCaprio & Will Ferrell
  -->  
Ben Stiller & Quentin Tarantino & Tim Burton & Brad Pitt

Support: 0.0870 
Confidence: 1.0000 
Lift: 11.5000
Quentin Tarantino & Tim Burton & Brad Pitt
  -->  
Ben Stiller & Jude Law & Leonardo DiCaprio & Will Ferrell



## Phase 4 Idea 2
#### Generating new association rules with decade - genre, ex "1970s Horror"

In [93]:
def decade_parser(movie_name):
    """
    Return the decade prefix for a title that ends in "(YYYY)", e.g. "1990s ".

    The trailing space is intentional: the result is concatenated directly
    with a genre name (see genre_parser).

    Trailing whitespace after the year and year ranges such as "(2006-2007)"
    are tolerated (the first year decides the decade). Titles without a
    recognisable year return "Unknown " rather than an arbitrary slice of
    the title text.
    """
    # Fixed-offset slicing (movie_name[-5:-2]) only works when the title ends exactly in "(YYYY)".
    year_match = re.search(r"\((\d{3})\d(?:\s*[-–]\s*\d{4})?\)\s*$", movie_name)
    if year_match is None:
        return "Unknown "
    return year_match.group(1) + "0s "

In [94]:
def genre_parser(decade, genre_list):
    """Prefix every entry of `genre_list` with `decade` and return the new list."""
    return [str(decade + genre) for genre in genre_list]

#### Add a new column to dataframe with "decade " + genre

In [95]:
# Decade prefix for every movie, derived from its title.
df_movies["Decade"] = df_movies.title.apply(decade_parser)

# Per movie: its genres, each prefixed with the movie's decade.
df_movies["Decade Genre"] = pd.Series(
    [
        genre_parser(decade, movie_genres)
        for decade, movie_genres in zip(df_movies["Decade"], df_movies["genres"])
    ],
    index=df_movies.index,
)

# Attach the decade-genre list of the rated movie to every rating row.
df_ratings['Decade Genre'] = df_ratings.movieId.apply(
    lambda movie_id: df_movies.at[movie_id, 'Decade Genre']
)

# Group by user: the distinct decade-genres across everything that user rated.
user_decade_genres = df_ratings.groupby(by="userId")["Decade Genre"].apply(flatten_list)
df_users_genres = user_decade_genres.to_frame("Decade Genre")

df_users_genres.head()

Unnamed: 0_level_0,Decade Genre
userId,Unnamed: 1_level_1
1,"[1990s Adventure, 1990s Animation, 1990s Child..."
2,"[1990s Crime, 1990s Drama, 1990s Comedy, 1990s..."
3,"[1990s Drama, 1990s War, 1990s Action, 1990s C..."
4,"[1990s Comedy, 1990s Crime, 1990s Thriller, 19..."
5,"[1990s Adventure, 1990s Animation, 1990s Child..."


In [96]:
#### Apply transaction encoder
# One-hot encode each user's decade-genre list, keeping the userId index.
decade_genre_matrix = te.fit(df_users_genres["Decade Genre"]).transform(df_users_genres["Decade Genre"])
df_genre_bin = pd.DataFrame(decade_genre_matrix, columns=te.columns_, index=df_users_genres.index)

# Frequent itemsets first, under their own name, so `genre_rules` only ever holds rules.
decade_genre_itemsets = apriori(df_genre_bin, min_support=0.65, use_colnames=True).sort_values(by="support", ascending=False)

# Rules derived from those itemsets, thresholded on confidence.
genre_rules = association_rules(decade_genre_itemsets, metric='confidence', min_threshold=0.7)

In [97]:
genre_rules.head()

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(1990s Drama),(1990s Comedy),0.985246,0.967213,0.959016,0.973378,1.006374,0.006074,1.231557
1,(1990s Comedy),(1990s Drama),0.967213,0.985246,0.959016,0.991525,1.006374,0.006074,1.740984
2,(1990s Drama),(1990s Thriller),0.985246,0.952459,0.947541,0.96173,1.009734,0.009135,1.242267
3,(1990s Thriller),(1990s Drama),0.952459,0.985246,0.947541,0.994836,1.009734,0.009135,2.857377
4,(1990s Drama),(1990s Action),0.985246,0.945902,0.942623,0.956739,1.011457,0.010677,1.250504


#### These rules are not as interesting as I would hope. They for the most part follow the genre rules that do not include decade, from decade X to decade X. I tried to modify this to only show rules that mapped across different decades but had trouble working with frozensets.

## Phase 4 Idea 3
#### Using the first set of movie recommendations (df_movie_rules), filter out any movies that have an average rating less than 3.5

#### Need to get an average rating for each movie.

In [98]:
# Mean rating per movie, aligned onto df_movies by its movieId index.
# Select `rating` before aggregating: df_ratings also carries text and list columns,
# which a frame-wide mean() cannot average (and rejects in recent pandas versions).
df_movies['avgRating'] = df_ratings.groupby("movieId")["rating"].mean()
df_movies.head()

Unnamed: 0_level_0,title,genres,Decade,Decade Genre,avgRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1990s,"[1990s Adventure, 1990s Animation, 1990s Child...",3.92093
2,Jumanji (1995),"[Adventure, Children, Fantasy]",1990s,"[1990s Adventure, 1990s Children, 1990s Fantasy]",3.431818
3,Grumpier Old Men (1995),"[Comedy, Romance]",1990s,"[1990s Comedy, 1990s Romance]",3.259615
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1990s,"[1990s Comedy, 1990s Drama, 1990s Romance]",2.357143
5,Father of the Bride Part II (1995),[Comedy],1990s,[1990s Comedy],3.071429


In [99]:
df_movie_titles = df_movies.set_index("title")
def has_bad_review(movie, threshold=3.5, ratings=None):
    """Tell whether any of the given movie titles is rated below `threshold`.

    Parameters
    ----------
    movie : iterable of str
        Movie titles to check (e.g. the antecedents or consequents of a rule).
    threshold : float, default 3.5
        Average ratings strictly below this value count as "bad".
    ratings : pandas.Series, optional
        Average ratings indexed by movie title. Defaults to the 'avgRating'
        column of the notebook-level `df_movie_titles` table.

    Returns
    -------
    bool
        True if at least one title has an average rating below `threshold`,
        otherwise False (including for an empty collection of titles).

    Raises
    ------
    KeyError
        If a title is not present in the ratings index.
    """
    if ratings is None:
        ratings = df_movie_titles['avgRating']
    # Looking up with a one-element list always yields a Series, so a title that
    # appears more than once in the index is handled the same way as a unique one.
    # A missing (NaN) rating never compares as below the threshold.
    return any((ratings.loc[[title]] < threshold).any() for title in movie)

#### Eliminate any rules that include a movie with a low average rating (< 3.5)

In [100]:
# Keep only the rules in which no movie, on either side of the rule, has a bad
# review. The mask is built in a single pass over the antecedent/consequent pairs
# instead of dropping rows one at a time, and it never mixes index labels with
# positional (.iloc) access, so it works whatever index df_movie_rules carries.
keep_rule = np.array(
    [
        not (has_bad_review(ant) or has_bad_review(cons))
        for ant, cons in zip(df_movie_rules.antecedents, df_movie_rules.consequents)
    ],
    dtype=bool,
)
df_rules_filtered = df_movie_rules.loc[keep_rule]
print("Rules eliminated: " + str(df_movie_rules.shape[0] - df_rules_filtered.shape[0]))

Rules eliminated: 96


#### As we can see, we have eliminated 96 bad rules! We can now use the set of rules below to recommend movies to customers that are actually good!

In [101]:
# Rank the surviving rules from highest to lowest confidence and show
# positions 20 through 29 of that ranking.
df_rules_filtered.sort_values("confidence", ascending=False).iloc[20:30]

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
556,"(Pulp Fiction (1994), Star Wars: Episode V - T...",(Star Wars: Episode IV - A New Hope (1977)),0.227869,0.411475,0.214754,0.942446,2.290407,0.120992,10.225615
84,"(Matrix, The (1999), Star Wars: Episode V - Th...",(Star Wars: Episode IV - A New Hope (1977)),0.283607,0.411475,0.265574,0.936416,2.275752,0.148877,9.255887
230,"(Shawshank Redemption, The (1994), Jurassic Pa...",(Forrest Gump (1994)),0.252459,0.539344,0.236066,0.935065,1.733707,0.099903,7.094098
640,"(Apollo 13 (1995), Jurassic Park (1993))",(Forrest Gump (1994)),0.22623,0.539344,0.211475,0.934783,1.733184,0.08946,7.063388
807,"(Silence of the Lambs, The (1991), Star Wars: ...",(Star Wars: Episode IV - A New Hope (1977)),0.221311,0.411475,0.206557,0.933333,2.26826,0.115493,8.827869
132,(Lord of the Rings: The Fellowship of the Ring...,"(Lord of the Rings: The Two Towers, The (2002))",0.270492,0.308197,0.252459,0.933333,3.028369,0.169094,10.377049
896,"(Lord of the Rings: The Two Towers, The (2002)...",(Lord of the Rings: The Fellowship of the Ring...,0.219672,0.32459,0.204918,0.932836,2.873888,0.133615,10.056102
834,"(Star Wars: Episode IV - A New Hope (1977), Fi...","(Matrix, The (1999))",0.219672,0.455738,0.204918,0.932836,2.04687,0.104805,8.103461
180,"(Fight Club (1999), Forrest Gump (1994))","(Matrix, The (1999))",0.262295,0.455738,0.244262,0.93125,2.04339,0.124725,7.916542
133,(Lord of the Rings: The Fellowship of the Ring...,"(Lord of the Rings: The Return of the King, Th...",0.272131,0.303279,0.252459,0.927711,3.058938,0.169927,9.637978


### Just as a curiosity: how long does it take to run the notebook?

In [102]:
# Timestamp taken at the end of the notebook; compared against start_notebook_time below
end_notebook_time = datetime.now()

In [103]:
# Elapsed wall-clock time between the start and end timestamps
elapsed_notebook_time = end_notebook_time - start_notebook_time
print(f"Time to run entire notebook: {elapsed_notebook_time}")

Time to run entire notebook: 0:03:55.724266
