In [170]:
import numpy as np
import pandas

class popularity_recommender_py():
    """Popularity-based recommender.

    Items are scored by the number of non-null ``user_id`` entries recorded
    for them in the training data; every user receives the same top items.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.popularity_recommendations = None

    def create(self, train_data, user_id, item_id, top_n=10):
        """Score every item and cache the ``top_n`` most popular ones.

        Parameters
        ----------
        train_data : pandas.DataFrame
            Interaction table holding at least the two columns named below.
        user_id : str
            Name of the column identifying the user.
        item_id : str
            Name of the column identifying the item.
        top_n : int, optional
            Number of items kept in ``self.popularity_recommendations``
            (default 10, the value that used to be hard-coded).
        """
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

        # One row per item with the count of user entries for that item.
        train_data_grouped = (
            train_data.groupby([self.item_id])
            .agg({self.user_id: 'count'})
            .reset_index()
        )

        # Preview kept from the original implementation; it is printed before
        # the rename so the count column still shows under the user column name.
        print(train_data_grouped.head(5))

        # Rename the count column using the *configured* user column name.
        # The previous code renamed the literal 'user_id', which was a no-op
        # for any other column name and raised KeyError on the follow-up
        # lookup when the column really was called 'user_id'.
        train_data_grouped = train_data_grouped.rename(columns={self.user_id: 'score'})

        # Highest score first; ties are ordered by item so the result is deterministic.
        train_data_sort = train_data_grouped.sort_values(
            ['score', self.item_id], ascending=[False, True]
        )

        # Generate a recommendation rank based upon score (1.0 = most popular).
        train_data_sort['Rank'] = train_data_sort['score'].rank(ascending=False, method='first')

        # Keep an independent copy of the top rows so later use never touches
        # a slice of the full ranking.
        self.popularity_recommendations = train_data_sort.head(top_n).copy()

    def recommend(self, user_id):
        """Return the cached popular items labelled with ``user_id``.

        The result has a leading ``'user_id'`` column followed by the item
        column, ``'score'`` and ``'Rank'``. The cached recommendations are not
        modified, so repeated calls are independent of each other.

        Raises
        ------
        RuntimeError
            If ``create`` has not been called yet.
        """
        if self.popularity_recommendations is None:
            raise RuntimeError("create() must be called before recommend()")

        # Work on a copy: the original wrote the user column into the shared frame.
        user_recommendations = self.popularity_recommendations.copy()

        # Add user_id column for which the recommendations are being generated
        user_recommendations['user_id'] = user_id

        # Bring user_id column to the front
        cols = ['user_id'] + [c for c in user_recommendations.columns if c != 'user_id']
        return user_recommendations[cols]
    

#Class for Item similarity based Recommender System model
class item_similarity_recommender_py():
    """Item-similarity recommender based on overlap of item audiences.

    Two items are considered similar in proportion to the Jaccard index of the
    sets of users recorded for each of them: ``|A & B| / |A | B|``.
    """

    def __init__(self):
        self.train_data = None
        self.user_id = None
        self.item_id = None
        self.cooccurence_matrix = None
        self.m_dict = None
        self.rev_m_dict = None
        self.item_similarity_recommendations = None

    def get_user_items(self, user):
        """Return the list of unique items recorded for ``user``."""
        user_data = self.train_data[self.train_data[self.user_id] == user]
        return list(user_data[self.item_id].unique())

    def get_item_users(self, item):
        """Return the set of unique users recorded for ``item``."""
        item_data = self.train_data[self.train_data[self.item_id] == item]
        return set(item_data[self.user_id].unique())

    def get_all_items_train_data(self):
        """Return the list of unique items present in the training data."""
        return list(self.train_data[self.item_id].unique())

    def _item_users_index(self):
        """Build ``{item: set(users)}`` for all items in one grouped pass."""
        grouped = self.train_data.groupby(self.item_id, sort=False)[self.user_id]
        return {item: set(users.unique()) for item, users in grouped}

    def construct_cooccurence_matrix(self, user_m, all_m):
        """Compute Jaccard similarities between seed items and all items.

        Parameters
        ----------
        user_m : list
            Seed items (rows of the result).
        all_m : list
            Candidate items (columns of the result).

        Returns
        -------
        numpy.matrix
            Matrix of shape ``(len(user_m), len(all_m))``; entry ``[j, i]`` is
            the Jaccard index of the user sets of ``user_m[j]`` and
            ``all_m[i]``, or 0 when they share no user.
        """
        # Group the training data once instead of filtering the whole frame
        # again for every single item.
        item_users = self._item_users_index()
        no_users = set()
        seed_users = [item_users.get(item, no_users) for item in user_m]

        scores = np.zeros(shape=(len(user_m), len(all_m)), dtype=float)

        for i, candidate in enumerate(all_m):
            users_i = item_users.get(candidate, no_users)
            if not users_i:
                # Nothing can intersect an empty audience; the column stays 0.
                continue
            for j, users_j in enumerate(seed_users):
                shared = len(users_i & users_j)
                if shared:
                    # |A | B| = |A| + |B| - |A & B|, avoids building the union set.
                    union_size = len(users_i) + len(users_j) - shared
                    scores[j, i] = float(shared) / float(union_size)

        # np.matrix is kept as the return type for existing callers.
        return np.matrix(scores, float)

    def generate_top_recommendations(self, user, cooccurence_matrix, all_m, user_m, top_n=10):
        """Rank candidate items by their mean similarity to the seed items.

        Parameters
        ----------
        user : object
            Value written to the ``'user_id'`` column of the result.
        cooccurence_matrix : array-like
            Similarities of shape ``(len(user_m), len(all_m))``.
        all_m : list
            Candidate items, aligned with the matrix columns.
        user_m : list
            Seed items; these are excluded from the result.
        top_n : int, optional
            Maximum number of recommendations (default 10).

        Returns
        -------
        pandas.DataFrame or int
            Frame with columns ``user_id, movie, score, rank``, or ``-1`` when
            no recommendation could be produced.
        """
        print("Non zero values in cooccurence_matrix :%d" % np.count_nonzero(cooccurence_matrix))

        similarity = np.asarray(cooccurence_matrix, dtype=float)
        columns = ['user_id', 'movie', 'score', 'rank']
        rows = []

        # With no seed items the mean is undefined; skip it explicitly rather
        # than relying on a 0/0 division to produce NaN scores.
        if similarity.shape[0] > 0:
            # Average similarity of every candidate over all seed items.
            user_sim_scores = (similarity.sum(axis=0) / float(similarity.shape[0])).tolist()

            # Highest score first; ties resolve to the larger column index.
            sort_index = sorted(((e, i) for i, e in enumerate(user_sim_scores)), reverse=True)

            # Fill the rows with the top item based recommendations
            for score, item_pos in sort_index:
                if len(rows) >= top_n:
                    break  # no need to scan the remaining candidates
                candidate = all_m[item_pos]
                if np.isnan(score) or candidate in user_m:
                    continue
                rows.append([user, candidate, score, len(rows) + 1])

        # Build the frame once instead of appending row by row.
        df = pandas.DataFrame(rows, columns=columns)

        # Handle the case where there are no recommendations
        if df.shape[0] == 0:
            print("The current user has no movies for training the item similarity based recommendation model.")
            return -1
        return df

    # Create the item similarity based recommender system model
    def create(self, train_data, user_id, item_id):
        """Store the training frame and the names of its user and item columns."""
        self.train_data = train_data
        self.user_id = user_id
        self.item_id = item_id

    # Use the item similarity based recommender system model to
    # make recommendations
    def recommend(self, user, top_n=10):
        """Recommend up to ``top_n`` items for ``user`` based on their own items.

        Returns the frame produced by ``generate_top_recommendations`` or
        ``-1`` when nothing can be recommended.
        """
        user_m = self.get_user_items(user)
        print("No. of unique movies for the user: %d" % len(user_m))

        all_m = self.get_all_items_train_data()
        print("no. of unique movies in the training set: %d" % len(all_m))

        cooccurence_matrix = self.construct_cooccurence_matrix(user_m, all_m)
        return self.generate_top_recommendations(user, cooccurence_matrix, all_m, user_m, top_n)

    # Get similar items to given items
    def get_similar_items(self, item_list, top_n=10):
        """Return up to ``top_n`` items most similar to those in ``item_list``.

        The ``'user_id'`` column of the result is filled with an empty string.
        """
        user_m = item_list

        all_m = self.get_all_items_train_data()
        print("no. of unique movies in the training set: %d" % len(all_m))

        cooccurence_matrix = self.construct_cooccurence_matrix(user_m, all_m)

        user = ""
        return self.generate_top_recommendations(user, cooccurence_matrix, all_m, user_m, top_n)

In [171]:
# Load the ratings table from 'ratings.csv' and preview its first rows.
# NOTE(review): this import sits mid-notebook; the top cell already imports
# pandas (without the `pd` alias) — consider consolidating imports there.
import pandas as pd
m_df_1 = pd.read_csv('ratings.csv')
m_df_1.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [172]:
# Load the movie metadata from 'movies.csv' and preview its first rows.
m_df_2 = pd.read_csv('movies.csv')
m_df_2.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [173]:
# Combine both tables: left-join every rating with one metadata row per
# movieId, then discard rows that contain any missing value.
m_df = (
    m_df_1
    .merge(m_df_2.drop_duplicates(['movieId']), on='movieId', how='left')
    .dropna()
)
m_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [174]:
# Count the ratings recorded for each title; result has columns (title, rating).
m_grouped = m_df.groupby('title')['rating'].count().reset_index()
m_grouped.head()

Unnamed: 0,title,rating
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [175]:
# Express each title's rating count as a percentage of all ratings, then show
# the titles from most to least rated (ties in alphabetical order).
grouped_sum = m_grouped['rating'].sum()
m_grouped['percentage'] = m_grouped['rating'].div(grouped_sum).mul(100)
m_grouped.sort_values(by=['rating', 'title'], ascending=[False, True])

Unnamed: 0,title,rating,percentage
3158,Forrest Gump (1994),329,0.326272
7593,"Shawshank Redemption, The (1994)",317,0.314372
6865,Pulp Fiction (1994),307,0.304455
7680,"Silence of the Lambs, The (1991)",279,0.276687
5512,"Matrix, The (1999)",278,0.275695
...,...,...,...
9705,Zoom (2006),1,0.000992
9706,Zoom (2015),1,0.000992
9709,Zulu (2013),1,0.000992
9713,anohana: The Flower We Saw That Day - The Movi...,1,0.000992


Popularity Recommendation Engine

In [176]:
# Instantiate the popularity-based recommender defined in the first cell.
pr = popularity_recommender_py()

In [177]:
# Fit on the merged frame: 'userId' is the user column, 'title' the item column.
pr.create(m_df, 'userId', 'title')

                                     title  userId
0                               '71 (2014)       1
1  'Hellboy': The Seeds of Creation (2004)       1
2                   'Round Midnight (1986)       2
3                      'Salem's Lot (2004)       1
4                'Til There Was You (1997)       2


In [178]:
# Popular titles labelled with the userId found at index label 5 of m_df.
# NOTE(review): `[5]` is a label lookup and m_df went through dropna() without
# an index reset — confirm label 5 survives, or use `.iloc[5]` if the sixth
# row is what is meant.
pr.recommend(m_df['userId'][5])

Unnamed: 0,user_id,title,score,Rank
3158,1,Forrest Gump (1994),329,1.0
7593,1,"Shawshank Redemption, The (1994)",317,2.0
6865,1,Pulp Fiction (1994),307,3.0
7680,1,"Silence of the Lambs, The (1991)",279,4.0
5512,1,"Matrix, The (1999)",278,5.0
8001,1,Star Wars: Episode IV - A New Hope (1977),251,6.0
4662,1,Jurassic Park (1993),238,7.0
1337,1,Braveheart (1995),237,8.0
8363,1,Terminator 2: Judgment Day (1991),224,9.0
7421,1,Schindler's List (1993),220,10.0


Item Similarity Recommendation

In [179]:
# Instantiate the item-similarity recommender and register the merged frame,
# with 'userId' as the user column and 'title' as the item column.
ir = item_similarity_recommender_py()
ir.create(m_df, 'userId', 'title')

In [180]:
# Unique titles recorded for the userId found at index label 5 of m_df.
user_items = ir.get_user_items(m_df['userId'][5])

In [181]:
# Print the user's titles, one per line.
for user_item in user_items:
    print(user_item)

Toy Story (1995)
Grumpier Old Men (1995)
Heat (1995)
Seven (a.k.a. Se7en) (1995)
Usual Suspects, The (1995)
From Dusk Till Dawn (1996)
Bottle Rocket (1996)
Braveheart (1995)
Rob Roy (1995)
Canadian Bacon (1995)
Desperado (1995)
Billy Madison (1995)
Clerks (1994)
Dumb & Dumber (Dumb and Dumber) (1994)
Ed Wood (1994)
Star Wars: Episode IV - A New Hope (1977)
Pulp Fiction (1994)
Stargate (1994)
Tommy Boy (1995)
Clear and Present Danger (1994)
Forrest Gump (1994)
Jungle Book, The (1994)
Mask, The (1994)
Blown Away (1994)
Dazed and Confused (1993)
Fugitive, The (1993)
Jurassic Park (1993)
Mrs. Doubtfire (1993)
Schindler's List (1993)
So I Married an Axe Murderer (1993)
Three Musketeers, The (1993)
Tombstone (1993)
Dances with Wolves (1990)
Batman (1989)
Silence of the Lambs, The (1991)
Pinocchio (1940)
Fargo (1996)
Mission: Impossible (1996)
James and the Giant Peach (1996)
Space Jam (1996)
Rock, The (1996)
Twister (1996)
Independence Day (a.k.a. ID4) (1996)
She's the One (1996)
Wizard of O

In [182]:
# Item-similarity recommendations for the userId found at index label 5 of m_df.
ir.recommend(m_df['userId'][5])

No. of unique movies for the user: 232
no. of unique movies in the training set: 9719
Non zero values in cooccurence_matrix :1471671


Unnamed: 0,user_id,movie,score,rank
0,1,Ferris Bueller's Day Off (1986),0.203002,1
1,1,Mars Attacks! (1996),0.195291,2
2,1,Back to the Future Part II (1989),0.194719,3
3,1,Die Hard (1988),0.190291,4
4,1,"Breakfast Club, The (1985)",0.189557,5
5,1,"Fifth Element, The (1997)",0.189308,6
6,1,Aliens (1986),0.185961,7
7,1,Jaws (1975),0.185371,8
8,1,Unbreakable (2000),0.184817,9
9,1,Austin Powers: The Spy Who Shagged Me (1999),0.184603,10


In [183]:
# Titles most similar to the two given seed titles (no user involved).
ir.get_similar_items(['Ferris Bueller\'s Day Off (1986)', 'Mars Attacks! (1996)'])

no. of unique movies in the training set: 9719
Non zero values in cooccurence_matrix :15343


Unnamed: 0,user_id,movie,score,rank
0,,Indiana Jones and the Temple of Doom (1984),0.416207,1
1,,"Breakfast Club, The (1985)",0.391935,2
2,,Monty Python and the Holy Grail (1975),0.377393,3
3,,Total Recall (1990),0.37627,4
4,,Groundhog Day (1993),0.36615,5
5,,Starship Troopers (1997),0.362972,6
6,,Back to the Future Part II (1989),0.361658,7
7,,RoboCop (1987),0.359392,8
8,,Ghostbusters (a.k.a. Ghost Busters) (1984),0.356585,9
9,,"Terminator, The (1984)",0.356091,10
