# Make a Recommendation System that recommends top 10 movies related to the desired movie based on title

## Load Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as py

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA 

from surprise import Reader, Dataset, SVD, Trainset

# Handle Datasets 

In [None]:
# Load the four raw Netflix Prize rating files.
# Every file is parsed with the same two-column schema.

disc = {'customer_id': str, 'rating': np.float16}


def _load_raw(path):
    '''Read one raw rating file using the shared column names and dtypes.'''
    return pd.read_csv(path, names= disc.keys(), dtype= disc)


data_cust_1 = _load_raw('../input/netflix-prize-data/combined_data_1.txt')
data_cust_2 = _load_raw('../input/netflix-prize-data/combined_data_2.txt')
data_cust_3 = _load_raw('../input/netflix-prize-data/combined_data_3.txt')
data_cust_4 = _load_raw('../input/netflix-prize-data/combined_data_4.txt')

In [3]:
def handleDataSet(dataset):
    '''
    Attach a 'movie_id' column to raw rating rows and drop the header rows.

    The input is expected to interleave two kinds of rows:

    * header rows, whose 'rating' is NaN and whose 'customer_id' holds the
      movie id followed by one trailing character (the last character is
      stripped before the id is parsed as an integer);
    * rating rows, which belong to the closest header row above them.

    @dataset : DataFrame with 'customer_id' and 'rating' columns

    @return : a new DataFrame holding only the rating rows, with an extra
              int16 'movie_id' column. The input frame is not modified.

    @raise ValueError : if rating rows appear before the first header row
                        (including the case of no header row at all)
    '''

    # Work with row positions so the result does not depend on the index
    # labels of the incoming frame.
    is_header = dataset['rating'].isna().to_numpy()
    header_pos = np.flatnonzero(is_header)

    # Copy so that adding a column never writes into a view of the input.
    ratings = dataset[~is_header].copy()

    if len(ratings) and (header_pos.size == 0 or header_pos[0] != 0):
        raise ValueError(
            "rating rows found before the first movie header row")

    # Parse the id from each header label, dropping its last character.
    header_labels = dataset['customer_id'].to_numpy()[header_pos]
    movie_ids = np.array([int(label[:-1]) for label in header_labels],
                         dtype= np.int16)

    # Number of rating rows under each header: the gap to the next header
    # (or to the end of the frame), minus the header row itself.
    block_sizes = np.diff(np.append(header_pos, len(dataset))) - 1

    ratings['movie_id'] = np.repeat(movie_ids, block_sizes)
    return ratings

In [6]:
# Replace each raw frame with the result of handleDataSet (defined above).
# Each name is rebound one at a time, so a raw frame is no longer referenced
# by its variable once its processed version has been assigned.
data_cust_1 = handleDataSet(data_cust_1)
data_cust_2 = handleDataSet(data_cust_2)
data_cust_3 = handleDataSet(data_cust_3)
data_cust_4 = handleDataSet(data_cust_4)

In [7]:
# Stack the four processed frames on top of each other (row-wise).
# Each frame keeps its own index labels in the combined result.
data = pd.concat(
    objs=[data_cust_1, data_cust_2, data_cust_3, data_cust_4],
    axis='index',
)

In [None]:
# Put the columns in (customer_id, movie_id, rating) order.
# Columns are selected by label here: `.iloc` only accepts integer positions,
# so indexing it with a column name raises TypeError.
data = data[['customer_id', 'movie_id', 'rating']]

In [None]:
# Persist the combined ratings table; the index is not written to the file.
data.to_csv(path_or_buf='./customers_rating.csv', index=False)

# Make Recommendations

## Load Data

In [4]:
# Load the movie titles file.
# 'year' is parsed but discarded right away; the frame keeps its default
# integer index and 'movie_id' stays a regular column.
disc = {'movie_id': np.int16, 'year': np.float16, 'title': str}
data_movies_titles = pd.read_csv(
    '../input/netflixmoviescustomers-rating/movie_titles.csv',
    names=disc.keys(),
    dtype=disc,
    encoding='ISO-8859-1',
).drop(columns=['year'])

In [6]:
# Load the pre-processed ratings table with compact integer dtypes.
disc = {
    'customer_id': np.int32,
    'rating': np.int8,
    'movie_id': np.int16,
}
data = pd.read_csv(
    filepath_or_buffer='../input/netflixmoviescustomers-rating/customers_rating.csv',
    dtype=disc,
)

In [7]:
# The dtype mapping was only needed while reading the CSV files.
del disc

In [8]:
# Preview the first rows of the titles table.
data_movies_titles.head()

Unnamed: 0,movie_id,title
0,1,Dinosaur Planet
1,2,Isle of Man TT 2004 Review
2,3,Character
3,4,Paula Abdul's Get Up & Dance
4,5,The Rise and Fall of ECW


In [9]:
# Preview the first rows of the ratings table.
data.head()

Unnamed: 0,customer_id,movie_id,rating
0,1488844,1,3
1,822109,1,5
2,885013,1,4
3,30878,1,4
4,823519,1,3


In [10]:
class Recommendation :
    '''
    Movie recommendation system.

    Offers three kinds of recommendations:

    * titles similar to a given title (cosine similarity of vectorized titles);
    * globally popular movies (weighted rating);
    * per-customer recommendations, either from a fitted SVD model or from
      title similarity combined with the customer's own ratings.

    @movies_titles_set : either a sequence/Series of titles, or a DataFrame
                         with 'movie_id' and 'title' columns. Only the
                         DataFrame form enables the methods that need movie
                         ids (top_movies and the two per-customer methods).
    @customer_rating_set : DataFrame with 'customer_id', 'movie_id' and
                           'rating' columns
    '''

    # How many most-similar titles are averaged when scoring an unseen movie.
    _NEIGHBOURS = 5

    def __init__(self, movies_titles_set, customer_rating_set):
        self.__titles_set = movies_titles_set
        self.__rating_set = customer_rating_set
        self.__movie_ids = None
        self.__what = None
        self.__vectories = None
        self.__similars = None
        self.__svd = None

        self.__handleDataSet()

    def __handleDataSet(self):
        '''
        Normalize the titles: drop duplicates, lower-case, renumber from 0.

        After this call `__titles_set` is a Series of lower-cased titles with
        a 0..n-1 index, and `__movie_ids` (when available) is a Series with
        the same index. The caller's object is never modified.
        '''
        raw = self.__titles_set

        if isinstance(raw, pd.DataFrame):
            missing = [name for name in ('movie_id', 'title')
                       if name not in raw.columns]
            if missing:
                raise ValueError(
                    "titles DataFrame is missing columns: %s" % missing)
            # Only fully identical rows are dropped, so every movie id
            # keeps its own row even when two movies share a title.
            frame = raw[['movie_id', 'title']].drop_duplicates()
            self.__movie_ids = frame['movie_id'].reset_index(drop= True)
            titles = frame['title']
        else:
            titles = pd.Series(raw).drop_duplicates()
            self.__movie_ids = None

        self.__titles_set = titles.str.lower().reset_index(drop= True)

        # Anything fitted on the previous titles no longer matches the rows.
        self.__vectories = None
        self.__similars = None

    def __requireMovieIds(self):
        '''Return the movie ids aligned with the titles, or raise.'''
        if self.__movie_ids is None:
            raise ValueError(
                "movie ids are unknown: build Recommendation from a "
                "DataFrame with 'movie_id' and 'title' columns")
        return self.__movie_ids

    def __requireSimilarity(self):
        '''Return the fitted similarity matrix, or raise.'''
        if self.__similars is None:
            raise RuntimeError("call fit_similarity() first")
        return self.__similars

    def __nearest(self, row, position, count):
        '''
        Positions of the `count` most similar titles, excluding `position`.

        Ties keep ascending position order.
        '''
        order = np.argsort(-row, kind= 'stable')
        return order[order != position][: max(count, 0)]

    def changeDataSet(self, movies_titles_set):
        '''
        Replace the titles data set.

        Any previously fitted similarity is discarded and must be refitted.

        @movies_titles_set : same forms as accepted by the constructor
        '''
        self.__titles_set = movies_titles_set
        self.__handleDataSet()

    def __vectorize(self):
        '''
        Apply TF-IDF or Count Vectorization to the titles.

        @raise ValueError : if the selected method is unknown
        '''

        if self.__what == 'tf_idf':
            vectorizer = TfidfVectorizer(stop_words='english')
        elif self.__what == 'count':
            vectorizer = CountVectorizer(stop_words='english')
        else:
            raise ValueError("Must one of [tf_idf, count]")

        return vectorizer.fit_transform(self.__titles_set)

    def fit_similarity(self, what= 'tf_idf'):
        '''
        Fit vectorization and calculate title-to-title cosine similarity.

        @what : vectorization method, 'tf_idf' or 'count'
        '''

        self.__what = what
        self.__vectories = self.__vectorize()
        self.__similars = cosine_similarity(
            self.__vectories, self.__vectories)

    def fit_svd(self, what= 'tf_idf'):
        '''
        Fit vectorization and fit SVD on the customer ratings.

        @what : vectorization method, 'tf_idf' or 'count'
        '''

        self.__what = what
        # The SVD model does not use the vectors; vectorization is kept so
        # the method still validates `what` as it did before.
        self.__vectories = self.__vectorize()

        # Pass the columns in an explicit (user, item, rating) order
        # instead of relying on the column order of the frame.
        ratings = self.__rating_set[['customer_id', 'movie_id', 'rating']]
        trainset = Dataset.load_from_df(ratings, Reader()) \
                          .build_full_trainset()

        self.__svd = SVD()
        self.__svd.fit(trainset)

    def top_movies(self, top= 10):
        '''
        Get top-k movies based on popularity.

        Each movie gets the weighted rating
            v / (v + m) * R  +  m / (v + m) * C
        where v is its number of votes, R its mean rating, m the 0.9
        quantile of the vote counts and C the mean of all movie means.

        @top : number of movies to return

        @return : tuple (titles, scores), best movie first; titles is a
                  Series and scores a list of the same length
        '''

        movie_ids = self.__requireMovieIds()

        groups = self.__rating_set.groupby('movie_id')
        votes = groups['customer_id'].count()
        mean_rating = groups['rating'].mean()
        threshold = votes.quantile(0.9)
        overall_mean = mean_rating.mean()

        weighted = (votes / (votes + threshold) * mean_rating) \
                 + (threshold / (threshold + votes) * overall_mean)

        # Map each movie id to the row position of its title.
        positions = pd.Series(np.arange(len(movie_ids)),
                              index= movie_ids.to_numpy())
        positions = positions[~positions.index.duplicated()]

        # Only movies with a known title can be returned.
        weighted = weighted[weighted.index.isin(positions.index)]
        best = weighted.nlargest(max(top, 0))

        rows = positions.loc[best.index].to_numpy()
        return (self.__titles_set.iloc[rows], best.tolist())

    def recommend_movies_customer_svd(self, customer_id, k= 10):
        '''
        Recommend k movies to a customer using the fitted SVD model.

        The model estimates a rating for every movie the customer has not
        rated; the k highest estimates are returned.

        @customer_id : ID of customer
        @k : number of movies to return

        @return : tuple (titles, ratings), best movie first

        @raise RuntimeError : if fit_svd() has not been called
        '''

        if self.__svd is None:
            raise RuntimeError("call fit_svd() first")
        movie_ids = self.__requireMovieIds()

        watched = self.__rating_set['movie_id'] \
                  [self.__rating_set['customer_id'] == customer_id]
        unseen = movie_ids[~ movie_ids.isin(watched)]

        estimates = pd.Series(
            [self.__svd.predict(customer_id, movie).est for movie in unseen],
            index= unseen.index, dtype= float)
        best = estimates.nlargest(max(k, 0))

        # Titles and movie ids share the same 0..n-1 index.
        return (self.__titles_set.loc[best.index], best.tolist())

    def recommend_movies_customer_similarity(self, customer_id, k= 10):
        '''
        Recommend k movies to a customer using title similarity.

        For every movie the customer has not rated, the most similar titles
        are looked up and the customer's ratings of them are averaged,
        weighted by similarity. Neighbours the customer has not rated
        contribute 0.

        @customer_id : ID of customer
        @k : number of movies to return

        @return : tuple (titles, scores), best movie first

        @raise RuntimeError : if fit_similarity() has not been called
        '''

        similars = self.__requireSimilarity()
        movie_ids = self.__requireMovieIds()

        own_rows = self.__rating_set \
                   [self.__rating_set['customer_id'] == customer_id]
        # One rating per movie, even if the customer rated it twice.
        rating_by_movie = own_rows.groupby('movie_id')['rating'].mean()

        # The customer's rating for each title row, 0 where unrated.
        customer_ratings = rating_by_movie.reindex(movie_ids.to_numpy()) \
                                          .fillna(0).to_numpy(dtype= float)
        rated = movie_ids.isin(rating_by_movie.index).to_numpy()
        candidates = np.flatnonzero(~ rated)

        scores = []
        for position in candidates:
            row = np.asarray(similars[position], dtype= float)
            nearest = self.__nearest(row, position, self._NEIGHBOURS)
            if nearest.size == 0:
                scores.append(0.0)
                continue
            weighted_sum = (customer_ratings[nearest] * row[nearest]).sum()
            scores.append(float(weighted_sum / nearest.size))

        estimates = pd.Series(scores, index= candidates, dtype= float)
        best = estimates.nlargest(max(k, 0))

        return (self.__titles_set.iloc[best.index.to_numpy()], best.tolist())

    def recommend_similar_movies(self, movie_title, top= 10):
        '''
        Recommend the titles most similar to a given title.

        @movie_title : title to look up (matched case-insensitively)
        @top : number of related titles to return

        @return : tuple (titles, scores), most similar first; the queried
                  title itself is never included

        @raise IndexError : if the title is not in the data set
        @raise RuntimeError : if fit_similarity() has not been called
        '''

        wanted = str(movie_title).lower()
        matches = np.flatnonzero((self.__titles_set == wanted).to_numpy())
        if matches.size == 0:
            raise IndexError("This text doesn't found in the dataset")
        index = int(matches[0])

        similars = self.__requireSimilarity()
        row = np.asarray(similars[index], dtype= float)

        # Drop the queried row explicitly instead of assuming it sorts first.
        nearest = self.__nearest(row, index, top)

        return (self.__titles_set.iloc[nearest], row[nearest].tolist())

In [11]:
# Build the recommender from the title column and the ratings table.
recom = Recommendation(
    movies_titles_set=data_movies_titles['title'],
    customer_rating_set=data,
)

In [14]:
# Fit title similarity with the default TF-IDF vectorization, then ask for
# the ten titles closest to 'Dinosaur Planet'.
recom.fit_similarity(what='tf_idf')
movies, scores = recom.recommend_similar_movies('Dinosaur Planet', top=10)

In [15]:
# Show the recommended titles followed by their similarity scores.
print(movies, scores, sep='\n')

4076                      dinosaur
13346    what planet are you from?
4670         the man from planet x
14874                   red planet
5982                   dark planet
6856                 living planet
2737                  alien planet
1338                strange planet
10498               phantom planet
9020               treasure planet
Name: title, dtype: object
[0.7852837294136404, 0.61913606284742, 0.49258768087133376, 0.4649920668420548, 0.46055290970905044, 0.431879790365031, 0.42579865933411104, 0.4153782710893785, 0.4103908414805759, 0.4044617095439022]
