In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Get the appropriate number of components that can capture 
# the target variance in the data
def get_n_components(matrix, goal_variance):
    """Return how many leading SVD components reach a target explained variance.

    A ``TruncatedSVD`` with ``matrix.shape[1] - 1`` components is fitted on
    ``matrix`` and its explained-variance ratios are accumulated in order
    until the running total reaches ``goal_variance``.

    Parameters
    ----------
    matrix : array-like with a ``shape`` attribute
        Data to decompose; the number of columns bounds the components tried.
    goal_variance : float
        Target cumulative explained-variance ratio (for example ``0.95``).

    Returns
    -------
    int
        Count of leading components whose ratios sum to at least
        ``goal_variance``. If the target is never reached, the number of
        fitted components is returned.
    """
    svd = TruncatedSVD(n_components=matrix.shape[1] - 1, random_state=20)
    # Only the fitted variance ratios are needed; the transformed matrix
    # is not used, so it is not kept.
    svd.fit(matrix)

    cumulative_variance = np.cumsum(svd.explained_variance_ratio_)
    reached = np.flatnonzero(cumulative_variance >= goal_variance)
    if reached.size:
        # Positions are zero-based, component counts are one-based.
        return int(reached[0]) + 1

    # Target never reached: fall back to every fitted component.
    return int(cumulative_variance.size)

In [3]:
# Get movies recommendations for a movie using collaborative filtering
def get_c_f_ratings_recommendation(movieId, goal_variance=0.95):
    """Recommend movies for ``movieId`` using collaborative filtering on ratings.

    Reads ``ratings.csv`` (columns ``userId``, ``movieId``, ``rating``), builds
    a movie-by-user rating matrix with missing ratings set to 0, reduces it
    with truncated SVD and correlates the reduced row of ``movieId`` with the
    reduced row of every other movie.

    Parameters
    ----------
    movieId : hashable
        Identifier of the movie to find recommendations for.
    goal_variance : float, default 0.95
        Target explained variance passed to ``get_n_components`` to choose
        the number of SVD components.

    Returns
    -------
    pandas.DataFrame
        Columns ``correlation`` and ``movieId``, sorted by descending
        correlation, with the queried movie itself removed. An empty
        DataFrame is returned when ``movieId`` has no ratings.
    """
    df_ratings = pd.read_csv('ratings.csv')

    # Create a movie-user rating matrix; unrated cells become 0.
    rating_matrix = df_ratings.pivot_table(values='rating', index='movieId',
                                           columns='userId', fill_value=0)

    # Look the movie up before the expensive decomposition so that an
    # unknown id returns immediately.
    try:
        index = rating_matrix.index.get_loc(movieId)
    except (KeyError, TypeError, pd.errors.InvalidIndexError):
        # Return no recommendations for an invalid movie.
        return pd.DataFrame()

    # Apply truncated SVD on the movie-user matrix with the target explained variance.
    svd = TruncatedSVD(n_components=get_n_components(rating_matrix, goal_variance),
                       random_state=20)
    decomposed_rating_matrix = svd.fit_transform(rating_matrix)

    # Rows with zero variance produce NaN correlations; they are mapped to 0.
    with np.errstate(invalid='ignore'):
        correlation_rating_matrix = np.nan_to_num(np.corrcoef(decomposed_rating_matrix))

    rating_recommendations = pd.DataFrame({'correlation': correlation_rating_matrix[index],
                                           'movieId': rating_matrix.index})
    # Remove the queried movie by position rather than assuming it sorts
    # first: other movies can tie with it at the top correlation.
    rating_recommendations = rating_recommendations.drop(rating_recommendations.index[index])

    return rating_recommendations.sort_values('correlation', ascending=False)

In [4]:
def get_c_b_genres_recommendations(movieId):
    """Recommend movies for ``movieId`` by genre similarity (content-based).

    Reads ``movies.csv`` (columns ``movieId`` and ``genres`` are used), turns
    each ``|``-separated genre string into a bag of lower-case words and
    scores every movie by cosine similarity to the queried movie.

    Parameters
    ----------
    movieId : scalar
        Identifier of the movie to find recommendations for.

    Returns
    -------
    pandas.DataFrame
        Columns ``correlation`` (cosine similarity) and ``movieId``, sorted by
        descending similarity, with the queried movie itself removed. An
        empty DataFrame is returned when ``movieId`` is not in ``movies.csv``.
    """
    df_movies = pd.read_csv('movies.csv')

    # Locate the movie first so an unknown id skips the vectorisation work.
    matches = np.flatnonzero((df_movies['movieId'] == movieId).to_numpy())
    if matches.size == 0:
        return pd.DataFrame()
    row = int(matches[0])

    # Get movie genres, set to lower case and separate with spaces.
    genres = (df_movies['genres'].fillna("").astype(str)
              .str.lower()
              .str.replace("|", " ", regex=False))

    # Score only the queried row against all movies instead of building
    # the full movie-by-movie similarity matrix.
    cv_matrix = CountVectorizer().fit_transform(genres)
    similarity = cosine_similarity(cv_matrix[row:row + 1], cv_matrix)[0]

    genre_recommendations = pd.DataFrame({'correlation': similarity,
                                          'movieId': df_movies['movieId']})
    # Remove the queried movie by position: movies with identical genres
    # tie at the top similarity, so it is not guaranteed to sort first.
    genre_recommendations = genre_recommendations.drop(genre_recommendations.index[row])

    return genre_recommendations.sort_values('correlation', ascending=False)

In [5]:
def get_c_b_tags_recommendations(movieId):
    """Recommend movies for ``movieId`` by user-tag similarity (content-based).

    Reads ``tags.csv`` (columns ``movieId`` and ``tag`` are used), joins all
    tags of a movie into one lower-case, space-separated document and scores
    every tagged movie by cosine similarity to the queried movie.

    Parameters
    ----------
    movieId : scalar
        Identifier of the movie to find recommendations for.

    Returns
    -------
    pandas.DataFrame
        Columns ``correlation`` (cosine similarity) and ``movieId``, sorted by
        descending similarity, with the queried movie itself removed. An
        empty DataFrame is returned when ``movieId`` has no tags.
    """
    df_tags = pd.read_csv('tags.csv')

    # Missing tags cannot be joined as text, so drop them up front.
    tagged = df_tags.dropna(subset=['tag'])

    # Combine movie tags, set to lower case and separate with spaces.
    tag_docs = (tagged.assign(tag=tagged['tag'].astype(str).str.lower())
                .groupby('movieId')['tag']
                .agg(' '.join)
                .reset_index())

    # Locate the movie first so an untagged id skips the vectorisation work.
    matches = np.flatnonzero((tag_docs['movieId'] == movieId).to_numpy())
    if matches.size == 0:
        return pd.DataFrame()
    row = int(matches[0])

    # Score only the queried row against all tagged movies instead of
    # building the full movie-by-movie similarity matrix.
    cv_matrix = CountVectorizer().fit_transform(tag_docs['tag'])
    similarity = cosine_similarity(cv_matrix[row:row + 1], cv_matrix)[0]

    tag_recommendations = pd.DataFrame({'correlation': similarity,
                                        'movieId': tag_docs['movieId']})
    # Remove the queried movie by position: movies with identical tags
    # tie at the top similarity, so it is not guaranteed to sort first.
    tag_recommendations = tag_recommendations.drop(tag_recommendations.index[row])

    return tag_recommendations.sort_values('correlation', ascending=False)

In [6]:
def get_hybrid_recommendations(movieId, top_n=10):
    """Blend rating-, genre- and tag-based recommendations for ``movieId``.

    Each recommender's ``correlation`` scores are scaled by a fixed weight
    (ratings 0.4, genres 0.3, tags 0.3), the scaled scores are summed per
    movie, and the highest totals are joined with ``movies.csv``. A
    recommender that returns an empty frame is skipped; the remaining
    weights are not rescaled.

    Parameters
    ----------
    movieId : scalar
        Identifier of the movie to find recommendations for.
    top_n : int, default 10
        Number of highest-scoring movies to keep before joining movie details.

    Returns
    -------
    pandas.DataFrame or str
        The top movies (``movieId``, summed ``correlation`` and the remaining
        ``movies.csv`` columns), best first. The literal string
        ``"No recommendations!"`` is returned when every recommender comes
        back empty.
    """
    weighted_recommenders = (
        (get_c_f_ratings_recommendation, 0.4),
        (get_c_b_genres_recommendations, 0.3),
        (get_c_b_tags_recommendations, 0.3),
    )

    valid_recommendations = []
    for recommend, weight in weighted_recommenders:
        scores = recommend(movieId)
        if scores.empty:
            continue
        # ``assign`` builds a new frame, leaving the recommender's own
        # result untouched instead of writing into a slice.
        valid_recommendations.append(
            scores.assign(correlation=weight * scores['correlation']))

    if not valid_recommendations:
        return "No recommendations!"

    # Combine all recommendations and get the first ``top_n``.
    combined = pd.concat(valid_recommendations).groupby('movieId').sum().reset_index()
    combined = combined.sort_values('correlation', ascending=False).head(top_n)

    df_movies = pd.read_csv('movies.csv')
    full_recommendations = combined.merge(df_movies, on='movieId')

    return full_recommendations

In [7]:
# Recommendations for "Toy Story" (movieId 1): print the ten best hybrid matches.
print(get_hybrid_recommendations(1))

   movieId  correlation                                   title  \
0     2355     0.733255                    Bug's Life, A (1998)   
1     3114     0.698457                      Toy Story 2 (1999)   
2     4886     0.506594                   Monsters, Inc. (2001)   
3     4306     0.488734                            Shrek (2001)   
4     6377     0.465124                     Finding Nemo (2003)   
5      588     0.454101                          Aladdin (1992)   
6     2294     0.447401                             Antz (1998)   
7    78499     0.447337                      Toy Story 3 (2010)   
8     1136     0.435881  Monty Python and the Holy Grail (1975)   
9     8961     0.430779                 Incredibles, The (2004)   

                                              genres  
0                Adventure|Animation|Children|Comedy  
1        Adventure|Animation|Children|Comedy|Fantasy  
2        Adventure|Animation|Children|Comedy|Fantasy  
3  Adventure|Animation|Children|Comedy|Fan

In [8]:
# Recommendations for "Usual Suspects, The" (movieId 50): print the ten best hybrid matches.
print(get_hybrid_recommendations(50))

   movieId  correlation                             title  \
0     4226     0.588825                    Memento (2000)   
1       47     0.581239       Seven (a.k.a. Se7en) (1995)   
2     1089     0.566560             Reservoir Dogs (1992)   
3     4963     0.531041             Ocean's Eleven (2001)   
4     1625     0.528318                  Game, The (1997)   
5    44665     0.499042        Lucky Number Slevin (2006)   
6      628     0.492600                Primal Fear (1996)   
7    48516     0.488107              Departed, The (2006)   
8     2959     0.480129                 Fight Club (1999)   
9      593     0.474149  Silence of the Lambs, The (1991)   

                         genres  
0              Mystery|Thriller  
1              Mystery|Thriller  
2        Crime|Mystery|Thriller  
3                Crime|Thriller  
4        Drama|Mystery|Thriller  
5           Crime|Drama|Mystery  
6  Crime|Drama|Mystery|Thriller  
7          Crime|Drama|Thriller  
8   Action|Crime|Drama