# Movie Recommendation Engine

In [1]:
import pandas as pd
import numpy as np
import json
import ast
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier

# Ratings Numpy

In [2]:
def get_ratings_matrix():
    movies_dataset = pd.read_csv('./the-movies-dataset/ratings_small.csv', low_memory=False)
    ratings = movies_dataset.values
    movie_ids = np.array(movies_dataset.iloc[:, 1].unique())
    no_movie_ids = movie_ids.max() + 1
    user_ids = np.array(movies_dataset.iloc[:, 0].unique())
    no_user_ids = user_ids.max()
    matrix = np.zeros((no_user_ids, no_movie_ids))
    for i in range(1, no_user_ids + 1):
        var1 = (ratings[ratings[:, 0] == i])[:, 1]
        var1 = var1.astype(int)
        var2 = (ratings[ratings[:, 0] == i])[:, 2]
        temp = np.zeros(no_movie_ids)
        temp[var1] = var2
        matrix[i - 1] = temp
    matrix = matrix.transpose()
    temp = np.arange(1, matrix.shape[0] + 1).reshape(matrix.shape[0], 1)
    matrix = np.hstack([temp, matrix])
    user_ids = np.insert(user_ids, 0, 0)
    matrix = np.vstack([user_ids, matrix])
    ratings = pd.DataFrame(data=matrix[1:, 1:], index=matrix[1:, 0], columns=matrix[0, 1:])
    ratings.index = ratings.index.astype('int32')
    ratings.columns = ratings.columns.astype('int32')
    ratings.index.name = "id"
    return ratings, movie_ids, user_ids

# Features Numpy

In [3]:
# global variable
set_genres = set()
dict_genres = {}


def get_genres(x):
    js = json.loads(str(x))
    genres = [li['name'] for li in js]
    set(genres)
    return genres


def list_genres(x):
    global set_genres
    set_genres |= set(x)


def features(x):
    global dict_genres
    list_feature = [0] * len(dict_genres)
    if len(x) == 0:
        return np.nan
    value = 1.0 / (float(len(x)))
    for item in x:
        list_feature[dict_genres[item]] = value
    return np.around(list_feature, decimals=2)


features_matrix = np.arange(20).reshape((1, 20))


def generate_features_matrix(x):
    global features_matrix
    x.reshape((1, 20))
    features_matrix = np.vstack([features_matrix, x])


def get_features():
    movies_metadata = pd.read_csv('./the-movies-dataset/movies_metadata.csv', low_memory=False)
    movies_metadata.drop(movies_metadata[movies_metadata.id == "1997-08-20"].index, inplace=True)
    movies_metadata.drop(movies_metadata[movies_metadata.id == "2012-09-29"].index, inplace=True)
    movies_metadata.drop(movies_metadata[movies_metadata.id == "2014-01-01"].index, inplace=True)
    movies_metadata["genres"] = movies_metadata.genres.apply(lambda x: str(x).replace("'", '"'))
    movies_metadata["belongs_to_collection"] = movies_metadata.belongs_to_collection.apply(
        lambda x: str(x).replace("'", '"'))
    movies_metadata["genres_list"] = movies_metadata.genres.apply(get_genres)
    movies_metadata.genres_list.apply(list_genres)
    global set_genres
    global dict_genres
    set_genres = list(set_genres)
    set_genres.sort()
    for index, item in enumerate(set_genres):
        dict_genres[item] = index
    movies_metadata["features_genres"] = movies_metadata["genres_list"].apply(features)
    movies_metadata.dropna(subset=['features_genres'], inplace=True)
    movie_names = movies_metadata.loc[:, ['original_title', 'id']]
    movie_names['id'] = pd.to_numeric(movie_names.id, errors='coerce').fillna(0).astype(np.int64)
    movie_names.set_index(['id'], inplace=True)
    features_dataframe = movies_metadata.loc[:, ['id', 'features_genres']]
    features_dataframe.set_index(['id'], inplace=True)
    movie_ids = movies_metadata['id'].values
    movie_ids = movie_ids.astype(np.int)
    features_dataframe.features_genres.apply(generate_features_matrix)
    features_dataframe = pd.DataFrame(data=features_matrix[1:, :], columns=features_matrix[0, :], index=movie_ids)
    features_dataframe.index.name = 'id'
    features_dataframe.columns = features_dataframe.columns.astype('int32')
    features_dataframe.index = features_dataframe.index.astype('int32')
    return features_dataframe, movie_ids, np.array(list(set_genres)), movie_names


# Filter

In [4]:
def filter():
    ratings, movie_ids_ratings, user_ids = get_ratings_matrix()
    features_dataframe, movie_ids_features, list_genres, movie_names = get_features()
    movie_ids_ratings = set(movie_ids_ratings)
    movie_ids_features = set(movie_ids_features)
    movies_common = movie_ids_features.intersection(movie_ids_ratings)
    movies_common = list(movies_common)
    ratings = ratings.loc[movies_common, :]
    features_dataframe = features_dataframe.loc[movies_common, :]
    return features_dataframe.values[1:, :], ratings.values[:, 1:], np.array(movies_common), user_ids, list_genres, movie_names

# Main

In [5]:
X, ratings_vector, movies_ids, user_ids, list_genres, movie_names = filter()

In [None]:
movie_names

In [11]:
clf = LinearRegression()
top_5_recommendations = np.empty((ratings_vector.shape[1], 5),dtype=np.dtype(object))   
for i in range(ratings_vector.shape[1]):
    single_y = ratings_vector[:, i]
    if single_y[single_y != 0].shape[0] != 0:
        positions = np.nonzero(single_y)[0]                                                              
        remaining_positions = np.where(single_y == 0)[0]                                                 
        reg = clf.fit(X[positions], single_y[positions])                                                 
        predictions = reg.predict(X[remaining_positions])                                                
        top_5_recommendations_each = movies_ids[np.argsort(predictions)[-5:]]                            
        top_5_recommendations[i] = movie_names.loc[top_5_recommendations_each,:].values.reshape(1,5)[0]  

In [21]:
single_y = ratings_vector[:, 0]    
positions = np.nonzero(single_y)[0]
single_y = single_y[positions]
single_y = single_y[single_y >=4.0]
print("Ratings given by user 1")
print("========================")
for i in range(single_y.shape[0]):                                                                               
    print(movie_names.loc[movies_ids[positions],'original_title'].values[i]+"      -        " + str(single_y[i]))
print()
print("Movies Recommended for user 1")
print("=============================")
print(top_5_recommendations[0])

Ratings given by user 1
Star Wars      -        4.0
The Fifth Element      -        5.0
Twelve Monkeys      -        4.0
Scarface      -        4.0
Breaking the Waves      -        5.0
Star Trek II: The Wrath of Khan      -        4.0
Edward Scissorhands      -        4.0
La Boum      -        5.0
Predator 2      -        5.0
Lucky Number Slevin      -        4.0
Sin City      -        4.0
Rebecca      -        4.0
Człowiek z marmuru      -        4.0
Muriel's Wedding      -        4.0
The War of the Roses      -        4.0
King Kong      -        4.0
The King of Comedy      -        4.0
Le Mépris      -        4.0
Carne trémula      -        5.0
Das weisse Rauschen      -        5.0
The Silence of the Lambs      -        5.0
A River Runs Through It      -        5.0
Rio Bravo      -        4.0

Movies Recommended for user 1
['Deconstructing Harry' "C'era una volta il West" "Miller's Crossing"
 'Meet Me in St. Louis' "I'll Sleep When I'm Dead"]
