In [18]:
""" NNRecommender.py
    Script to develop a recommender system based on movielens data set.
    Script uses 100k data set whcih can be found here
    https://grouplens.org/datasets/movielens/100k/
"""

import numpy as np
import pandas as pd

#All the genres used to categorize movies. 
genreList = ['Unknown','Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western']

""" Users with the following columns of information
    -ID
    -Age
    -Gender
    -Occupation
    -Zipcode
"""
users = pd.read_csv('./ml-100k/u.user',sep = '|',header = None, index_col=False, names = ['UserID','Age','Gender','Occupation','Zip-code'],engine='python')

""" Movies with the following columns of information
    -ID
    -Title
    -Release Date
    -Video Date
    -IMDBURL
"""
movies = pd.read_csv('./ml-100k/u.item',sep = '|',header = None, index_col=False, names = ['MovieID','Title','ReleaseDate','VideoDate','IMDBURL'] + genreList ,engine='python')

""" All ratings with the following columns of information
    -User ID
    -Move ID
    -Rating
    -Time Stamp
"""
ratings = pd.read_csv('./ml-100k/u.data',sep = '\t',header = None, index_col=False,names =['UserID','MovieID','Rating','Timestamp'] ,engine='python')

#Drop unused data 
ratings = ratings.drop('Timestamp',axis=1)
users = users.drop('Zip-code',axis=1)
users = users.drop('Occupation',axis=1)

#Reformat Gender 
users = users.replace(to_replace=['M','F'],value=[1,0])


In [2]:
#Simple functions

def get_ratings_mov(MovieID):
    """ Get the dataframe object of all movie ratings for a given movie
    """
    return ratings[ratings.MovieID == MovieID]

def get_ratings_usr(UserID):
    """ Get the dataframe object of all movie ratings for a given user
    """
    return ratings[ratings.UserID == UserID]

def get_rating_umv(UserID, MovieID):
    """ Get the rating that a user gave a particular movie
    """
    users = get_ratings_usr(UserID)
    answer = users[users.MovieID == MovieID]
    if answer.empty:
        return None
    return answer

def get_movies_gnr(genre):
    """ Get the dataframe object with all movies of a particular genre
    """
    return movies[getattr(movies,genre) == 1]

def get_users_age(Age):
    """ Get the dataframe object with all userIDs of a particular age
    """
    return users[users.Age == Age]

def get_users_gnd(Gender):
    """ Get the dataframe object with all userIDs of a particular gender
    """
    return users[users.Gender == Gender]

def get_users_usr(UserID):
    """ Get the dataframe object with a particular users information
    """
    return users[users.UserID == UserID]

In [14]:
def three_year_mean(user_ID,movie_ID):
    """Function used to find average rating for a movie in a chosen age range
    """
    #Find user's age
    user_age = get_users_usr(user_ID).Age.values[0]
    
    #Get all users within a year of the user's age
    Xusers = get_users_age(user_age).append(get_users_age(user_age+1)).append(get_users_age(user_age-1))
    
    #Use only users, movies, and ratings
    Yusers = pd.DataFrame(columns = ['UserID', 'MovieID', 'Rating'])
    
    #Loop through all rows of users within age range and get all their ratings
    for ind,val in Xusers.iterrows():
        Yusers = Yusers.append(ratings.loc[ratings['UserID'] == val[0]])
        
    #Find all the ratings for one given movie from users in age range and take average
    Ydata = Yusers[Yusers.MovieID == movie_ID]
    return np.mean(Ydata.Rating)

In [23]:
def gender_mean(user_ID,movie_ID):
    """Function used to find average rating for a movie for one gender
    """
    #Get gender of user
    user_gnd = get_users_usr(user_ID).Gender.values[0]
    
    #Get all users of male or female
    Xusers = get_users_gnd(user_gnd)
    Yusers = pd.DataFrame(columns = ['UserID', 'MovieID', 'Rating'])
    
    #Loop through all rows of users of same gender and get their ratings
    for ind,val in Xusers.iterrows():
        Yusers = Yusers.append(ratings.loc[ratings['UserID'] == val[0]])
        if ind > 100:
            break
            
    #Find all ratings for one given movie from users with same gender and take average. 
    Ydata = Yusers[Yusers.MovieID == movie_ID]
    return np.mean(Ydata.Rating)

In [24]:
def user_mean(user_ID):
    """Get average rating for a given user
    """
    Ydata = get_ratings_usr(user_ID)
    return np.mean(Ydata.Rating)
def movie_mean(movie_ID):
    """Get average rating for a given movie
    """
    Ydata = get_ratings_mov(movie_ID)
    return np.mean(Ydata.Rating)

In [90]:
X = np.zeros((100000,4))
Y = np.zeros((100000,1))
for ind,val in ratings.iterrows():
    if ind >= 100000:
        break
    user_ID = val[0]
    movie_ID = val[1]
    rt = val[2]
    Y[ind] = rt
    X[ind] = [three_year_mean(user_ID,movie_ID), gender_mean(user_ID,movie_ID), user_mean(user_ID),movie_mean(movie_ID)]
X = np.nan_to_num(X)


In [91]:
from sklearn.model_selection import train_test_split
X.reshape(30000,4)
Y.reshape(30000,1)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.20)


In [114]:
from sklearn.neural_network import MLPRegressor
mlp = MLPRegressor(hidden_layer_sizes=(16,12,8,4),max_iter=2000)
mlp.fit(X_train,Y_train.ravel())


MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(16, 12, 8, 4), learning_rate='constant',
       learning_rate_init=0.001, max_iter=2000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [117]:
from sklearn.metrics import mean_squared_error, explained_variance_score

predictions = mlp.predict(X_test)
print(np.sqrt(mean_squared_error(Y_test,predictions)))
print(np.sqrt( explained_variance_score(Y_test,predictions)))


0.85698151172
0.67511105814
