In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv
import numba
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn import linear_model

### Features:

Age, Gender, Occupation, Zipcode, Genre

In [4]:
# Load data
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', engine='python')
users = pd.read_csv('ml-1m/users.dat', sep='::', engine='python')
movies = pd.read_csv('ml-1m/movies.dat', sep='::', engine='python')

ratings.columns = map(str.lower, ratings.columns)
users.columns = map(str.lower, users.columns)
movies.columns = map(str.lower, movies.columns)


df = pd.merge(ratings,users)
df = pd.merge(df,movies)
df['gender'] = df['gender'].replace(['M','F'],[0,1])
df['genres'] = df['genres'].str.replace('|',',')
df['genres'] = df['genres'].str.replace('Action','0')
df['genres'] = df['genres'].str.replace('Adventure','1')
df['genres'] = df['genres'].str.replace('Animation','2')
df['genres'] = df['genres'].str.replace("Children's",'3')
df['genres'] = df['genres'].str.replace('Comedy','4')
df['genres'] = df['genres'].str.replace('Crime','5')
df['genres'] = df['genres'].str.replace('Documentary','6')
df['genres'] = df['genres'].str.replace('Drama','7')
df['genres'] = df['genres'].str.replace('Fantasy','8')
df['genres'] = df['genres'].str.replace('Film-Noir','9')
df['genres'] = df['genres'].str.replace('Horror','10')
df['genres'] = df['genres'].str.replace('Musical','11')
df['genres'] = df['genres'].str.replace('Mystery','12')
df['genres'] = df['genres'].str.replace('Romance','13')
df['genres'] = df['genres'].str.replace('Sci-Fi','14')
df['genres'] = df['genres'].str.replace('Thriller','15')
df['genres'] = df['genres'].str.replace('War','16')
df['genres'] = df['genres'].str.replace('Western','17')


df['age'] = df['age'].replace('1',0)
df['age'] = df['age'].replace('18',1)
df['age'] = df['age'].replace('25',2)
df['age'] = df['age'].replace('35',3)
df['age'] = df['age'].replace('45',4)
df['age'] = df['age'].replace('50',5)
df['age'] = df['age'].replace('56',6)

In [5]:
df.head()

Unnamed: 0,userid,movieid,rating,timestamp,gender,age,occupation,zip-code,title,genres
0,1,1193,5,978300760,1,1,10,48067,One Flew Over the Cuckoo's Nest (1975),7
1,2,1193,5,978298413,0,56,16,70072,One Flew Over the Cuckoo's Nest (1975),7
2,12,1193,4,978220179,0,25,12,32793,One Flew Over the Cuckoo's Nest (1975),7
3,15,1193,4,978199279,0,25,7,22903,One Flew Over the Cuckoo's Nest (1975),7
4,17,1193,5,978158471,0,50,1,95350,One Flew Over the Cuckoo's Nest (1975),7


In [6]:
n_users = users['userid'].max(); print("Number of users: {0}".format(n_users))
n_items = movies['movieid'].max(); print("Number of items: {0}".format(n_items))

Number of users: 6040
Number of items: 3952


### Building the features matrix

In [7]:
users = df['userid'].values-1
movies = df['movieid'].values-1
rating = df['rating'].values
gender = df['gender'].values
age = df['age'].values
occupation = df['occupation'].values
genres = df['genres'].values

In [8]:
# Building the features matrix
feat_matrix = np.zeros((n_users,n_items,48))

uid = ratings['userid'].values - 1
mid = ratings['movieid'].values - 1
rt =  ratings['rating'].values

for i in range(df.shape[0]):
    # Check gender
    u_index = uid[i]
    m_index = mid[i]
    
    if gender[i] == 0:
        feat_matrix[u_index, m_index, 0]=1
    else:
        feat_matrix[u_index, m_index ,1]=1

    # Check age
    for j in range(0,6):
        if age[i] == j:
            feat_matrix[u_index, m_index, j+2] = 1

    # Check occupation
    for k in range(0,20):
        if occupation[i] == k:
            feat_matrix[u_index, m_index, k+9] = 1

    # Check genres
    for l in range(0,17):
        if '%d' %(l) in genres[i]:
            feat_matrix[u_index, m_index, l+30]=1       

In [9]:
# Building the predictions matrix (ground truth) 
rating_matrix = np.zeros((n_users, n_items))

for i in range(df.shape[0]):
    # Check gender
    u_index = uid[i]
    m_index = mid[i]  
    rating_matrix[u_index,m_index] = rating[i]

In [10]:
@numba.jit()
def predict(feat_matrix, theta):
    m,n,_ = feat_matrix.shape
    pred = np.empty((m,n))
    
    for i in range(m):
        for j in range(n):
            pred[i,j] = np.dot(feat_matrix[i,j,:], theta)
    
    return pred

In [11]:
def loss(y_real, y_pred):
    return np.sum((y_real-y_pred)**2)

In [None]:
# 3D to 2D mapping
m,n,_ = feat_matrix.shape
_feat_matrix = np.empty((m*n,48))
for i in range(m):
    for j in range(n):
        _feat_matrix[i*j,:] = feat_matrix[i,j,:]

In [None]:
# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(_feat_matrix, pred_matrix.ravel())