## Capstone Project
#### Introduction to Machine Learning
#### Oviya Adhan

In [1]:
import random
random.seed(10945490) #set random seed to N-number
print(random.random())

0.2701886388051843


In [2]:
#Import packages
import numpy as np
import scipy
import sklearn
import matplotlib.pyplot as plt
import datetime as dt
from sklearn import metrics
from sklearn import model_selection
from sklearn.datasets import make_blobs, load_breast_cancer, load_iris
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import mean_squared_error
import pandas as pd
import autograd.numpy as np

### Pre-Processing

In [3]:
#Load movie titles
titles = pd.read_csv('movieTitles.csv', names = ['movieID', 'releaseDate', 'movieTitle'])
#movies = pd.read_table(data, names = ['movieID', 'releaseDate', 'movieTitle'], header=None)
print(titles.shape)
titles.head()

(5000, 3)


Unnamed: 0,Unnamed: 1,movieID,releaseDate,movieTitle
1,2003.0,Dinosaur Planet,,
2,2004.0,Isle of Man TT 2004 Review,,
3,1997.0,Character,,
4,1994.0,Paula Abdul's Get Up & Dance,,
5,2004.0,The Rise and Fall of ECW,,


In [4]:
#Parse through data --> ran once, commented out to run faster in re-runs
'''
data = open('ratings.csv', mode='w')
    
row = list()

with open('data.txt') as f:
    for line in f: 
        line = line.strip()
        if line.endswith(':'):
            # All below are ratings for this movie, until another movie appears.
            movie_id = line.replace(':', '')
        else:
            row = [x for x in line.split(',')]
            row.insert(0, movie_id)
            data.write(','.join(row))
            data.write('\n')
data.close()
'''

"\ndata = open('ratings.csv', mode='w')\n    \nrow = list()\n\nwith open('data.txt') as f:\n    for line in f: \n        line = line.strip()\n        if line.endswith(':'):\n            # All below are ratings for this movie, until another movie appears.\n            movie_id = line.replace(':', '')\n        else:\n            row = [x for x in line.split(',')]\n            row.insert(0, movie_id)\n            data.write(','.join(row))\n            data.write('\n')\ndata.close()\n"

In [5]:
#Read new csv with ratings
df = pd.read_csv('ratings.csv', names = ['movieID', 'userID', 'rating', 'date'])
print(df.shape)
df.head()

(27010225, 4)


Unnamed: 0,movieID,userID,rating,date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [6]:
#Convert date to numerical variable 
df['date_num'] = pd.to_datetime(df['date'])
df['date_num'] = df['date_num'].map(dt.datetime.toordinal)

In [7]:
#Replace NaN values (ratings) with 3
df.isnull().values.any() #False
df['ratings'] = df['rating'].fillna(3)

### Splitting Data

In [8]:
#Test train split
testIndex = []

#Test set (1 random review for each movie)
test = df.groupby('movieID').sample(n=1, random_state = 1) #how to ensure random state is coming from pre-set seed?
testIndex = test.index
print('Test set: ', test.shape)

#Training set (all other reviews)
train = df.drop(testIndex, axis=0)
print('Training set: ', train.shape)
test.head()

Test set:  (5000, 6)
Training set:  (27005225, 6)


Unnamed: 0,movieID,userID,rating,date,date_num,ratings
480,1,2047577,3,2005-05-27,732093,3
597,2,222290,4,2005-03-14,732019,4
799,3,104768,5,2003-11-25,731544,5
2755,4,1949730,3,2004-04-28,731699,3
3191,5,1716193,5,2005-01-10,731956,5


In [9]:
#Split X and y
X_train = train[['movieID', 'userID', 'date_num']]
y_train = train[['rating']]

X_test = test[['movieID', 'userID', 'date_num']]
y_test = test[['rating']]

### Multiple Regression Model
All variables

In [10]:
#Fit the model
model = LinearRegression().fit(X_train,y_train)

In [11]:
#Check RMSE (using test set)

#Make predictions
y_pred = model.predict(X_test)

#Compare predictions with actual values
rmse = mean_squared_error(y_test, y_pred, squared=False)
print('RMSE: ', rmse)

#Normalize rmse
norm_rmse = rmse / (5 - 1) #over difference between max value (max rating is 5) and minimum value (min rating is 1)
print('Normalized RMSE: ', norm_rmse)

RMSE:  1.2554509110091085
Normalized RMSE:  0.31386272775227714


### Linear Regressions for each Variable

My purpose here is to find the best predictor variable and see if any of the models with single linear regressions (only one predictor variable) perform better than the aggregated multiple regression model above.

Model 1: movieID

Model 2: userID

Model 3: date_num

In [12]:
X1train = X_train[['movieID']]
X1test = X_test[['movieID']]

X2train = X_train[['userID']]
X2test = X_test[['userID']]

X3train = X_train[['date_num']]
X3test = X_test[['date_num']]

In [13]:
#Model, predictions, normalized rmse with only movieID as predictor
model1 = LinearRegression().fit(X1train,y_train)
y1pred = model1.predict(X1test)
rmse1 = mean_squared_error(y_test, y1pred, squared=False)/ (5 - 1)


In [14]:
#Model, predictions, normalized rmse with only userID as predictor
model2 = LinearRegression().fit(X2train,y_train)
y2pred = model2.predict(X2test)
rmse2 = mean_squared_error(y_test, y2pred, squared=False)/ (5 - 1)


In [15]:
#Model, predictions, normalized rmse with only date as predictor
model3 = LinearRegression().fit(X3train,y_train)
y3pred = model3.predict(X3test)
rmse3 = mean_squared_error(y_test, y3pred, squared=False)/ (5 - 1)


In [16]:
#Compare RMSE of 3 linear regressions and the prior multiple regression
print('NORMALIZED RMSE OF EACH MODEL')
print('Multiple Regression: ', norm_rmse)
print('Simple Linear Regression (Movie ID): ', rmse1)
print('Simple Linear Regression (User ID): ', rmse2)
print('Simple Linear Regression (Date): ', rmse3)

NORMALIZED RMSE OF EACH MODEL
Multiple Regression:  0.31386272775227714
Simple Linear Regression (Movie ID):  0.3173333178027985
Simple Linear Regression (User ID):  0.317197154310339
Simple Linear Regression (Date):  0.3137075344341073


### Extra Credit

In [18]:
#Just first movie reviews
df2 = df.loc[df['movieID'] == 1]
df.shape

#Correlation between date and rating
df2['date_num'].corr(df2['rating'])

0.12582654658290654