In [3]:
#Libraries
import pandas as pd
import numpy as np
import surprise
from surprise import Reader, Dataset, SVD
from surprise.model_selection.validation import cross_validate

In [4]:
#Create function to read in txt files
def readFile(file_path, rows=100000):
    data_dict = {'Cust_Id' : [], 'Movie_Id' : [], 'Rating' : [], 'Date' : []}
    f = open(file_path, 'r')
    count = 0
    for line in f:
        count += 1
        if count > rows:
            break
        if ':' in line:
            movidId = line[:-2] #remove the last character ':'
            movieId = int(movidId)
        else:
            customerId, rating, date = line.split(',')
            data_dict['Cust_Id'].append(customerId)
            data_dict['Movie_Id'].append(movieId)
            data_dict['Rating'].append(rating)
            data_dict['Date'].append(date.rstrip("\n"))
    f.close()
    return pd.DataFrame(data_dict)

In [5]:
#Load in dataframes
df1 = readFile('combined_data_1.txt')
df2 = readFile('combined_data_2.txt')
df3 = readFile('combined_data_3.txt')
df4 = readFile('combined_data_4.txt')

In [6]:
#Change rating to float
df1['Rating'] = df1['Rating'].astype(float)
df2['Rating'] = df2['Rating'].astype(float)
df3['Rating'] = df3['Rating'].astype(float)
df4['Rating'] = df4['Rating'].astype(float)

In [7]:
df = df1.copy()
df = df.append(df2)
df = df.append(df3)
df = df.append(df4)

  df = df.append(df2)
  df = df.append(df3)
  df = df.append(df4)


In [8]:
df.index = np.arange(0, len(df))
df.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1488844,1,3.0,2005-09-06
1,822109,1,5.0,2005-05-13
2,885013,1,4.0,2005-10-19
3,30878,1,4.0,2005-12-26
4,823519,1,3.0,2004-05-03


In [9]:
df_title = pd.read_csv('movie_titles.csv', encoding = 'ISO-8859-1', header = None, names = ['Movie_Id', 'Year', 'Name'])
df_title = df_title.reset_index()
#Fix column name misalignemnt
title = pd.DataFrame()
title['Movie_Id'] = df_title['level_0']
title['Year'] = df_title['level_1']
title['Name'] = df_title['level_2']
title.head()

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW


In [10]:
#Load Reader
reader = Reader()

In [11]:
data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']], reader)

In [12]:
#Load SVD
svd = SVD()

In [13]:
#Run 5-fold cross-validation and print results
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0172  1.0146  1.0149  1.0179  1.0151  1.0159  0.0014  
MAE (testset)     0.8092  0.8072  0.8071  0.8081  0.8075  0.8078  0.0008  
Fit time          29.93   30.62   31.11   24.10   22.83   27.72   3.52    
Test time         1.11    1.16    0.74    0.87    0.76    0.93    0.17    


{'test_rmse': array([1.01720167, 1.01458459, 1.01487126, 1.01788628, 1.01506516]),
 'test_mae': array([0.80923237, 0.80715581, 0.80706109, 0.80808894, 0.80749916]),
 'fit_time': (29.93142008781433,
  30.618131160736084,
  31.114326238632202,
  24.099038124084473,
  22.8329439163208),
 'test_time': (1.1067419052124023,
  1.1581075191497803,
  0.7448475360870361,
  0.866708517074585,
  0.7574775218963623)}

In [14]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x22ed2c919a0>

In [15]:
titles = title.copy()

titles['Estimate_Score'] = titles['Movie_Id'].apply(lambda x: svd.predict(30878, x).est)

In [16]:
titles = titles.sort_values(by=['Estimate_Score'], ascending=False)
titles.head()

Unnamed: 0,Movie_Id,Year,Name,Estimate_Score
12,13,2003.0,Lord of the Rings: The Return of the King: Ext...,4.469691
9235,9236,1998.0,South Park: Season 2,4.099968
4505,4506,1961.0,Breakfast at Tiffany's,4.034622
24,25,1997.0,Inspector Morse 31: Death Is Now My Neighbour,4.031928
13379,13380,1949.0,Stray Dog,3.996518
