In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
import random
import matplotlib.pyplot as plt

In [2]:
# Load the ratings data (later cells read its userId, title and rating
# columns); the CSV's first column becomes the row index.
df = pd.read_csv('./data/cleaned_df.csv', index_col=0)

In [3]:
# Ratings matrix R: one row per userId, one column per movie title.
# Cells with no rating for a (user, title) pair come out of the pivot as NaN.
R = df.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

In [4]:
# handling missing data
def fill_nan(df):
    """Return a copy of ``df`` with every NaN replaced by one global value.

    The fill value is the median of the per-column medians
    (``df.median().median()``), so the same constant is used everywhere in
    the frame regardless of row or column.

    The input frame is not modified; callers must use the returned frame
    (e.g. ``R = fill_nan(R)``). This keeps cells idempotent and avoids
    silently changing a frame that an earlier cell already displayed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame that may contain NaN.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index/columns and NaNs filled.
    """
    fill_value = df.median().median()
    return df.fillna(fill_value)

In [5]:
# Replace every missing rating in R with the single fill value chosen by fill_nan.
R=fill_nan(R)

In [6]:
#cleaning column names

def clean_columns(df):
    """Tidy the column labels of ``df`` in place.

    Every label has its leading and trailing whitespace stripped and all
    apostrophes (') removed. The cleaned labels are assigned back to
    ``df.columns`` as a plain list; nothing is returned.

    Note: labels that differ only by apostrophes or outer whitespace end up
    identical after cleaning.
    """
    df.columns = [label.strip().replace("'", "") for label in df.columns]

In [7]:
# Clean R's movie-title column labels (clean_columns edits them in place),
# then display the matrix.
clean_columns(R)
R

Unnamed: 0_level_0,71 (2014),Hellboy: The Seeds of Creation (2004),Round Midnight (1986),Salems Lot (2004),Til There Was You (1997),Tis the Season for Love (2015),"burbs, The (1989)",night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,4.0,3.5
2,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
3,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
4,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
607,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
608,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,4.5,3.5,3.5,3.5,3.5
609,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5


#### Train NMF

In [8]:
# instantiate NMF model with 20 latent components; random_state pins the
# random parts of the solver so Restart & Run All reproduces the same factors
m = NMF(n_components=20, random_state=42)

In [9]:
# Fit the factorisation on the NaN-filled ratings matrix R.
m.fit(R)



NMF(n_components=20)

#### Check out the sub-matrices, and the reconstruction error

In [10]:
# P: per-user factors, obtained by projecting R onto the fitted components.
# Q: the fitted components themselves (one row per latent component).
P = m.transform(R)
Q = m.components_
error = m.reconstruction_err_  # reconstruction error stored on the fitted model
(P.shape, Q.shape, error)

((610, 20), (20, 9719), 274.2853387857467)

#### Reconstruct the original matrix 

In [11]:
# Rebuild an approximation of R from the two factor matrices and display it,
# rounded to one decimal, with R's user and title labels attached.
new_R = P @ Q
pd.DataFrame(new_R.round(1), index=R.index, columns=R.columns)

Unnamed: 0_level_0,71 (2014),Hellboy: The Seeds of Creation (2004),Round Midnight (1986),Salems Lot (2004),Til There Was You (1997),Tis the Season for Love (2015),"burbs, The (1989)",night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.5,3.5,3.5,3.5,3.5,3.5,3.6,3.5,3.6,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
2,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
3,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.4,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
4,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.5,3.5,3.5,3.5,3.5,3.5,3.3,3.5,3.7,3.5,...,3.5,3.5,3.5,3.5,3.5,3.6,3.5,3.5,3.5,3.5
607,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5
608,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.7,3.5,...,3.5,3.5,3.5,3.5,3.5,4.2,3.6,3.6,3.2,3.5
609,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,...,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5,3.5


#### Make a prediction based on new user input

In [12]:
# Simulate a new user: one random rating in [0, 5], rounded to one decimal,
# for every movie column of R. A dedicated seeded generator makes the draw
# reproducible on Restart & Run All without touching the global random state.
_new_user_rng = random.Random(42)
new_ratings = [round(_new_user_rng.uniform(0, 5), 1) for _ in R.columns]

In [13]:
# Shape the ratings as a single-row 2-D array (1 user x n_movies); -1 lets
# NumPy infer the movie count instead of hard-coding this dataset's size.
new_ratings = np.array(new_ratings).reshape(1, -1)

In [14]:
# Prediction step 1 - project the new user's ratings onto the fitted
# components to get that user's factor row, user_P
user_P = m.transform(new_ratings)



In [15]:
# Predicted ratings for the new user only: their factor row times Q.
user_R = user_P @ Q

In [16]:
# Keep only the new user's predicted scores for the first 50 movie columns.
# NOTE(review): nothing here removes already-seen films, and no zipping or
# sorting happens in this cell. The cell also overwrites user_R with a slice of
# itself, so it is not safe to re-run on its own without recomputing user_R.
user_R = user_R[0][:50]

In [17]:
# Inspect the 50 predicted scores kept for the new user.
user_R

array([2.48717825, 2.48879545, 2.48865362, 2.48992129, 2.48994924,
       2.47975519, 2.53129238, 2.48745996, 2.46938817, 2.49082377,
       2.48785149, 2.4875365 , 2.4858844 , 2.48956338, 2.48857756,
       2.48604042, 2.44926163, 2.48915947, 2.42052052, 2.48865362,
       2.54446061, 2.48769715, 2.49586732, 2.44550417, 2.44374426,
       2.49194098, 2.48865362, 2.46645165, 2.4896101 , 2.4908321 ,
       2.48892188, 2.48168966, 2.49260878, 2.48913186, 2.50995438,
       2.47937169, 2.47546969, 2.46892005, 2.46175912, 2.4528631 ,
       2.43974615, 2.48886173, 2.49538384, 2.41402951, 2.4703409 ,
       2.48865362, 2.47546969, 2.49573851, 2.43134609, 2.46696197])

In [18]:
# Pair each predicted score with the title of the matching movie column
# (the first 50 columns of R, in the same order as user_R).
recommendations = [
    (score, title) for score, title in zip(user_R, R.columns[:50])
]

In [19]:
# Ten highest-scoring (score, title) pairs; the sort is ascending, so the
# best recommendation is the last entry shown.
sorted(recommendations, key=lambda pair: pair[0])[-10:]

[(2.4908237719867437, '*batteries not included (1987)'),
 (2.4908321046475654, '10th Kingdom, The (2000)'),
 (2.491940979287717, '101 Dalmatians II: Patchs London Adventure (2003)'),
 (2.492608783636327, '11:14 (2003)'),
 (2.4953838403193878, '13 Ghosts (1960)'),
 (2.4957385118535695, '13th (2016)'),
 (2.495867324513456, '100 Streets (2016)'),
 (2.5099543775398403, '12 Angry Men (1957)'),
 (2.531292376627733, 'burbs, The (1989)'),
 (2.5444606080793712, '10,000 BC (2008)')]

#### Collaborative Filtering

In [20]:
# Movie-by-user ratings matrix: one row per title, one column per userId.
cf = df.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)

In [21]:
# Fill missing ratings with fill_nan's single fill value: the median of the
# per-column medians (the columns of cf are users).
cf=fill_nan(cf) 

In [22]:
# Display the filled movie-by-user matrix.
cf

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
'Hellboy': The Seeds of Creation (2004),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
'Round Midnight (1986),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
'Salem's Lot (2004),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
'Til There Was You (1997),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.5,4.0,4.0
xXx (2002),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,3.5,4.0,2.0
xXx: State of the Union (2005),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,1.5
¡Three Amigos! (1986),4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


In [23]:
def cosim(vec1, vec2):
    """Calculate the cosine similarity between two equal-length vectors.

    Computed as dot(vec1, vec2) / (|vec1| * |vec2|), where each norm is the
    square root of the vector's dot product with itself. An all-zero vector
    makes the denominator zero, so no meaningful similarity is returned.
    """
    norm1 = np.sqrt(np.dot(vec1, vec1))
    norm2 = np.sqrt(np.dot(vec2, vec2))
    return np.dot(vec1, vec2) / (norm1 * norm2)

In [24]:
# Convert the userId column labels to strings so a user's ratings can be
# selected with a string key such as cf['1'].
cf.columns=cf.columns.astype(str)

In [25]:
# Spot-check: cosine similarity between the rating columns of users 1 and 500.
cosim(cf['1'], cf['500'])

0.9987424868281689

In [26]:
# Show cf's row labels (movie titles). Unlike R's columns, these titles were
# not passed through clean_columns, so apostrophes are still present.
cf.index

Index([''71 (2014)', ''Hellboy': The Seeds of Creation (2004)',
       ''Round Midnight (1986)', ''Salem's Lot (2004)',
       ''Til There Was You (1997)', ''Tis the Season for Love (2015)',
       ''burbs, The (1989)', ''night Mother (1986)',
       '(500) Days of Summer (2009)', '*batteries not included (1987)',
       ...
       'Zulu (2013)', '[REC] (2007)', '[REC]² (2009)',
       '[REC]³ 3 Génesis (2012)',
       'anohana: The Flower We Saw That Day - The Movie (2013)',
       'eXistenZ (1999)', 'xXx (2002)', 'xXx: State of the Union (2005)',
       '¡Three Amigos! (1986)', 'À nous la liberté (Freedom for Us) (1931)'],
      dtype='object', name='title', length=9719)

In [27]:
# All-pairs cosine similarity between users (the columns of cf), computed with
# one matrix product instead of a Python double loop over every column pair.
_user_vectors = cf.to_numpy(dtype=float)                 # rows: titles, columns: users
_user_norms = np.sqrt((_user_vectors ** 2).sum(axis=0))  # Euclidean norm of each user column
# Entry [i][j] is dot(user_i, user_j) / (|user_i| * |user_j|), the same
# quantity cosim returns for that pair; .tolist() keeps `data` a list of rows.
data = (
    (_user_vectors.T @ _user_vectors)
    / (_user_norms[:, None] * _user_norms[None, :])
).tolist()

In [28]:
# User-by-user similarity table, labelled with user ids on both axes and
# rounded to two decimals for display.
cs = pd.DataFrame(
    data,
    columns=cf.columns,
    index=cf.columns,
).round(2)
cs

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,0.99,1.00,1.00,1.00,1.00,0.99,1.00,1.00
2,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,0.99,1.00,1.00
3,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,0.99,1.00,1.00,1.00,1.00,0.99,1.00,1.00
4,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,0.99,1.00,1.00,1.00,1.00,0.99,1.00,1.00
5,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,0.99,1.00,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,0.99,1.00,1.00,1.00,1.00,0.99,1.00,0.99
607,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,0.99,1.00,1.00,1.00,1.00,0.99,1.00,1.00
608,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,0.99,...,0.99,0.99,0.99,0.99,0.99,0.99,0.99,1.00,0.99,0.99
609,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,1.00,...,1.00,1.00,1.00,1.00,1.00,1.00,1.00,0.99,1.00,1.00
