## Auxiliary notebook to help deploy the web app and the unsupervised learning interactively

In [1]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.decomposition import NMF


#### 1. First, merging the movies and ratings dataframes


In [2]:
# Load the raw user ratings; the timestamp column is not used afterwards, so drop it right away.
ratings = pd.read_csv('../data/ratings.csv').drop(columns=['timestamp'])
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [3]:
# Load the movie catalogue; only movieId and title are kept, so drop the genres column.
movies = pd.read_csv('../data/movies.csv').drop(columns=['genres'])
movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [4]:
# Attach the movie title to every rating by joining on the shared movieId key.
merged = movies.merge(ratings, on='movieId')
merged

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5
...,...,...,...,...
100831,193581,Black Butler: Book of the Atlantic (2017),184,4.0
100832,193583,No Game No Life: Zero (2017),184,3.5
100833,193585,Flint (2017),184,3.5
100834,193587,Bungo Stray Dogs: Dead Apple (2018),184,3.5


In [5]:
# Reshape the long ratings table into a user x movie-title matrix; unrated pairs become NaN.
users_rating = merged.pivot_table(values='rating', index='userId', columns='title')
users_rating

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [6]:
# Keep only the movies (columns) that were rated at least 50 times, i.e. that
# have at least 50 non-NaN values. Dropping the rarely rated movies keeps the
# computation time of the later steps manageable.
users_rating = users_rating.dropna(axis='columns', thresh=50)
users_rating

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,4.0,,,...,5.0,5.0,,5.0,,,,5.0,,
2,,,,,,,,,,,...,,,5.0,,,,,,3.0,
3,,,,,,,,,,,...,,,,,,,0.5,,,
4,,5.0,,,,,,,,,...,4.0,5.0,,,,,,,,
5,,,,,,,,,3.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,5.0,,,,3.5,,,2.0,...,,,,,,,3.5,3.5,,
607,,,,,,,,,,,...,,5.0,,3.0,,,,,,
608,,,3.0,3.5,5.0,,4.5,3.0,3.5,2.0,...,3.5,2.5,,4.0,4.0,4.0,,,,3.0
609,,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Mean rating of each movie (one value per column of the user x movie matrix).
users_rating.mean()

title
10 Things I Hate About You (1999)    3.527778
12 Angry Men (1957)                  4.149123
2001: A Space Odyssey (1968)         3.894495
28 Days Later (2002)                 3.974138
300 (2007)                           3.681250
                                       ...   
X2: X-Men United (2003)              3.723684
You've Got Mail (1998)               3.120000
Young Frankenstein (1974)            3.992754
Zombieland (2009)                    3.877358
Zoolander (2001)                     3.509259
Length: 450, dtype: float64

In [11]:
# Replace every missing rating with the mean rating of the corresponding movie (column).
ratings_mean = users_rating.mean()
users_rating_imputed = users_rating.fillna(ratings_mean)
users_rating_imputed

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,4.000000,3.040373,2.727273,...,5.00000,5.000000,3.916667,5.000000,3.355769,3.723684,3.12,5.000000,3.877358,3.509259
2,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,5.000000,3.699248,3.355769,3.723684,3.12,3.992754,3.000000,3.509259
3,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,0.50,3.992754,3.877358,3.509259
4,3.527778,5.000000,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,4.00000,5.000000,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259
5,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.000000,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.527778,4.149123,5.000000,3.974138,3.68125,3.547297,3.500000,3.475806,3.040373,2.000000,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.50,3.500000,3.877358,3.509259
607,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,5.000000,3.916667,3.000000,3.355769,3.723684,3.12,3.992754,3.877358,3.509259
608,3.527778,4.149123,3.000000,3.500000,5.00000,3.547297,4.500000,3.000000,3.500000,2.000000,...,3.50000,2.500000,3.916667,4.000000,4.000000,4.000000,3.12,3.992754,3.877358,3.000000
609,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259


In [25]:
# Persist the imputed user x movie matrix as 'ratings.csv' in the current working
# directory (a different file from the raw '../data/ratings.csv' read earlier).
users_rating_imputed.to_csv('ratings.csv')

#### 2. Next steps:
1. Follow the NMF guide to factorize and save the Model in a .pkl
2. Follow the cosine similarity notebook and adapt it also to the users_rating_imputed data frame
3. Put all cleaned and essential steps in the functions taking the user queries
4. Look Alis notebooks for help given the case

In [None]:
# Factorize the imputed ratings matrix with NMF, then display the reconstruction error of the fit.
factorizer = NMF(n_components=450, max_iter=4000).fit(users_rating_imputed)
factorizer.reconstruction_err_

1.0301726268190194

In [None]:
def NMF_fit(n_components=450, max_iter=4000):
    """Fit an NMF model on ``users_rating_imputed`` and return the fitted model.

    The fit is kept inside a function so that the expensive computation
    (about 27 minutes with the default settings, according to the original
    author's note) only runs when it is explicitly requested. The defaults
    are the hyperparameters the author settled on after several tries.

    Parameters
    ----------
    n_components : int, default 450
        Number of components passed to ``NMF``.
    max_iter : int, default 4000
        Maximum number of iterations passed to ``NMF``.

    Returns
    -------
    NMF
        The fitted model; its reconstruction error is available as
        ``reconstruction_err_``.
    """
    factorizer = NMF(n_components=n_components, max_iter=max_iter)
    factorizer.fit(users_rating_imputed)
    # Return the fitted model. Previously the function ended with a bare
    # `factorizer.reconstruction_err_` expression, which has no effect inside
    # a function, so the result of the long fit was lost to the caller.
    return factorizer

In [None]:
def minimal_error_plot():
    """Plot the NMF reconstruction error against the number of components k.

    For every k from 250 up to (but not including) the smaller dimension of
    ``users_rating_imputed``, an NMF model limited to a single iteration
    (``max_iter=1``) is fitted on the full imputed user/movie matrix. Each
    (k, rounded error) pair is printed, and finally the error is plotted as
    a function of k.

    Written by Saleh :)
    """
    # k has to stay below min(m, n), where m x n is the size of the ratings matrix R
    smallest_dim = min(users_rating_imputed.shape)

    error_by_k = {}
    for k in range(250, smallest_dim):
        # Instantiate the model and fit it on the full imputed matrix in one go
        model = NMF(n_components=k, max_iter=1).fit(users_rating_imputed)

        # Record and report the (rounded) reconstruction error for this k
        err = round(model.reconstruction_err_, 2)
        error_by_k[k] = err
        print(f"k = {k}, err = {err}")

    # Draw error versus k
    plt.plot(list(error_by_k), list(error_by_k.values()))
    plt.xlabel("k")
    plt.ylabel("error")
    plt.title("find the optimal number of components k")

In [None]:
# Q: the components of the fitted model (a two-dimensional numpy array),
# wrapped in a DataFrame whose columns are labelled with the movie titles.
Q = pd.DataFrame(data=factorizer.components_, columns=users_rating_imputed.columns)
Q

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
0,0.827956,1.171025,0.941614,1.077769,0.880602,0.80415,0.821580,0.919889,0.480037,0.479314,...,0.875100,1.016807,1.087485,0.940651,0.906733,0.990360,0.796754,1.106525,0.851356,0.954195
1,0.028412,0.005322,0.000000,0.000000,0.058383,0.00000,0.033592,0.005596,0.000000,0.000000,...,0.000000,0.004095,0.010738,0.009238,0.020991,0.000000,0.000920,0.023011,0.123919,0.000000
2,0.000000,0.000000,0.000000,0.075534,0.105551,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.012201,0.019453,0.165617,0.000000,0.083132,0.003293,0.000000,0.000000,0.024312
3,0.000000,0.521578,0.763738,0.000000,0.626006,0.00000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.611440,0.000000,0.000000,0.003585,0.007204,0.000000,0.020664,0.027539,0.000000
4,0.000000,0.000000,0.000111,0.030792,0.000000,0.00000,0.000000,0.000575,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.024974,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,0.000000,0.321886,0.000000,0.616628,0.717807,0.00000,0.319061,0.255469,0.000000,0.352463,...,0.000000,0.358624,0.000000,0.000000,0.108059,0.000000,0.625884,1.006539,0.702279,0.393390
446,0.000000,0.000000,0.006785,0.000000,0.000000,0.00000,0.028897,0.016890,0.098915,0.002857,...,0.000000,0.002128,0.012146,0.000000,0.000000,0.000000,0.000000,0.072212,0.001845,0.210701
447,0.000000,0.000000,0.000000,0.015157,0.006941,0.00000,0.050659,0.122132,0.000000,0.012393,...,0.000000,0.089408,0.000000,0.000000,0.032590,0.024254,0.122757,0.000000,0.000000,0.015286
448,3.616217,0.000000,0.000000,0.000000,0.000000,8.37256,0.590597,0.000000,0.000000,0.278259,...,3.664370,2.184824,0.000000,0.556968,0.000000,0.399358,0.105457,0.267088,0.219038,0.126806


In [None]:
#Skipping this because it takes too long
#P = factorizer.transform(users_rating_imputed)
#P = pd.DataFrame(P
#                 ,index= users_rating_imputed.index)

KeyboardInterrupt: 

In [26]:
# Column labels (movie titles) and row labels (user ids) of the imputed ratings matrix.
MOVIES = users_rating_imputed.columns
USERS = users_rating_imputed.index

In [27]:
# Save the list of movie titles that survived the filtering as 'movies.csv' in the current working directory.
pd.DataFrame(MOVIES).to_csv('movies.csv')


In [None]:
# Serialize the fitted NMF model to disk so that the long fit does not have to be repeated.
with open('factorizer_NMF.pkl', 'wb') as file_out:
    pickle.dump(factorizer, file_out)

In [28]:
# Load the pickled NMF model back from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
with open('factorizer_NMF.pkl', 'rb') as file_in:
    fitted_model = pickle.load(file_in)

In [29]:
# Example input from a new user: movie title -> rating given by that user.
user_initial_ratings = {
    '2001: A Space Odyssey (1968)': 5,
    'American Psycho (2000)': 2,
    'Almost Famous (2000)': 3.5
}

In [30]:
# One-row frame for the new user with one column per movie in MOVIES;
# movies the user did not rate are left as NaN.
user_input = pd.DataFrame(user_initial_ratings, index = ['new_user'], columns= MOVIES)
# Preview only the first 25 columns.
user_input.iloc[:,:25]

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Aliens (1986),Almost Famous (2000),Amadeus (1984),"Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",American Beauty (1999),American History X (1998),American Pie (1999),"American President, The (1995)",American Psycho (2000),Anchorman: The Legend of Ron Burgundy (2004)
new_user,,,5,,,,,,,,...,,3.5,,,,,,,2,


user_imput

In [None]:
# Fill the movies the new user did not rate with each movie's mean rating.
user_input_imputed = user_input.fillna(value= users_rating.mean())
# Sanity check: no missing value may remain before the frame is handed to the model.
assert user_input_imputed.isna().sum().sum() == 0
user_input_imputed

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
new_user,3.527778,4.149123,5,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259


In [None]:
# Transform the new user's imputed ratings with the loaded NMF model (returns a numpy array).
P_user = fitted_model.transform(user_input_imputed)
P_user

array([[4.20500790e-01, 4.71397797e-01, 8.47352348e-01, 8.98417962e-02,
        3.57560478e+00, 5.45680018e-01, 5.04302858e-01, 4.26908678e-01,
        1.81062996e-01, 9.08699448e-01, 9.69329277e-02, 1.07787676e-01,
        3.36957962e-01, 3.69728858e-01, 5.47891975e-02, 9.78894194e-02,
        3.06658474e-01, 1.86344959e-01, 7.80408740e-02, 1.37487769e-01,
        1.20439224e-01, 1.02363080e-01, 3.44202805e-01, 2.23299973e-01,
        2.21837172e-02, 1.97055153e-01, 7.22492904e-02, 1.25421734e-01,
        1.12346312e-01, 2.35496483e-01, 2.02960151e-01, 1.33115800e-01,
        1.53337516e-01, 7.05609235e-02, 7.33015962e-02, 3.96714388e-02,
        6.62986166e-02, 1.46859853e-01, 2.83535498e-02, 6.61077729e-02,
        1.20908719e-01, 1.02968419e-02, 1.54714347e-01, 5.31758068e-02,
        1.36356875e-01, 6.54392810e-02, 2.68811741e-01, 1.46318620e-01,
        4.37032627e-02, 6.33942945e-02, 1.61647264e-01, 1.86833626e-02,
        6.18915244e-02, 2.09666711e-01, 1.29002321e-01, 5.117684

In [None]:
# Wrap the transformed array in a one-row DataFrame labelled with the new user.
P_user = pd.DataFrame(P_user, index = ['new_user'])
P_user

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,440,441,442,443,444,445,446,447,448,449
new_user,0.420501,0.471398,0.847352,0.089842,3.575605,0.54568,0.504303,0.426909,0.181063,0.908699,...,0.052411,0.0,0.102459,0.0,0.201066,0.0,0.04142,0.0,0.002145,0.0


In [None]:
# Reconstruct the new user's predicted ratings as (user factors) x (movie factors).
# The movie factors are read from `fitted_model`, the same model that produced
# P_user, so both factor matrices always come from one fit and this cell does not
# depend on the in-memory `factorizer` from the long-running fit cell.
R_user_hat = np.dot(P_user, fitted_model.components_)
R_user_hat = pd.DataFrame(R_user_hat, columns=MOVIES, index=['new_user'])
R_user_hat

title,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
new_user,3.521399,4.154158,4.927308,3.961395,3.701293,3.597298,3.332946,3.423854,3.035459,2.735808,...,3.877531,3.892774,3.914815,3.675806,3.393147,3.73626,3.132983,3.993581,3.831856,3.537366


In [None]:
# Transpose to one row per movie and sort by predicted rating, highest first.
R_user_hat_transposed = R_user_hat.T.sort_values(by='new_user', ascending=False)
R_user_hat_transposed

Unnamed: 0_level_0,new_user
title,Unnamed: 1_level_1
2001: A Space Odyssey (1968),4.927308
"Shawshank Redemption, The (1994)",4.413408
"Godfather, The (1972)",4.318901
Rear Window (1954),4.296811
Fight Club (1999),4.281584
...,...
Judge Dredd (1995),2.670494
City Slickers II: The Legend of Curly's Gold (1994),2.635329
Coneheads (1993),2.411916
American Psycho (2000),2.357011


In [None]:
# Titles of the movies the new user has already rated.
user_initial_ratings_list = list(user_initial_ratings.keys())
user_initial_ratings_list

['2001: A Space Odyssey (1968)',
 'American Psycho (2000)',
 'Almost Famous (2000)']

In [None]:
# All movie titles, ordered from highest to lowest predicted rating.
recommendables = list(R_user_hat_transposed.index)
recommendables

['2001: A Space Odyssey (1968)',
 'Shawshank Redemption, The (1994)',
 'Godfather, The (1972)',
 'Rear Window (1954)',
 'Fight Club (1999)',
 'Cool Hand Luke (1967)',
 'Godfather: Part II, The (1974)',
 'Goodfellas (1990)',
 'Casablanca (1942)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Reservoir Dogs (1992)',
 "Schindler's List (1993)",
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
 'Chinatown (1974)',
 'American History X (1998)',
 'Departed, The (2006)',
 'Apocalypse Now (1979)',
 'Matrix, The (1999)',
 'Dark Knight, The (2008)',
 'Usual Suspects, The (1995)',
 'Princess Bride, The (1987)',
 'Amadeus (1984)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Pulp Fiction (1994)',
 'Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)',
 'Monty Python and the Holy Grail (1975)',
 'North by 

In [None]:
# Keep the predicted-rating order, but leave out the titles the user has already rated.
user_recommendations = [
    title
    for title in recommendables
    if title not in user_initial_ratings_list
]
user_recommendations

['Shawshank Redemption, The (1994)',
 'Godfather, The (1972)',
 'Rear Window (1954)',
 'Fight Club (1999)',
 'Cool Hand Luke (1967)',
 'Godfather: Part II, The (1974)',
 'Goodfellas (1990)',
 'Casablanca (1942)',
 'Star Wars: Episode IV - A New Hope (1977)',
 'Reservoir Dogs (1992)',
 "Schindler's List (1993)",
 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)',
 'Chinatown (1974)',
 'American History X (1998)',
 'Departed, The (2006)',
 'Apocalypse Now (1979)',
 'Matrix, The (1999)',
 'Dark Knight, The (2008)',
 'Usual Suspects, The (1995)',
 'Princess Bride, The (1987)',
 'Amadeus (1984)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
 'Pulp Fiction (1994)',
 'Good, the Bad and the Ugly, The (Buono, il brutto, il cattivo, Il) (1966)',
 'Monty Python and the Holy Grail (1975)',
 'North by Northwest (1959)',
 'Forrest Gump

In [1]:
import pandas as pd
import numpy as np
import pickle
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.decomposition import NMF

In [2]:
# Example query from a new user: movie title -> rating given by that user.
user_query = {
    '2001: A Space Odyssey (1968)': 5,
    'American Psycho (2000)': 2,
    'Almost Famous (2000)': 3.5
}

In [3]:
# Pull the recommender functions, the ratings matrix and the NMF model from the project's recommender module.
from recommender import NMF_recommender, random_recommender, RATINGS, NMF_model
# Alias the imported ratings matrix, then display it.
ratings_matrix = RATINGS
RATINGS

Unnamed: 0_level_0,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,4.000000,3.040373,2.727273,...,5.00000,5.000000,3.916667,5.000000,3.355769,3.723684,3.12,5.000000,3.877358,3.509259
2,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,5.000000,3.699248,3.355769,3.723684,3.12,3.992754,3.000000,3.509259
3,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,0.50,3.992754,3.877358,3.509259
4,3.527778,5.000000,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,4.00000,5.000000,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259
5,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.000000,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.527778,4.149123,5.000000,3.974138,3.68125,3.547297,3.500000,3.475806,3.040373,2.000000,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.50,3.500000,3.877358,3.509259
607,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,5.000000,3.916667,3.000000,3.355769,3.723684,3.12,3.992754,3.877358,3.509259
608,3.527778,4.149123,3.000000,3.500000,5.00000,3.547297,4.500000,3.000000,3.500000,2.000000,...,3.50000,2.500000,3.916667,4.000000,4.000000,4.000000,3.12,3.992754,3.877358,3.000000
609,3.527778,4.149123,3.894495,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259


In [4]:
# Build the new user's one-row query frame over every movie column, then fill
# the movies the user did not rate with the per-movie mean of the ratings matrix.
MOVIES = ratings_matrix.columns
user_query_list = list(user_query)

imputed_query = (
    pd.DataFrame(user_query, index=['new_user'], columns=MOVIES)
    .fillna(value=ratings_matrix.mean())
)
imputed_query

Unnamed: 0,10 Things I Hate About You (1999),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",A.I. Artificial Intelligence (2001),"Abyss, The (1989)",Ace Ventura: Pet Detective (1994),Ace Ventura: When Nature Calls (1995),...,Willy Wonka & the Chocolate Factory (1971),"Wizard of Oz, The (1939)","Wolf of Wall Street, The (2013)",X-Men (2000),X-Men: The Last Stand (2006),X2: X-Men United (2003),You've Got Mail (1998),Young Frankenstein (1974),Zombieland (2009),Zoolander (2001)
new_user,3.527778,4.149123,5,3.974138,3.68125,3.547297,3.339286,3.475806,3.040373,2.727273,...,3.87395,3.880435,3.916667,3.699248,3.355769,3.723684,3.12,3.992754,3.877358,3.509259


In [5]:
# Ask the NMF recommender for recommendations for this query.
# NOTE(review): the last argument is presumably the number of titles to return
# (the output lists 5) -- confirm against recommender.py.
NMF_recommender(user_query, RATINGS, NMF_model, 5)

['Shawshank Redemption, The (1994)',
 'Godfather, The (1972)',
 'Rear Window (1954)',
 'Fight Club (1999)',
 'Cool Hand Luke (1967)']