# What do I actually do once I've got a trained NMF model?

In [1]:
import pickle
import pandas as pd
from sklearn.decomposition import NMF
from fuzzywuzzy import process
import numpy as np
from sqlalchemy import create_engine

In [2]:
df = pd.read_csv('/Users/maximcondon/Desktop/Spiced/06_Week_6/peppermint_movies - Sheet1.csv')
df.head()

Unnamed: 0,Reviewer,Movie,Rating
0,Nedra,IT,2
1,Paul,IT,4
2,Nedra,Titanic,2
3,Michael,Guesthouse Paradiso,5
4,Michael,Toxic Avenger,2


In [3]:
R = df.set_index(['Reviewer', 'Movie']).unstack(1)
#df.pivot(index='Reviewer', columns='Movie', values='Rating')

## First Feature Engineering Challenge!
- What to do about NaNs?

In [4]:
# Movies a reviewer has not rated are imputed with 3.0, the midpoint of the
# 1-5 ratings seen in the data, because NMF cannot be fitted on NaNs.
# Re-assigning (rather than inplace=True) keeps the step chainable and avoids
# silently mutating an object that an earlier cell may still reference.
R = R.fillna(3.0)

In [5]:
R

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Movie,A Star Is Born,Argo,Bohemian Rhapsody,Das Leben der Anderen,Dirty Dancing,Dora the Explorer,Guesthouse Paradiso,Harry Potter and the Sorcerer's Stone,High School Musical 4,IT,...,Power Rangers,Princess Diaries,Roma,Schindlers List,Shawshank Redemption,Titanic,Toxic Avenger,Trainspotting,Who am I,Zootopia
Reviewer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Anders,3.0,3.0,3.0,3.0,5.0,3.0,3.0,3.0,5.0,3.0,...,3.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
Josh,4.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0
Karl,3.0,5.0,4.0,4.0,1.0,3.0,3.0,2.0,3.0,3.0,...,1.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,3.0
Max,3.0,3.0,3.0,3.0,3.0,5.0,1.0,3.0,3.0,3.0,...,5.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
Michael,3.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,1.0,2.0,3.0,5.0,3.0
Nedra,3.0,3.0,5.0,4.0,3.0,3.0,3.0,3.0,3.0,2.0,...,3.0,3.0,3.0,5.0,3.0,2.0,3.0,4.0,3.0,3.0
Paul,4.0,5.0,5.0,5.0,3.0,2.0,3.0,4.0,2.0,4.0,...,3.0,3.0,1.0,3.0,5.0,5.0,2.0,4.0,3.0,3.0


Now we create a variable which is an instance of this NMF class, and we get a bunch of **hyperparameters we can look at**
- The most important is **n_components**, tells you how many **hidden features you're going to get out**!

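Fitting the model tells us something about the structure of our data: behind the scenes, NMF factorises the ratings matrix R into two smaller matrices, P (users × hidden features) and Q (hidden features × movies).
- Remember, multiplying P by Q gives back an approximation of R!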

e.g. Choose 3, maybe there's 3 components in my data? We don't know it's just a guess! Component 1 might be how Action-y it is, 2 how popular, 3 how Dramatic...

In [9]:
nmf = NMF(n_components=3)  # nmf plays the same role as the usual `m` / `model` variable; 3 hidden features is just a guess
# NOTE(review): random_state is not passed here, so the fitted factors may differ between runs - consider fixing a seed

In [10]:
nmf.fit(R)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=3, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

### Going to use pickle to store our model! It serialises the model into a binary file

- If we read / open an NMF model that has already been trained, then we speed up the code!

In [11]:
# This encodes your nmf model to reuse again
binary = pickle.dumps(nmf)

In [12]:
# .bin is a conventional extension for a binary file.
# The `with` block closes the file handle as soon as the write finishes, so the
# bytes are flushed to disk instead of waiting for garbage collection.
with open('trained_nmf_model.bin', 'wb') as model_file:
    bytes_written = model_file.write(binary)
bytes_written  # cell output: the number of bytes written to the file

1120

### Sometime later in the future...

In [13]:
# Read the pickled model back; `with` guarantees the file handle is closed.
with open('trained_nmf_model.bin', 'rb') as model_file:
    binary = model_file.read()

In [14]:
nmf = pickle.loads(binary)

In [15]:
nmf

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=3, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

### Pros/Cons of Pickle

#### Pros:
- You can store ANY Python object as a binary file!
- 2 lines of code! Write, then read!

#### Cons:
- Can only be read by Python
- Sensitive to different versions

## Using the NMF model

### Fuzzywuzzy - Checks for spelling errors and brings up similar title names 
- Ensures that when people mistype their movie entries there isn't an issue!

In [16]:
R.head()

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Movie,A Star Is Born,Argo,Bohemian Rhapsody,Das Leben der Anderen,Dirty Dancing,Dora the Explorer,Guesthouse Paradiso,Harry Potter and the Sorcerer's Stone,High School Musical 4,IT,...,Power Rangers,Princess Diaries,Roma,Schindlers List,Shawshank Redemption,Titanic,Toxic Avenger,Trainspotting,Who am I,Zootopia
Reviewer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Anders,3.0,3.0,3.0,3.0,5.0,3.0,3.0,3.0,5.0,3.0,...,3.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
Josh,4.0,3.0,3.0,3.0,2.0,3.0,3.0,2.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0
Karl,3.0,5.0,4.0,4.0,1.0,3.0,3.0,2.0,3.0,3.0,...,1.0,3.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,3.0
Max,3.0,3.0,3.0,3.0,3.0,5.0,1.0,3.0,3.0,3.0,...,5.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
Michael,3.0,3.0,3.0,3.0,3.0,3.0,5.0,3.0,3.0,3.0,...,3.0,3.0,3.0,3.0,3.0,1.0,2.0,3.0,5.0,3.0


In [17]:
movie_titles = list(R['Rating'].columns)

In [18]:
user_input = 'bohemian rap city'

In [19]:
process.extract(user_input, movie_titles)

[('Bohemian Rhapsody', 76),
 ('IT', 60),
 ('Titanic', 49),
 ('Argo', 45),
 ('Roma', 45)]

In [20]:
process.extract(user_input, movie_titles)[0][0]

'Bohemian Rhapsody'

In [21]:
user_input = input('')

my butt is worn


In [22]:
process.extract(user_input, movie_titles)[0][0]

'A Star Is Born'

### This is one of several solutions to get around the user input problem!
- Another option would be a dropdown menu
- Another option would be to give them a picture of a movie poster and they give the rating
- Another option would be to give them 5 movies randomly from our database to score

## Using our pre-trained model to output a prediction based on user input

In [24]:
R['Rating'].loc['Paul'].values

array([4., 5., 5., 5., 3., 2., 3., 4., 2., 4., 3., 3., 3., 3., 1., 3., 5.,
       5., 2., 4., 3., 3.])

In [25]:
R.loc['Paul']

        Movie                                
Rating  A Star Is Born                           4.0
        Argo                                     5.0
        Bohemian Rhapsody                        5.0
        Das Leben der Anderen                    5.0
        Dirty Dancing                            3.0
        Dora the Explorer                        2.0
        Guesthouse Paradiso                      3.0
        Harry Potter and the Sorcerer's Stone    4.0
        High School Musical 4                    2.0
        IT                                       4.0
        Inside Out                               3.0
        Murder on the Orient Express             3.0
        Power Rangers                            3.0
        Princess Diaries                         3.0
        Roma                                     1.0
        Schindlers List                          3.0
        Shawshank Redemption                     5.0
        Titanic                                  5.0


In [26]:
new_user_input = {'Shawshank Redemption':4, 'Power Rangers': 1, 'Dirty Dancing':5}

#### But, we need it in the form:

[NAN, NAN, NAN, NAN, 5, NAN, ... 1, NAN, NAN, NAN, 4, ..., NAN] 

### Feature Engineering Challenge 2:
- Taking the user input and creating a corresponding array out of it!

The idea revolves around taking a list and filling it with NaNs to match the number of films in our list.

e.g. Here we have everything in alphabetical order which helps a lot! In our case, we have a numerically ascending movieid!

In [27]:
import numpy as np

In [28]:
naans = [np.nan] * len(movie_titles)

In [29]:
d = dict(zip(movie_titles, naans))

In [30]:
# Overlay the new user's ratings on the all-NaN template. Using the dict
# defined above (instead of re-typing each title and rating) keeps the two
# from drifting apart when the input changes.
d.update(new_user_input)

In [31]:
d

{'A Star Is Born': nan,
 'Argo': nan,
 'Bohemian Rhapsody': nan,
 'Das Leben der Anderen': nan,
 'Dirty Dancing': 5,
 'Dora the Explorer': nan,
 'Guesthouse Paradiso': nan,
 "Harry Potter and the Sorcerer's Stone": nan,
 'High School Musical 4': nan,
 'IT': nan,
 'Inside Out': nan,
 'Murder on the Orient Express': nan,
 'Power Rangers': 1,
 'Princess Diaries': nan,
 'Roma': nan,
 'Schindlers List': nan,
 'Shawshank Redemption': 4,
 'Titanic': nan,
 'Toxic Avenger': nan,
 'Trainspotting': nan,
 'Who am I': nan,
 'Zootopia': nan}

#### But our model is expecting a list - well actually a numpy array so we can convert this back into np.array!

In [32]:
new_user_profile = list(d.values())

In [33]:
new_user_profile[:10]

[nan, nan, nan, nan, 5, nan, nan, nan, nan, nan]

In [34]:
new_user_profile = pd.DataFrame(new_user_profile, index=movie_titles).transpose()

In [35]:
new_user_profile

Unnamed: 0,A Star Is Born,Argo,Bohemian Rhapsody,Das Leben der Anderen,Dirty Dancing,Dora the Explorer,Guesthouse Paradiso,Harry Potter and the Sorcerer's Stone,High School Musical 4,IT,...,Power Rangers,Princess Diaries,Roma,Schindlers List,Shawshank Redemption,Titanic,Toxic Avenger,Trainspotting,Who am I,Zootopia
0,,,,,5.0,,,,,,...,1.0,,,,4.0,,,,,


In [36]:
new_user_profile_filled = new_user_profile.fillna(3.0)
new_user_profile_filled

Unnamed: 0,A Star Is Born,Argo,Bohemian Rhapsody,Das Leben der Anderen,Dirty Dancing,Dora the Explorer,Guesthouse Paradiso,Harry Potter and the Sorcerer's Stone,High School Musical 4,IT,...,Power Rangers,Princess Diaries,Roma,Schindlers List,Shawshank Redemption,Titanic,Toxic Avenger,Trainspotting,Who am I,Zootopia
0,3.0,3.0,3.0,3.0,5.0,3.0,3.0,3.0,3.0,3.0,...,1.0,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0


In [37]:
P = nmf.transform(new_user_profile_filled) # User-genre matrix
P # How much the new user is 'into' each hidden component

array([[1.73548833, 0.95096622, 1.14208395]])

In [38]:
P.shape

(1, 3)

#### These are the 3 n_components from earlier - the hidden features!

- 22 is the number of movies (one column per movie)!

In [39]:
nmf.components_.shape

(3, 22)

In [40]:
# This is our Q matrix - movie-genre matrix
# Wouldn't over analyse this - there isn't much you can glean!
Q = nmf.components_ # Movie-genre matrix
Q

array([[9.95235417e-01, 7.45106234e-01, 8.65355698e-01, 8.55686981e-01,
        1.44139526e+00, 1.40554865e+00, 4.75785665e-01, 1.00809331e+00,
        1.28787632e+00, 9.41539629e-01, 1.04354054e+00, 9.55486400e-01,
        1.60309466e+00, 1.50649036e+00, 9.64042344e-01, 9.74823834e-01,
        9.46930293e-01, 9.70584262e-01, 9.76722722e-01, 7.59053004e-01,
        9.21569862e-01, 9.11459330e-01],
       [6.54411554e-01, 1.10255080e+00, 1.01333380e+00, 1.00040671e+00,
        4.24439502e-02, 1.00524526e-01, 4.33325474e-01, 5.39137471e-01,
        5.45894588e-02, 6.41383010e-01, 4.36414685e-01, 4.36211791e-01,
        8.84804642e-02, 2.13893299e-01, 1.51591716e-05, 4.62065985e-01,
        8.72408422e-01, 9.79905506e-01, 2.78351110e-01, 8.97379581e-01,
        3.15736524e-01, 4.36110345e-01],
       [6.61818985e-01, 8.07308430e-01, 8.54971414e-01, 7.58048602e-01,
        3.04430108e-01, 5.08308650e-01, 1.41639548e+00, 4.12118784e-01,
        7.43671221e-01, 5.13137393e-01, 8.08694818e-01

### P has a shape of (1, 3) and Q has a shape of (3, 22); therefore, as R ≈ P·Q (a matrix product), our resulting matrix will have a shape of (1, 22)

In [41]:
ypred = np.dot(P, Q)
ypred

array([[3.10539558, 3.26362575, 3.44191007, 3.3021429 , 2.88957215,
        3.11543986, 2.85544091, 2.73290994, 3.13634202, 2.8300106 ,
        3.14966543, 2.87842981, 3.0042878 , 3.39186899, 2.69573375,
        3.15796414, 3.06112573, 2.6162947 , 2.63439729, 3.31204495,
        3.18379893, 2.742812  ]])

In [42]:
ypred.shape

(1, 22)

#### This is the model's predicted rating that this new user would give to each movie, BASED on the model's understanding of the training data!

In [43]:
pd.DataFrame(data= ypred[0], index=movie_titles).sort_values(by=0, ascending=False).head()

Unnamed: 0,0
Bohemian Rhapsody,3.44191
Princess Diaries,3.391869
Trainspotting,3.312045
Das Leben der Anderen,3.302143
Argo,3.263626


In [44]:
recommendations = list(pd.DataFrame(data= ypred[0], index=movie_titles).sort_values(by=0, ascending=False).head().index)
recommendations

['Bohemian Rhapsody',
 'Princess Diaries',
 'Trainspotting',
 'Das Leben der Anderen',
 'Argo']

### Now, take these ideas and package it all into a function!

In [45]:
df = pd.read_csv('/Users/maximcondon/Desktop/Spiced/06_Week_6/peppermint_movies - Sheet1.csv')

In [46]:
df.head(5)

Unnamed: 0,Reviewer,Movie,Rating
0,Nedra,IT,2
1,Paul,IT,4
2,Nedra,Titanic,2
3,Michael,Guesthouse Paradiso,5
4,Michael,Toxic Avenger,2


#### A function to train and pickle our NMF model from df:
- remember we've already done this, so we can just use the existing `nmf`

In [47]:
# def pickle_df(df):
    
#     R = df.set_index(['Reviewer', 'Movie']).unstack(1)
#     #makes our table into a matrix

#     #What to do about NaNs?

#     R.fillna(3.0, inplace=True) # fills NaNs
    
#     nmf = NMF(n_components=3) #sets our NMF model
    
#     nmf.fit(R) # Fits our dataframe R to the model
    
#     binary = pickle.dumps(nmf) # Encodes your nmf model to reuse again
 
#     open('trained_nmf_model.bin', 'wb').write(binary)
#     # opens a file and writes binary into it

#     binary = open('trained_nmf_model.bin', 'rb').read()
#     # reads the binary file we saved earlier
    
#     nmf = pickle.loads(binary) # restores the already-fitted model (unlike a fresh NMF(n_components=3), which would still need fitting)
    
#     return nmf

In [48]:
nmf

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
  n_components=3, random_state=None, shuffle=False, solver='cd',
  tol=0.0001, verbose=0)

In [115]:
def clean_text(df, movie_input, rating):
    """Map a free-text movie name onto the closest title present in ``df``.

    The candidate titles are the columns under 'Rating' after pivoting ``df``
    into a Reviewer x Movie table. fuzzywuzzy ranks those candidates against
    ``movie_input`` and the best-ranked title is returned together with
    ``rating``, which is passed through unchanged.
    """
    ratings_wide = df.set_index(['Reviewer', 'Movie']).unstack(1)
    known_titles = list(ratings_wide['Rating'].columns)

    # process.extract returns (title, score) pairs, best match first
    ranked_matches = process.extract(movie_input, known_titles)
    top_hit = ranked_matches[0]

    return top_hit[0], rating

In [116]:
clean_text(df, 'glasshouse paraplegic', 5)

('Guesthouse Paradiso', 5)

In [None]:
    
    def get_recommendations(model, new_user_input, titles=None, fill_value=3.0, top_n=5):
        """Return the ``top_n`` movie titles with the highest predicted rating.

        Parameters
        ----------
        model : fitted estimator exposing ``transform`` and ``components_``
            For example the NMF model loaded back from the pickle file.
        new_user_input : dict
            Maps exact movie titles to the rating the new user gave them,
            e.g. ``{'Shawshank Redemption': 4, 'Power Rangers': 1}``.
        titles : list of str, optional
            Every movie title, which must be in the same order as the columns
            the model was fitted on. Defaults to the notebook-level
            ``movie_titles``.
        fill_value : float, default 3.0
            Rating substituted for every movie the user has not rated.
        top_n : int, default 5
            Number of titles to return.

        Returns
        -------
        list of str
            Titles sorted by predicted rating, highest first.

        Raises
        ------
        KeyError
            If ``new_user_input`` contains a title that is not in ``titles``.
        """
        if titles is None:
            titles = movie_titles  # notebook-level list built from R's columns

        # An unknown title would silently add an extra column and break the
        # shape the model expects, so reject it up front.
        unknown = [title for title in new_user_input if title not in titles]
        if unknown:
            raise KeyError('Unknown movie title(s): {}'.format(unknown))

        # Start from an all-NaN profile and overlay the ratings we actually have
        profile = dict.fromkeys(titles, np.nan)
        profile.update(new_user_input)

        # One-row DataFrame (columns = titles) so the gaps can be filled
        new_user_profile = pd.DataFrame(list(profile.values()), index=titles).transpose()
        new_user_profile_filled = new_user_profile.fillna(fill_value)

        P = model.transform(new_user_profile_filled)  # user-genre matrix
        Q = model.components_  # movie-genre matrix
        ypred = np.dot(P, Q)  # predicted rating for every movie

        ranked = pd.DataFrame(data=ypred[0], index=titles).sort_values(by=0, ascending=False)
        return list(ranked.head(top_n).index)

In [None]:
# binary = open('trained_nmf_model.bin', 'rb').read()

# model = pickle.loads(binary)

# recommendations = get_recommendations(model, {'Shawshank Redemption': 4, ...})

In [147]:
new_user_input = {'Shawshank Redemption':4, 'Power Rangers': 1, 'Dirty Dancing':5}

In [136]:
movies_input = list(new_user_input.keys())
movies_input

['Shawshank Redemption', 'Power Rangers', 'Dirty Dancing']

In [137]:
ratings_input = list(new_user_input.values())
ratings_input

[4, 1, 5]

In [141]:
R.head(1)

Unnamed: 0_level_0,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating,Rating
Movie,A Star Is Born,Argo,Bohemian Rhapsody,Das Leben der Anderen,Dirty Dancing,Dora the Explorer,Guesthouse Paradiso,Harry Potter and the Sorcerer's Stone,High School Musical 4,IT,...,Power Rangers,Princess Diaries,Roma,Schindlers List,Shawshank Redemption,Titanic,Toxic Avenger,Trainspotting,Who am I,Zootopia
Reviewer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Anders,3.0,3.0,3.0,3.0,5.0,3.0,3.0,3.0,5.0,3.0,...,3.0,5.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0


In [145]:
def test(df, new_user_input, model=None, fill_value=3.0, n_recommendations=5):
    """Recommend movies for a new user from a handful of (possibly misspelled) ratings.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format ratings with 'Reviewer', 'Movie' and 'Rating' columns.
    new_user_input : dict
        Maps a movie name as typed by the user to the rating they gave it.
        Each name is fuzzy-matched against the titles found in ``df``.
    model : fitted estimator exposing ``transform`` and ``components_``, optional
        Defaults to the notebook-level ``nmf``.
    fill_value : float, default 3.0
        Rating substituted for every movie the user has not rated.
    n_recommendations : int, default 5
        Number of titles to return.

    Returns
    -------
    list of str
        Titles sorted by predicted rating, highest first.
    """
    if model is None:
        model = nmf  # fall back to the model trained / unpickled earlier in the notebook

    ratings_wide = df.set_index(['Reviewer', 'Movie']).unstack(1)
    movie_titles = list(ratings_wide['Rating'].columns)

    # All-NaN profile, one slot per known movie
    user_ratings = pd.Series(np.nan, index=movie_titles, dtype=float)

    # Match every typed title separately and store ITS OWN rating. Passing the
    # whole list to fuzzywuzzy at once, or always writing the first rating,
    # would lose all but one of the user's inputs.
    for typed_title, rating in new_user_input.items():
        matches = process.extract(typed_title, movie_titles, limit=1)
        if not matches:
            continue  # no known titles to match against
        user_ratings[matches[0][0]] = rating

    # One-row DataFrame (columns = titles) with the gaps filled in
    new_user_profile = user_ratings.fillna(fill_value).to_frame().transpose()

    P = model.transform(new_user_profile)  # user-genre matrix
    Q = model.components_  # movie-genre matrix
    ypred = np.dot(P, Q)  # predicted rating for every movie

    ranked = pd.Series(ypred[0], index=movie_titles).sort_values(ascending=False)
    return list(ranked.head(n_recommendations).index)