In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF 
import pickle




In [2]:
datapath_movies = "./data/ml-latest-small/movies.csv"
datapath_ratings = "./data/ml-latest-small/ratings.csv"
model_path = '/Users/max/spiced/binomial-baharat-student-code/week_10/models/model_mnf'


In [3]:
def load_movies(movies_path:str):
    movies = pd.read_csv(datapath_movies)
    movies['movieId']=movies['movieId'].astype(str)
    return movies 

def load_data(ratings_path:str, movies_path:str):
    """
    load both csv files with with the user ratings and the movies
    both files need a column with the moveId
    """
    movies = load_movies(movies_path)
    ratings = pd.read_csv(datapath_ratings)
    ratings['movieId']=ratings['movieId'].astype(str)
    combined_df = ratings.join(other=movies.set_index('movieId'), on='movieId',how='left')
    return combined_df

def get_initial_rating_df(combined_df:pd.core.frame.DataFrame):
        return combined_df.pivot_table(index='userId',columns='movieId',values='rating')

def calcuate_r_matrix(combined_df:pd.core.frame.DataFrame):
    '''
    takes the combined dataframe from ratings and movies
    returns a pivot table: UserID vs movidID with the ratings in the matix
    and returns the mean ratings 4 later user
    fills the missing values with the average value
    '''
    means = combined_df.mean()
    r_matrix = combined_df.fillna(means)

    return r_matrix, means

def create_nmf_model(n_components:int,r_matrix:pd.core.frame.DataFrame):
        '''
        creates a NMF model object based on the number of components and r dataframe
        '''
        nmf_model = NMF(n_components=n_components,max_iter=1000)
        nmf_model.fit(r_matrix)
        return nmf_model

def get_Q_and_P_matrix (nmf_model:NMF,r_matrix:pd.core.frame.DataFrame):
        '''
        returns the Q and the P matrix based on a NMF model and r dataframe
        '''
        Q_matrix = nmf_model.components_
        Q_matrix = pd.DataFrame(data=Q_matrix,
                columns=nmf_model.feature_names_in_,
                index= nmf_model.get_feature_names_out())
        P_matrix = nmf_model.transform(r_matrix)
        P_matrix = pd.DataFrame(data=P_matrix,
                columns=nmf_model.get_feature_names_out(),
                index = r_matrix.index)
        return Q_matrix,P_matrix
        
def get_best_components(r_matrix:pd.core.frame.DataFrame,max_components:int):
        '''
        findes the number of componets with the smalles reconstruction error
        need a r Dataframe and the the maximum number of components you want to search for
        returns a integer for the componets with the smallest error
        '''
        components = pd.DataFrame()
        for i in np.linspace(1, max_components, num=max_components):
                model = create_nmf_model(n_components=int(i),r_matrix=r_matrix)
                components = pd.concat([components,
                pd.DataFrame({'components' : [int(i)],'error':[model.reconstruction_err_]})])
        components.set_index('components').plot()
        display(components[components['error']==components['error'].min()])
        return components[components['error']==components['error'].min()]['components'].values[0]
        
def get_r_predtion(P_matrix:pd.core.frame.DataFrame,Q_matrix:pd.core.frame.DataFrame):
        '''
        returns the reconsturced matrix from the P and Q matrix
        '''
        r_hat_matrix = np.dot(P_matrix,Q_matrix)
        return  pd.DataFrame(data=r_hat_matrix,
                columns=Q_matrix.columns,
                index = P_matrix.index)


def save_NMF_and_imputed(nmf_model:NMF, imputed_values:pd.core.series.Series, q_matrix:pd.core.frame.DataFrame,p_matrix:pd.core.frame.DataFrame, path_no_ending:str):
        '''
        saves the model,imputed values,P and Q matrix for later use
        the file path should be without file ending - just the base file
        {pathfile}_imputed.pkl {pathfile}_model.pkl {pathfile}_Q_matrix.pkl {pathfile}_P_matrix.pkl
        '''
        with open(f'{path_no_ending}_q_matrix.pkl',mode='wb') as file:
                pickle.dump(q_matrix,file)
        with open(f'{path_no_ending}_p_matrix.pkl',mode='wb') as file:
                pickle.dump(p_matrix,file)
        with open(f'{path_no_ending}_imputed.pkl',mode='wb') as file:
                pickle.dump(imputed_values,file)
        with open(f'{path_no_ending}_model.pkl',mode='wb') as file:
                pickle.dump(nmf_model,file)

def load_NMF_and_imputed(pathfile:str):
        '''
        loads a NMF model, the imputed values and the Q and P matrix for it
        give the path without the file ending
        {pathfile}_imputed.pkl {pathfile}_model.pkl {pathfile}_Q_matrix.pkl {pathfile}_P_matrix.pkl
        '''
        with open(f'{pathfile}_q_matrix.pkl','rb') as file:
                q_matrix = pickle.load(file)
        with open(f'{pathfile}_p_matrix.pkl','rb') as file:
                p_matrix = pickle.load(file)
        with open(f'{pathfile}_model.pkl','rb') as file:
                model = pickle.load(file)
        with open(f'{pathfile}_imputed.pkl','rb') as file:
                imputed = pickle.load(file)
        return model, imputed, q_matrix, p_matrix

def recommend_nmf_new_user(user_query:dict,q_matrix:pd.core.frame.DataFrame,imputed_values:pd.core.series.Series, nmf_model=NMF):
    '''
    predicts values for a new user based thier input dictionary
    needs the q_matrix to the used model and the imputed values 
    returns a DF for the user
    '''
    new_user_dataframe =  pd.DataFrame(data=user_query,
                columns=q_matrix.columns,
                index = ['new_user'])
    new_user_dataframe_imputed = new_user_dataframe.fillna(imputed_values)
    P_new_user_matrix = nmf_model.transform(new_user_dataframe_imputed)
    P_new_user = pd.DataFrame(data=P_new_user_matrix,
                            columns=nmf_model.get_feature_names_out(),
                            index = ['new_user'])
    R_hat_new_user_matrix = np.dot(P_new_user_matrix,q_matrix)
    R_hat_new_user = pd.DataFrame(data=R_hat_new_user_matrix,
                            columns=nmf_model.feature_names_in_,
                            index = ['new_user'])
    return R_hat_new_user

def get_top_rated_movies (new_user_r_matrix:pd.core.frame.DataFrame,n_top:int = 4):
    '''returns the top n number for the new user matrix'''
    return new_user_r_matrix.transpose().sort_values(by=['new_user'],ascending=False).sort_index().head(n_top)


def get_ID_to_title(movie_df:pd.core.frame.DataFrame, movieIds:list):
    '''
    returns a the title from from IDs if they exist
    needs a DF with the a ID and a title
    '''
    titles = [movie_df[movie_df['movieId']==id]['title'].values[0] for id in movieIds]
    return titles

def get_title_to_ID(movie_df:pd.core.frame.DataFrame, movieTitles:list): ### needs to be fuzzy
    '''
    returns a the IDs from titels if they exist
    needs a DF with the a ID and a title
    '''
    Ids = [movie_df[movie_df['title']==title]['movieId'].values[0] for title in movieTitles]
    return Ids

In [4]:
combined_df = load_data(ratings_path=datapath_ratings,movies_path=datapath_movies)

In [5]:
combined_df = get_initial_rating_df(combined_df)

In [6]:
combined_df

movieId,1,10,100,100044,100068,100083,100106,100159,100163,100194,...,99750,99764,998,99813,99846,99853,999,99910,99917,99992
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,4.0,,,,,,,,,...,,,,,,,,,,
609,3.0,4.0,,,,,,,,,...,,,,,,,,,,


In [7]:
r_matrix, mean_ratings = calcuate_r_matrix(combined_df=combined_df)

In [8]:
#best_components = get_best_components(r_matrix=r_matrix,max_components=60) # for 60 components it takes around 10 mins
best_components = 43


In [9]:
model = create_nmf_model(r_matrix=r_matrix,n_components=best_components)

In [10]:
Q_matrix, P_matrix = get_Q_and_P_matrix(nmf_model=model,r_matrix=r_matrix)



In [11]:
get_r_predtion(P_matrix=P_matrix,Q_matrix=Q_matrix)

Unnamed: 0_level_0,1,10,100,100044,100068,100083,100106,100159,100163,100194,...,99750,99764,998,99813,99846,99853,999,99910,99917,99992
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.000315,3.423037,2.789907,3.996172,3.500712,3.429009,3.500712,4.491612,2.962933,4.491612,...,3.500712,4.219729,3.169355,3.973989,3.001114,3.996172,3.129385,3.219931,3.171578,3.001114
2,4.213155,3.406160,2.788441,3.998719,3.500384,3.509218,3.500384,4.496769,2.908130,4.496769,...,3.500384,4.240239,3.003821,3.930058,3.000480,3.998719,3.039183,3.251594,3.171969,3.000480
3,4.195387,3.407285,2.774562,4.001336,3.505099,3.501383,3.505099,4.497589,2.916674,4.497589,...,3.505099,4.259561,2.999885,3.932470,3.007133,4.001336,3.035821,3.247031,3.163694,3.007133
4,4.073151,3.443561,2.807071,3.981765,3.486090,3.477461,3.486090,4.476828,2.931812,4.476828,...,3.486090,4.250293,3.069485,3.981718,2.989384,3.981765,3.082595,3.269027,3.221545,2.989384
5,4.230254,3.350321,2.788069,3.998064,3.500455,3.509836,3.500455,4.495279,2.914274,4.495279,...,3.500455,4.240799,3.011849,3.925095,3.001292,3.998064,3.030078,3.254278,3.172363,3.001292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.809130,3.130797,2.841693,3.988305,3.494275,3.466318,3.494275,4.482422,2.923382,4.482422,...,3.494275,4.229973,2.973260,3.931209,2.998466,3.988305,2.971212,3.246054,3.193238,2.998466
607,4.501505,3.506154,2.807444,4.001025,3.503136,3.472573,3.503136,4.498604,2.925166,4.498604,...,3.503136,4.252463,2.971391,3.960600,3.004170,4.001025,3.049585,3.258080,3.190175,3.004170
608,4.057633,3.084503,2.802397,3.998760,3.510195,3.442075,3.510195,4.487827,2.785807,4.487827,...,3.510195,4.188746,2.840547,3.877440,3.014829,3.998760,2.967726,3.279878,3.204442,3.014829
609,4.110594,3.410181,2.785288,3.999288,3.500481,3.511807,3.500481,4.497650,2.911542,4.497650,...,3.500481,4.238179,3.007503,3.939351,3.000167,3.999288,3.049017,3.247593,3.174210,3.000167


In [12]:
save_NMF_and_imputed(nmf_model=model,imputed_values=mean_ratings,q_matrix=Q_matrix,p_matrix=P_matrix,
path_no_ending=model_path)

In [13]:
model,imputed_values,Q_matrix,P_matrix  =load_NMF_and_imputed(pathfile=model_path)

In [14]:
new_user_query = {"The Lords of the Rings": 5,
                 "Avatar":2,
                 "Night on Earth":3.5}

In [15]:
new_user_query = {"1": 5,
                 "2":2,
                 "3":3.5}

In [16]:
r_new_user = recommend_nmf_new_user(user_query=new_user_query,q_matrix=Q_matrix,imputed_values=imputed_values,nmf_model=model)



In [17]:
top10 = get_top_rated_movies(r_new_user,n_top=10)

In [18]:
movie = load_movies(datapath_movies)

In [19]:
movietitles = get_ID_to_title(movie_df=movie,movieIds=top10.index)
movietitles

['Toy Story (1995)',
 'GoldenEye (1995)',
 'City Hall (1996)',
 'Human Planet (2011)',
 'Comme un chef (2012)',
 'Movie 43 (2013)',
 "Pervert's Guide to Ideology, The (2012)",
 'Sightseers (2012)',
 'Hansel & Gretel: Witch Hunters (2013)',
 'Jim Jefferies: Fully Functional (EPIX) (2012)']

In [20]:
get_title_to_ID(movie_df=movie,movieTitles=movietitles)

['1',
 '10',
 '100',
 '100044',
 '100068',
 '100083',
 '100106',
 '100159',
 '100163',
 '100194']

In [21]:
movie[movie['movieId'].isin( sorted(top10.index.to_list(),reverse=True))]['title']

0                                    Toy Story (1995)
9                                    GoldenEye (1995)
88                                   City Hall (1996)
8086                              Human Planet (2011)
8087                             Comme un chef (2012)
8088                                  Movie 43 (2013)
8089          Pervert's Guide to Ideology, The (2012)
8090                                Sightseers (2012)
8091            Hansel & Gretel: Witch Hunters (2013)
8092    Jim Jefferies: Fully Functional (EPIX) (2012)
Name: title, dtype: object

In [22]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

In [23]:
new_user_query = {"10": 5,
                 "2":2,
                 "3":3.5}

In [24]:
def get_cosine_similertiy(new_user_query:dict,original_DF:pd.core.frame.DataFrame):
    new_user_dataframe =  pd.DataFrame(data=new_user_query,
                columns=original_DF.columns,
                index = ['new_user'])
    dataframe_with_new_user = pd.concat([original_DF,new_user_dataframe],axis=0)
    dataframe_with_new_user =dataframe_with_new_user.T
    return dataframe_with_new_user, pd.DataFrame(cosine_similarity(dataframe_with_new_user.fillna(dataframe_with_new_user.mean()).T),
     columns = dataframe_with_new_user.columns,
     index = dataframe_with_new_user.columns)

     
def get_unseen_movies(data:pd.core.frame.DataFrame,user:str='new_user'):
    unseen = data[data['new_user'].isna()].index
    return unseen.tolist()

def get_similar_user(similiarity_matrix:pd.core.frame.DataFrame ,n:int):
    top_five_similar = similiarity_matrix['new_user'].sort_values(ascending= False).index[1:(n+1)]
    return top_five_similar.tolist()

def get_recommended_movies(unseen_movies:list,closes_users:list,data:pd.core.frame.DataFrame,similiarity_matrix:pd.core.frame.DataFrame,n_movies:int=5, user:str='new_user' ):
    movie_scores = pd.DataFrame()
    for movie in unseen_movies:
        others_user = data.columns[~data.loc[movie].isna()]
        others_user = set(others_user)
        if len(set(closes_users).intersection(others_user))>0:
            num = 0
            den = 0
            for user in set(closes_users).intersection(others_user): 
                ratings = data[user][movie] 
                sim = similiarity_matrix['new_user'][user]
                num = num + (ratings*sim)
                den = den + sim + 0.000001
        
            pred_ratings = num/den
            movie_scores = pd.concat([movie_scores,pd.DataFrame({'movie':[movie],'pred_ratings':[pred_ratings]})],axis=0)
    return movie_scores.sort_values(by='pred_ratings', ascending=False).head(n_movies)


In [25]:
new_df, cossin = get_cosine_similertiy(new_user_query=new_user_query, original_DF=combined_df)

In [26]:
unseen = get_unseen_movies(data= new_df)

In [27]:
top = get_similar_user(similiarity_matrix = cossin, n=10)

In [28]:
test = get_recommended_movies(unseen_movies=unseen,closes_users=top,data=new_df,similiarity_matrix=cossin)

In [29]:
def recommend_random(movie_list:list,n:int=3):
    return movie_list.sample(n).to_list()

In [30]:
test = recommend_random(test['movie'])

In [31]:
get_ID_to_title(movie_df=movie,movieIds=test)

['Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)',
 'Finding Forrester (2000)',
 'Roman Holiday (1953)']

In [32]:
def recommend_with_cossin(query:dict,original_DF:pd.core.frame.DataFrame,movie_df:pd.core.frame.DataFrame,n:int=3):
    df_with_new_user, cosine_similarity_matrtix = get_cosine_similertiy(new_user_query=query, original_DF=original_DF)
    unseen_movies = get_unseen_movies(data= df_with_new_user)
    close_users = get_similar_user(similiarity_matrix = cosine_similarity_matrtix, n=10)
    movies_recs = get_recommended_movies(unseen_movies=unseen_movies,closes_users=close_users,data=df_with_new_user,similiarity_matrix=cosine_similarity_matrtix)
    recs = get_ID_to_title(movie_df=movie_df,movieIds=movies_recs['movie'])
    return recs

In [33]:
combined_df

movieId,1,10,100,100044,100068,100083,100106,100159,100163,100194,...,99750,99764,998,99813,99846,99853,999,99910,99917,99992
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,4.0,,,,,,,,,...,,,,,,,,,,
609,3.0,4.0,,,,,,,,,...,,,,,,,,,,


In [34]:
recommend_with_cossin(query=new_user_query,original_DF=combined_df,movie_df=movie)

['Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)',
 'Airheads (1994)',
 'Kalifornia (1993)',
 'Finding Forrester (2000)',
 'Roman Holiday (1953)']

In [35]:
df_with_new_user, cosine_similarity_matrtix = get_cosine_similertiy(new_user_query=new_user_query, original_DF=combined_df)

In [36]:
movie = load_movies(datapath_movies)
new_user_query = {"Jumanji (1995)": 5}

In [37]:
print(movie.columns)

def get_title_to_ID(movie_df:pd.core.frame.DataFrame, movieTitles:list): ### needs to be fuzzy
    '''
    returns a the IDs from titels if they exist
    needs a DF with the a ID and a title
    '''
    Id = [movie_df[movie_df['title']==title]['movieId'].values[0] for title in movieTitles]
    return Id


new_dic={get_title_to_ID(movie_df=movie,movieTitles=key):value for key, value in new_user_query.items()     }

Index(['movieId', 'title', 'genres'], dtype='object')


IndexError: index 0 is out of bounds for axis 0 with size 0

In [38]:
movie_df = movie.copy()
movieTitles = new_user_query.keys()
print (movieTitles)
movie_df.head()

dict_keys(['Jumanji (1995)'])


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [39]:

def movie_title_2_id (title:str,movie_df):
    movid_Title_2_id=dict(zip(movie_df['title'],movie_df['movieId']))
    return int(movid_Title_2_id[title])

    

In [40]:
movie_title_2_id(title="Toy Story (1995)",movie_df=movie_df)

1

In [41]:
new_user_query = {"Jumanji (1995)": 5}

In [42]:

temp= { movie_title_2_id(title=title,movie_df=movie_df):rating for title, rating in new_user_query.items() }

In [43]:
temp

{2: 5}