In [2]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split as scikit_train_test_split
from surprise import Reader, Dataset, accuracy
from surprise import KNNBasic, KNNWithMeans, SVD, CoClustering, SlopeOne, SVDpp, NMF, BaselineOnly
from surprise.model_selection import KFold, RepeatedKFold, cross_validate, GridSearchCV, LeaveOneOut
from surprise.model_selection import train_test_split as surprise_train_test_split
import time

In [249]:
# Load the ratings file. Column names are supplied via `names`, and fields are
# split on runs of whitespace (`sep=r"\s+"` replaces the deprecated
# `delim_whitespace=True`).
d_cols = ["uid","iid","rating","timestamp"]
df = (
    pd.read_csv("data/ml-100k/u.data", sep=r"\s+", names=d_cols)
    # The timestamp column is dropped right away, leaving (uid, iid, rating).
    .drop(columns="timestamp")
)
df

Unnamed: 0,uid,iid,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [250]:
# Column names for the item file: five descriptive fields followed by the
# genre-named columns, in the order they appear in the file.
i_cols = [
    "movie_id",
    "movie_title",
    "release_date",
    "video_release_date",
    "IMDb_URL",
    "unknown",
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

# The item file is pipe-separated; column names come from `i_cols`.
items = pd.read_csv("data/ml-100k/u.item.csv", names=i_cols, sep="|")

In [251]:
# Lookup table movie_id -> movie_title.
item_dict = {
    movie_id: title
    for movie_id, title in zip(items["movie_id"], items["movie_title"])
}
# Reverse lookup movie_title -> movie_id. If a title occurs under several ids,
# the id that comes last in `item_dict` wins.
item_dict_inv = dict((title, movie_id) for movie_id, title in item_dict.items())

Introduction to the Data

The dataset consists of 943 individual users who, in total, gave 100,000 ratings (between 1 and 5) on 1,682 movies. Each user rated at least 20 movies.

In [240]:
def get_top_rated_movies_of_user(uid,n):
    """Return the `n` highest-rated movies of one user.

    Reads the module-level ratings frame `df` and the id -> title mapping
    `item_dict`.

    Args:
        uid: Raw user id to look up in `df["uid"]`.
        n (int): Maximum number of movies to return.

    Returns:
        pandas.DataFrame indexed by movie title (index name "movies") with a
        single "rating" column, ordered by rating, highest first.

    Raises:
        KeyError: If a rated item id is missing from `item_dict`.
    """
    top_rated = df.loc[df["uid"] == uid, ["iid", "rating"]].nlargest(n=n, columns="rating")
    # Translate raw item ids into human-readable titles.
    titles = [item_dict[iid] for iid in top_rated["iid"]]
    return pd.DataFrame(
        {"rating": top_rated["rating"].to_list()},
        index=pd.Index(titles, name="movies"),
    )

In [241]:
# Show the ten best-rated movies of user 8.
get_top_rated_movies_of_user(uid=8, n=10)

Unnamed: 0_level_0,rating
movies,Unnamed: 1_level_1
Braveheart (1995),5
Star Wars (1977),5
GoodFellas (1990),5
"Empire Strikes Back, The (1980)",5
Lawrence of Arabia (1962),5
Die Hard (1988),5
Pulp Fiction (1994),5
Alien (1979),5
Contact (1997),5
"Godfather, The (1972)",5


In [252]:
# Ascending lists of the distinct user ids and item ids present in `df`.
uids = list(np.sort(df["uid"].unique()))
iids = list(np.sort(df["iid"].unique()))

In [253]:
# Hold-out configuration.
perc_users = 40   # percentage of users drawn into the sample
n_skills = 2      # number of ratings withheld per sampled user
min_rating = 5    # only ratings >= this value are eligible for withholding

# Seed the global RNG so the drawn user sample is reproducible.
random.seed(1)
sample_size = int(np.ceil(len(uids) / 100 * perc_users))
sample = random.sample(population=uids, k=sample_size)

In [248]:
# Build the hold-out set: for every sampled user, withhold `n_skills` randomly
# chosen ratings that are >= `min_rating`, and remove those rows from `df` so
# that the model never sees them during training.
holdout_parts = []
user_sample = []
for uid in sample:
    eligible = df[(df["uid"] == uid) & (df["rating"] >= min_rating)]
    # Users with fewer than `n_skills` eligible ratings cannot contribute a
    # full hold-out and are skipped (sampling would raise ValueError).
    if len(eligible) < n_skills:
        continue
    holdout_parts.append(eligible.sample(n=n_skills, random_state=1))
    user_sample.append(uid)

if holdout_parts:
    # Concatenate once instead of growing a frame inside the loop.
    withheld = pd.concat(holdout_parts)
    # Remove the withheld rows from the training data (mutates `df` in place;
    # re-running this cell withholds further ratings).
    df.drop(withheld.index, inplace=True)
    holdout = withheld.reset_index(drop=True)
else:
    holdout = pd.DataFrame()
holdout

Unnamed: 0,uid,iid,rating
0,138,238,5
1,138,194,5
2,583,524,5
3,583,357,5
4,868,12,5
...,...,...,...
635,749,484,5
636,758,13,5
637,758,209,5
638,576,7,5


In [254]:
# Wrap the remaining ratings in a Surprise dataset; ratings range from 1 to 5.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df=df, reader=reader)

In [255]:
# Number of repeated grid-search trials and CV splits per trial.
NUM_TRIALS, NUM_SPLITS = 3, 5

In [226]:
def gridsearch_cv(algo_class,algo_name,param_grid,data,best_model_dict,best_params_dict,min_n_ratings=0):
    """Run `NUM_TRIALS` leave-one-out grid searches for one algorithm.

    Each trial uses a differently seeded `LeaveOneOut` splitter with
    `NUM_SPLITS` splits. The best model and parameter set (lowest RMSE over
    all trials) are stored in the supplied dictionaries under `algo_name`.

    Args:
        algo_class: Surprise algorithm class to tune.
        algo_name (str): Key used in the result dictionaries and log output.
        param_grid (dict): Parameter grid passed to `GridSearchCV`.
        data: Surprise dataset to fit on.
        best_model_dict (dict): Updated in place with the best estimator.
        best_params_dict (dict): Updated in place with the best parameters.
        min_n_ratings (int): Passed to `LeaveOneOut`. Previously the trial
            index was used here, which changed the user filter from trial to
            trial; it is now constant across trials (default 0).

    Returns:
        Tuple `(rmse, mse, mae)` of numpy arrays of length `NUM_TRIALS`
        holding the best score of each trial.
    """
    start = time.time()
    rmse = np.zeros(NUM_TRIALS)
    mse = np.zeros(NUM_TRIALS)
    mae = np.zeros(NUM_TRIALS)
    models = []
    params = []

    for i in range(NUM_TRIALS):
        print("Running CV for model", algo_name ,"in Iteration:", i ,"at", time.time()-start)
        # Only the seed varies between trials; the splitter settings stay fixed
        # so that the trial scores are comparable.
        cv = LeaveOneOut(
            n_splits=NUM_SPLITS,
            random_state=i,
            min_n_ratings=min_n_ratings,
        )
        gs = GridSearchCV(
            algo_class,
            param_grid,
            measures=["rmse","mse","mae"],
            cv=cv,
            refit="rmse",
        )
        gs.fit(data)
        rmse[i] = gs.best_score["rmse"]
        mse[i] = gs.best_score["mse"]
        mae[i] = gs.best_score["mae"]
        models.append(gs.best_estimator["rmse"])
        params.append(gs.best_params["rmse"])

    # TODO: explain why RMSE is chosen as the deciding measure.
    best_trial = int(np.argmin(rmse))
    best_model_dict[algo_name] = models[best_trial]
    best_params_dict[algo_name] = params[best_trial]

    print("Total time: ", (time.time()-start), "sec.")
    return rmse, mse, mae

In [44]:
def add_result(results, name, rmse, mse, mae):
    """Append one summary row (mean and std per metric) to `results`.

    Args:
        results (pandas.DataFrame): Table of previously collected summaries.
        name: Label stored in the "name" column of the new row.
        rmse, mse, mae: Score arrays providing `.mean()` and `.std()`.

    Returns:
        A new DataFrame: `results` followed by the additional row, with a
        fresh 0..n-1 index.
    """
    summary = {"name": name}
    for metric, scores in (("rmse", rmse), ("mse", mse), ("mae", mae)):
        summary[f"{metric}_mean"] = scores.mean()
        summary[f"{metric}_std"] = scores.std()
    new_row = pd.DataFrame([summary])
    return pd.concat([results, new_row], ignore_index=True)

In [45]:
# Containers filled by the grid-search loop: one summary row per algorithm,
# plus the best estimator and best parameters keyed by algorithm name.
best_model_dict, best_params_dict = {}, {}
results = pd.DataFrame()

In [46]:
# Candidate algorithms as (class, display name, parameter grid) triples.
algo_list = [
    (BaselineOnly, "Baseline", dict(verbose=[False])),
    (
        KNNBasic,
        "k-NN",
        dict(
            k=[20, 40, 60, 80],
            min_k=[1, 5, 10, 20],
            verbose=[False],
        ),
    ),
    (
        KNNWithMeans,
        "Centered k-NN",
        dict(
            k=[20, 40, 60, 80],
            min_k=[1, 5, 10, 20],
            verbose=[False],
        ),
    ),
    (
        SVD,
        "SVD",
        dict(
            n_factors=[20, 40, 60, 80, 100],
            n_epochs=[10, 20, 40, 60],
            biased=[True, False],
            random_state=[1],
        ),
    ),
    # Currently disabled candidates:
    # (SVDpp, "SVD++", dict(
    #     n_factors=[20, 40, 60, 80, 100],
    #     n_epochs=[10, 20, 40, 60],
    #     random_state=[1],
    # )),
    # (CoClustering, "CoClustering", dict(
    #     n_cltr_u=[2, 3, 4],
    #     n_cltr_i=[2, 3, 4],
    #     n_epochs=[10, 20, 40, 60],
    # )),
    (SlopeOne, "SlopeOne", dict()),  # empty parameter grid
    (
        NMF,
        "NMF",
        dict(
            n_factors=[10, 15, 20, 40],
            n_epochs=[20, 40, 80, 120],
            biased=[True, False],
            random_state=[1],
        ),
    ),
]

In [48]:
# for algo in algo_list:
#     rmse, mse, mae = gridsearch_cv(algo[0],algo[1],algo[2],data,best_model_dict,best_params_dict)
#     results = add_result(results,algo[1],rmse, mse, mae)

In [None]:
# results

In [256]:
# Train the final model on every rating left in `data`.
# Alternatives that were tried before:
#model = NMF(n_factors= 40, n_epochs = 120, biased = False, random_state= 1)
#model = best_model_dict["SVD++"]
trainset = data.build_full_trainset()
model = SVDpp(n_factors=40, n_epochs=60, random_state=1)
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x2776fccf0d0>

In [257]:
# Predict a rating for every (user, item) pair that is absent from the trainset.
anti_testset = trainset.build_anti_testset()
predictions = model.test(anti_testset)

In [229]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions (iterable of 5-tuples): Items unpackable as
            `(uid, iid, true_r, est, details)`, e.g. the list returned by the
            `test` method of an algorithm.
        n (int): The number of recommendations to output for each user.
            Default is 10.

    Returns:
        A plain dict where keys are user (raw) ids and values are lists of
        tuples `[(raw item id, rating estimation), ...]` of at most `n`
        entries, sorted by estimated rating in descending order. Items with
        equal estimates keep their order of appearance in `predictions`.
    """
    # First group the (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        per_user[uid].append((iid, est))

    # Then rank each user's items by estimate and keep the n best. A plain
    # dict is returned so that looking up an unknown user still raises
    # KeyError instead of silently creating an empty entry.
    return {
        uid: sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, user_ratings in per_user.items()
    }

In [230]:
def get_recoms_for_user(uid,n):
    """Return the titles of the `n` items with the highest predicted rating for a user.

    Every item id in the module-level `iids` list is scored with the
    module-level `model`; items the user has already rated are scored as well
    and can therefore appear in the result.

    Args:
        uid: Raw user id.
        n (int): Number of titles to return.

    Returns:
        list of movie titles (looked up in `item_dict`), ordered by predicted
        rating, highest first. Empty if no predictions were produced.
    """
    user_rows = df[df["uid"] == uid]
    # One lookup table instead of filtering the frame once per item.
    known_ratings = dict(zip(user_rows["iid"], user_rows["rating"]))
    # The true rating is NaN for items the user has not rated; the ranking
    # below only uses the estimate.
    testset = [(uid, iid, known_ratings.get(iid, np.nan)) for iid in iids]
    preds = model.test(testset)
    recoms = get_top_n(preds, n)
    return [item_dict[iid] for iid, _ in recoms.get(uid, [])]

In [231]:
# Titles of the 60 items with the highest predicted rating for user 8.
get_recoms_for_user(uid=8, n=60)

['Star Wars (1977)',
 'Fargo (1996)',
 'Godfather, The (1972)',
 'Wizard of Oz, The (1939)',
 'Mr. Smith Goes to Washington (1939)',
 "Monty Python's Life of Brian (1979)",
 'Raiders of the Lost Ark (1981)',
 '12 Angry Men (1957)',
 'Psycho (1960)',
 'Graduate, The (1967)',
 'Patton (1970)',
 'Sling Blade (1996)',
 'Jaws (1975)',
 'Leaving Las Vegas (1995)',
 'Secrets & Lies (1996)',
 'Titanic (1997)',
 "One Flew Over the Cuckoo's Nest (1975)",
 'Miracle on 34th Street (1994)',
 'Day the Earth Stood Still, The (1951)',
 'Thin Man, The (1934)',
 "It's a Wonderful Life (1946)",
 'Boot, Das (1981)',
 'Christmas Carol, A (1938)',
 'Persuasion (1995)',
 'Some Folks Call It a Sling Blade (1993)',
 'Some Like It Hot (1959)',
 'Pather Panchali (1955)',
 'When We Were Kings (1996)',
 'Jungle Book, The (1994)',
 'Aliens (1986)',
 'Winnie the Pooh and the Blustery Day (1968)',
 'Before Sunrise (1995)',
 'Snow White and the Seven Dwarfs (1937)',
 'Dr. Strangelove or: How I Learned to Stop Worrying

In [245]:
top_n = get_top_n(predictions, n=10)

# One row per user, one column per rank, holding the raw ids of the
# recommended items. Built in a single call instead of concatenating one-row
# frames in a loop.
recommendations = pd.DataFrame.from_dict(
    {uid: [iid for (iid, _) in user_ratings] for uid, user_ratings in top_n.items()},
    orient="index",
)

recommendations

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
196,298,157,56,435,512,190,753,582,880,100
186,465,86,515,498,132,318,133,378,97,510
22,98,135,214,357,50,150,13,661,1010,429
244,305,150,306,81,269,1065,213,880,137,236
166,195,82,121,73,434,378,87,480,185,177
...,...,...,...,...,...,...,...,...,...,...
939,302,392,1,95,194,603,201,332,100,181
936,474,98,603,1137,515,427,480,23,514,483
930,707,676,133,291,744,318,939,251,87,86
920,276,234,493,56,100,174,98,480,508,1449


In [258]:
top_n = get_top_n(predictions, n=10)

# One row per user, one column per rank, holding the titles of the recommended
# items. Built in a single call instead of concatenating one-row frames in a
# loop.
# NOTE(review): this rebinds `recommendations` to a table of titles; code that
# compares against raw item ids (e.g. measure_holdout_occurence) will not find
# any match in it.
recommendations = pd.DataFrame.from_dict(
    {uid: [item_dict[iid] for (iid, _) in user_ratings] for uid, user_ratings in top_n.items()},
    orient="index",
)

recommendations

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
196,Wings of Desire (1987),Three Colors: Red (1994),Short Cuts (1993),"Hudsucker Proxy, The (1994)",Welcome To Sarajevo (1997),8 1/2 (1963),Dances with Wolves (1990),"Wizard of Oz, The (1939)",Pink Floyd - The Wall (1982),Desperado (1995)
186,Homeward Bound: The Incredible Journey (1993),"English Patient, The (1996)",As Good As It Gets (1997),White Squall (1996),"Sum of Us, The (1994)","Secret Garden, The (1993)","Last of the Mohicans, The (1992)",Freeway (1996),Shine (1996),Wings of Desire (1987)
22,Restoration (1995),"Silence of the Lambs, The (1991)",Beautiful Thing (1996),Fargo (1996),Dead Poets Society (1989),Jean de Florette (1986),"Close Shave, A (1995)","Philadelphia Story, The (1940)",Delicatessen (1991),"Shawshank Redemption, The (1994)"
244,Evil Dead II (1987),Wings of Desire (1987),Good Will Hunting (1997),Big Night (1996),"Close Shave, A (1995)",Apt Pupil (1998),"Ice Storm, The (1997)","Secret of Roan Inish, The (1994)",Secrets & Lies (1996),Koyaanisqatsi (1983)
166,Return of the Jedi (1983),Jean de Florette (1986),Assassins (1995),Twelve Monkeys (1995),Jurassic Park (1993),Indiana Jones and the Last Crusade (1989),Demolition Man (1993),Manon of the Spring (Manon des sources) (1986),How to Make an American Quilt (1995),Stargate (1994)
...,...,...,...,...,...,...,...,...,...,...
939,Kolya (1996),Legends of the Fall (1994),Sabrina (1954),Toy Story (1995),Aladdin (1992),"Silence of the Lambs, The (1991)",Sleepless in Seattle (1993),Rear Window (1954),Beautiful Thing (1996),"Last of the Mohicans, The (1992)"
936,"Silence of the Lambs, The (1991)",Rear Window (1954),Beautiful Thing (1996),"Sound of Music, The (1965)","Boot, Das (1981)","African Queen, The (1951)",It's a Wonderful Life (1946),"Wizard of Oz, The (1939)","Quiet Man, The (1952)","Shining, The (1980)"
930,"Remains of the Day, The (1993)","Thin Blue Line, The (1988)","Postman, The (1997)",How to Make an American Quilt (1995),"Age of Innocence, The (1993)",Raise the Red Lantern (1991),Mr. Holland's Opus (1995),Sleepers (1996),Titanic (1997),To Kill a Mockingbird (1962)
920,Leaving Las Vegas (1995),"Birds, The (1963)",Psycho (1960),"Maltese Falcon, The (1941)","39 Steps, The (1935)",Raiders of the Lost Ark (1981),Schindler's List (1993),Sunset Blvd. (1950),Ed Wood (1994),Sling Blade (1996)


In [216]:
# Show the ten best-rated movies of user 941.
get_top_rated_movies_of_user(uid=941, n=10)

Unnamed: 0_level_0,rating
movies,Unnamed: 1_level_1
Lone Star (1996),5
"Rock, The (1996)",5
Return of the Jedi (1983),5
Face/Off (1997),5
"Close Shave, A (1995)",5
"City of Lost Children, The (1995)",5
Toy Story (1995),5
"Long Kiss Goodnight, The (1996)",4
Hercules (1997),4
Contact (1997),4


In [235]:
def measure_holdout_occurence(holdout,recommendations):
    """Return the share of held-out items that appear in the user's recommendations.

    Args:
        holdout (pandas.DataFrame): Needs "uid" and "iid" columns; one row per
            withheld rating.
        recommendations (pandas.DataFrame): Indexed by user id; each row holds
            the recommended item ids of that user. The values must be of the
            same kind as `holdout["iid"]` (raw ids, not titles), otherwise
            nothing can match.

    Returns:
        float in [0, 1]: number of held-out rows whose item is recommended to
        the same user, divided by the number of held-out rows. Returns 0.0
        for an empty hold-out. Users without a row in `recommendations`
        count as misses.
    """
    if len(holdout) == 0:
        return 0.0
    hits = 0
    for uid, withheld in holdout.groupby("uid")["iid"]:
        if uid not in recommendations.index:
            continue
        # Set membership avoids rescanning the row for every withheld item.
        recommended = set(recommendations.loc[uid].to_list())
        hits += sum(1 for iid in withheld if iid in recommended)
    return hits / len(holdout)

In [259]:
# measure_holdout_occurence compares raw item ids, but `recommendations` was
# last rebuilt with movie titles, so passing it scores 0.0 regardless of the
# model. Rebuild the id-valued table from `top_n` and score against that.
recommended_iids = pd.DataFrame.from_dict(
    {uid: [iid for (iid, _) in user_ratings] for uid, user_ratings in top_n.items()},
    orient="index",
)
measure_holdout_occurence(holdout, recommended_iids)

0.0

In [None]:
def get_recoms_for_new_user(new_uid,item_dict,model):
    """Return the ten best-scored items for a user id that is not yet known.

    Args:
        new_uid: Id for the new user; must not occur in the module-level `uids`.
        item_dict (dict): Ratings of the new user as `{item id: rating}`.
            Note that this parameter shadows the module-level `item_dict`
            (id -> title) inside the function.
        model: Fitted model providing a `test(testset)` method.

    Returns:
        list of `(item id, estimated rating)` tuples (at most ten), or an
        error message string if `new_uid` is already taken or an item id in
        `item_dict` is not part of the module-level `iids`.
    """
    if new_uid in uids:
        return "Error: ID already taken. Please choose a different ID an try again."
    for item in item_dict.keys():
        if item not in iids:
            # str() is required: item ids may be integers, and concatenating
            # them directly to a string raises TypeError.
            return "Item " + str(item) + " not in database."
            # COMMENT: this is only a placeholder. For ideas on how to handle new skills, see "HOW TO HANDLE NEW SKILL" section

    # One test tuple per known item; the rating is NaN where the new user has
    # not rated the item.
    new_test = [(new_uid, item, item_dict.get(item, np.nan)) for item in iids]

    predictions = model.test(new_test)

    return get_top_n(predictions, n=10).get(new_uid, [])

In [None]:
# Id and (still empty) ratings of a new user.
# NOTE(review): this rebinds the module-level `item_dict` (movie id -> title)
# to an empty dict; title lookups in earlier helpers will raise KeyError once
# this cell has run. Consider a separate name.
new_uid = 8774
item_dict = dict()