In [2]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split as scikit_train_test_split
from surprise import Reader, Dataset, accuracy
from surprise import KNNBasic, KNNWithMeans, SVD, CoClustering, SlopeOne, SVDpp, NMF, BaselineOnly
from surprise.model_selection import KFold, RepeatedKFold, cross_validate, GridSearchCV, LeaveOneOut
from surprise.model_selection import train_test_split as surprise_train_test_split
import time

In [41]:
# Column names for the item file under data/ml-100k: five descriptive fields
# followed by the genre-named columns, in file order.
meta_cols = ["movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL"]
genre_cols = [
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
i_cols = meta_cols + genre_cols

# The file is pipe-separated, has no header row and is latin-1 encoded.
items = pd.read_csv(
    "data/ml-100k/u.item.csv",
    sep="|",
    names=i_cols,
    encoding="latin-1",
)

# Lookup table: movie id -> movie title.
movie_dict = {
    movie_id: movie_title
    for movie_id, movie_title in zip(items["movie_id"], items["movie_title"])
}

In [13]:
# Load the built-in ml-100k dataset and flatten every rating of the full
# trainset into a DataFrame with one row per (uid, iid, rating) triple.
# NOTE(review): per the Surprise docs, Trainset.all_ratings() yields *inner*
# ids, not the raw ids of the dataset - confirm this is intended downstream.
data = Dataset.load_builtin("ml-100k")
all_ratings = data.build_full_trainset().all_ratings()

# Build the frame in one constructor call; appending row by row with
# `df.loc[i] = ...` re-allocates the frame for every rating.
df = pd.DataFrame(list(all_ratings), columns=["uid", "iid", "rating"]).astype(int)

In [18]:
# Distinct user ids in order of first appearance, and distinct item ids in
# ascending order.
uids = pd.unique(df["uid"])
iids = sorted(pd.unique(df["iid"]))

In [24]:
# Holdout configuration.
perc_users = 40   # percentage of users drawn into the sample
n_skills = 2      # number of ratings held out per sampled user
min_rating = 4    # only ratings >= this value are eligible for the holdout

# Number of users to draw, rounded up to a whole user.
users_per_percent = len(uids) / 100
sample_size = int(np.ceil(users_per_percent * perc_users))

# Seed the stdlib RNG so the drawn user sample is reproducible.
random.seed(1)
sample = random.sample(uids.tolist(), sample_size)

In [25]:
# Build the holdout set: for every sampled user, move `n_skills` randomly
# chosen ratings with rating >= `min_rating` out of `df` and into `holdout`.
# Users with fewer than `n_skills` such ratings are skipped.
# NOTE: this cell mutates `df` in place, so running it a second time without
# rebuilding `df` removes further rows.
holdout_parts = []
user_sample = []
for uid in sample:
    df_subset = df[(df["uid"] == uid) & (df["rating"] >= min_rating)]
    # Explicit size check instead of catching the ValueError raised by
    # DataFrame.sample when the population is smaller than n.
    if len(df_subset) < n_skills:
        continue
    holdout_uid = df_subset.sample(n=n_skills, random_state=1)
    user_sample.append(uid)
    holdout_parts.append(holdout_uid)

if holdout_parts:
    # Concatenate once and drop once; each user's rows are disjoint, so this
    # selects the same rows as dropping inside the loop.
    held_out_rows = pd.concat(holdout_parts)
    df.drop(held_out_rows.index, inplace=True)
    holdout = held_out_rows.reset_index(drop=True)
else:
    holdout = pd.DataFrame()
holdout

Unnamed: 0,uid,iid,rating
0,137,378,4
1,137,405,5
2,582,242,4
3,582,586,4
4,867,1079,4
...,...,...,...
749,757,309,5
750,777,368,4
751,777,113,5
752,575,161,4


In [None]:
# Cross-validation budget: how many times the grid search is repeated, and
# how many splits each repetition's cross-validation iterator produces.
NUM_TRIALS, NUM_SPLITS = 3, 5

In [None]:
def gridsearch_cv(algo_class,algo_name,param_grid,data,best_model_dict,best_params_dict,min_n_ratings=0):
    """Run a repeated leave-one-out grid search for one algorithm.

    The grid search is executed NUM_TRIALS times; each trial uses a
    LeaveOneOut iterator with NUM_SPLITS splits and the trial number as its
    random seed. The estimator and parameters of the trial with the lowest
    best RMSE are stored under `algo_name` in the two result dictionaries.

    Args:
        algo_class: Surprise algorithm class passed to GridSearchCV.
        algo_name: Display name; used for logging and as the dictionary key.
        param_grid: Parameter grid passed to GridSearchCV.
        data: Dataset the grid search is fitted on.
        best_model_dict: Dict updated in place with the winning estimator.
        best_params_dict: Dict updated in place with the winning parameters.
        min_n_ratings: Forwarded to LeaveOneOut. The same value is used for
            every trial so all trials are evaluated on the same user filter
            (previously the trial index was passed here, which changed the
            filter from trial to trial).

    Returns:
        Tuple (rmse, mse, mae) of numpy arrays holding each trial's best score.
    """
    start = time.time()
    rmse = np.zeros(NUM_TRIALS)
    mse = np.zeros(NUM_TRIALS)
    mae = np.zeros(NUM_TRIALS)
    models = []
    params = []

    for trial in range(NUM_TRIALS):
        print("Running CV for model", algo_name ,"in Iteration:", trial ,"at", time.time()-start)
        # Only the seed varies between trials; the split protocol stays fixed.
        cv = LeaveOneOut(
            n_splits=NUM_SPLITS,
            random_state=trial,
            min_n_ratings=min_n_ratings,
        )
        gs = GridSearchCV(
            algo_class,
            param_grid,
            measures=["rmse","mse","mae"],
            cv=cv,
            refit="rmse",
        )
        gs.fit(data)
        rmse[trial] = gs.best_score["rmse"]
        mse[trial] = gs.best_score["mse"]
        mae[trial] = gs.best_score["mae"]
        models.append(gs.best_estimator["rmse"])
        params.append(gs.best_params["rmse"])

    # TODO: explain why RMSE is chosen as the deciding measure.
    best_trial = int(np.argmin(rmse))
    best_model_dict[algo_name] = models[best_trial]
    best_params_dict[algo_name] = params[best_trial]

    print("Total time: ", (time.time()-start), "sec.")
    return rmse, mse, mae

In [None]:
def add_result(results, name, rmse, mse, mae):
    '''
    Append one summary row (mean and standard deviation per metric) to a
    results DataFrame.

    Parameters:
    results  DataFrame collected so far (may be empty)
    name     label stored in the "name" column
    rmse     array-like with .mean()/.std() holding the RMSE scores
    mse      array-like with .mean()/.std() holding the MSE scores
    mae      array-like with .mean()/.std() holding the MAE scores

    Output:
    new DataFrame with the additional row appended and a fresh 0..n-1 index;
    the input frame is not modified
    '''
    summary = {"name": name}
    for metric, scores in (("rmse", rmse), ("mse", mse), ("mae", mae)):
        summary[metric + "_mean"] = scores.mean()
        summary[metric + "_std"] = scores.std()

    # index=[0] is required because every value in the dict is a scalar.
    new_row = pd.DataFrame(summary, index=[0])
    return pd.concat([results, new_row], ignore_index=True)

In [None]:
# Containers filled by the grid-search loop: one summary row per algorithm,
# plus the winning estimator and winning parameters keyed by algorithm name.
best_model_dict, best_params_dict = {}, {}
results = pd.DataFrame()

In [None]:
# Algorithms to tune. Each entry is (algorithm class, display name, parameter
# grid); every grid value is a list of candidates for GridSearchCV.
algo_list = [
    (BaselineOnly, "Baseline", dict(verbose=[False])),
    (KNNBasic, "k-NN", dict(
        k=[20, 40, 60, 80],
        min_k=[1, 5, 10, 20],
        verbose=[False],
    )),
    (KNNWithMeans, "Centered k-NN", dict(
        k=[20, 40, 60, 80],
        min_k=[1, 5, 10, 20],
        verbose=[False],
    )),
    (SVD, "SVD", dict(
        n_factors=[20, 40, 60, 80, 100],
        n_epochs=[10, 20, 40, 60],
        biased=[True, False],
        random_state=[1],
    )),
    # Currently disabled:
    # (SVDpp, "SVD++", dict(
    #     n_factors=[20, 40, 60, 80, 100],
    #     n_epochs=[10, 20, 40, 60],
    #     random_state=[1],
    # )),
    # (CoClustering, "CoClustering", dict(
    #     n_cltr_u=[2, 3, 4],
    #     n_cltr_i=[2, 3, 4],
    #     n_epochs=[10, 20, 40, 60],
    # )),
    (SlopeOne, "SlopeOne", dict()),
    (NMF, "NMF", dict(
        n_factors=[10, 15, 20, 40],
        n_epochs=[20, 40, 80, 120],
        biased=[True, False],
        random_state=[1],
    )),
]

In [None]:
# Tune every configured algorithm and append its aggregated scores to `results`.
for algo in algo_list:
    algo_class, algo_name, param_grid = algo
    rmse, mse, mae = gridsearch_cv(
        algo_class,
        algo_name,
        param_grid,
        data,
        best_model_dict,
        best_params_dict,
    )
    results = add_result(results, algo_name, rmse, mse, mae)

In [None]:
# Display the aggregated metric table built up by the grid-search loop.
results

In [None]:
# Fit the final model on the full trainset with hard-coded NMF settings.
# Alternatives tried earlier (kept for reference):
#   model = best_model_dict["SVD++"]
#   model = SVDpp(n_factors = 40, n_epochs= 60, random_state=1)
trainset = data.build_full_trainset()
nmf_params = {"n_factors": 40, "n_epochs": 120, "biased": False, "random_state": 1}
model = NMF(**nmf_params)
model.fit(trainset)

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(iterable of 5-tuples): Each element unpacks as
            (uid, iid, true_r, est, details), e.g. the Prediction objects
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A plain dict where keys are user ids as found in the predictions and
    values are lists of tuples [(item id, rating estimation), ...] sorted by
    estimation in descending order, holding at most n entries. Items with
    equal estimations keep the order in which they were predicted.
    """
    # First group the (item, estimate) pairs by user.
    per_user = {}
    for uid, iid, true_r, est, _ in predictions:
        per_user.setdefault(uid, []).append((iid, est))

    # Then sort each user's pairs and keep the n highest estimates.
    return {
        uid: sorted(user_ratings, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, user_ratings in per_user.items()
    }

In [None]:
def get_recoms_for_employee(emp_id,n):
    """Return the top-n predicted skills for one employee.

    Builds one (person, skill, rating) row per entry of the module-level
    `skills` collection, fills in the employee's known rating from the
    module-level `df_rated` frame (NaN where none exists), runs the rows
    through the module-level `model` and ranks the estimates with get_top_n.

    Args:
        emp_id: Value matched against the "person" column of `df_rated`.
        n: Maximum number of recommendations to return.

    Returns:
        The dict produced by get_top_n for the generated predictions.
    """
    # Filter on the requested employee (previously hard-coded to person 233).
    df_subset = df_rated[df_rated["person"] == emp_id]

    # Scalar lookup skill -> rating. If a skill appears more than once for the
    # employee, the last rating wins.
    rating_by_skill = dict(zip(df_subset["skill"], df_subset["rating"]))

    skill_list = list(skills)
    recom_df = pd.DataFrame({
        "person": [emp_id] * len(skill_list),
        "skill": skill_list,
        "rating": [rating_by_skill.get(skill, np.nan) for skill in skill_list],
    })
    recom_df = recom_df[["person","skill","rating"]]

    preds = model.test(recom_df.to_numpy())
    return get_top_n(preds,n)

In [None]:
# Predict on the trainset's anti-testset.
anti_testset = trainset.build_anti_testset()
preds = model.test(anti_testset)

In [None]:
top_n = get_top_n(preds, n=10)

# One row per user (index = user id) and one column per rank position,
# holding the recommended item ids. The frame is built in a single
# constructor call instead of concatenating one tiny frame per user.
recommendations = pd.DataFrame(
    [[iid for (iid, _) in user_ratings] for user_ratings in top_n.values()],
    index=list(top_n.keys()),
)

recommendations

In [None]:
# Example: top-10 recommendations for employee 233.
get_recoms_for_employee(emp_id=233, n=10)

In [None]:
def measure_holdout_occurence(holdout,recommendations):
    """Return the share of held-out (person, skill) rows that were recommended.

    Args:
        holdout: DataFrame with a "person" and a "skill" column, one row per
            held-out rating.
        recommendations: DataFrame indexed by person; the cells of a row are
            the items recommended to that person.

    Returns:
        Hits divided by the number of holdout rows, as a float. A holdout
        skill is a hit when it appears in the recommendation row of the same
        person. Persons without a recommendation row contribute no hits.
        Returns 0.0 for an empty holdout.
    """
    if len(holdout) == 0:
        return 0.0

    score = 0
    for emp, held_out_skills in holdout.groupby("person")["skill"]:
        if emp not in recommendations.index:
            continue
        # Look up this person's own row (previously every person was compared
        # against the fixed row at position 12). Missing cells are ignored.
        row_values = recommendations.loc[[emp]].to_numpy().ravel().tolist()
        recommended = {value for value in row_values if pd.notna(value)}
        score += sum(skill in recommended for skill in held_out_skills)
    return score/len(holdout)

In [None]:
# Score the recommendations against the held-out ratings.
# NOTE(review): the `holdout` frame built earlier in this notebook has the
# columns uid/iid/rating, while measure_holdout_occurence reads "person" and
# "skill" - confirm the frame is renamed or rebuilt before this call.
measure_holdout_occurence(holdout,recommendations)

In [None]:
def get_recoms_for_new_employee(new_id,skilldict,model):
    """Return the top-10 recommendations for an employee not yet in the data.

    Args:
        new_id: Id for the new employee; must not occur in the "person"
            column of the module-level `df_rated` frame.
        skilldict: Mapping skill -> rating of the new employee's known skills.
            Every key must occur in the "skill" column of `df_rated`.
        model: Fitted algorithm exposing a `test` method.

    Returns:
        A list of (skill, estimated rating) tuples as produced by get_top_n
        (empty if no predictions were made), or an error message string when
        the id is already taken or a skill is unknown.
    """
    all_ids = df_rated["person"].unique()
    all_skills = df_rated["skill"].unique()

    if new_id in all_ids:
        return "Error: ID already taken. Please choose a different ID an try again."

    known_skills = set(all_skills)
    for skill in skilldict.keys():
        if skill not in known_skills:
            # str() keeps the message buildable for non-string skill keys.
            return "Skill " + str(skill) + " not in database."
            # COMMENT: this is only a placeholder. For ideas on how to handle new skills, see "HOW TO HANDLE NEW SKILL" section

    # One test row per known skill; skills the employee has not rated get NaN.
    new_test = [
        (new_id, skill, skilldict.get(skill, np.nan))
        for skill in all_skills
    ]

    predictions = model.test(new_test)

    # .get avoids a KeyError when no predictions were produced for new_id.
    return get_top_n(predictions, n=10).get(new_id, [])