In [331]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split as scikit_train_test_split
from surprise import Reader, Dataset, accuracy
from surprise import KNNBasic, KNNWithMeans, SVD, CoClustering, SlopeOne, SVDpp, NMF, BaselineOnly
from surprise.model_selection import KFold, RepeatedKFold, cross_validate, GridSearchCV, LeaveOneOut
from surprise.model_selection import train_test_split as surprise_train_test_split
from sklearn.pipeline import Pipeline
import time

In [332]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";")

Declare various aiding variables

In [333]:
persons = df["person"].unique()
skills = sorted(df["skill"].unique())
categories = df["category"].unique()

In [334]:
employee_dict = {}
for n in range(1,len(persons)+1):
    employee_dict[n] = persons[n-1]

In [335]:
skills_to_category = pd.read_csv("mitarbeiterportal-skills_2022-06-21.csv",sep=";",header=None)
category_dict = dict(zip(skills_to_category[0],skills_to_category[1]))

Insert ratings for skills according to frequency of underlying categories in the skill protfolio on an employee: 

In [336]:
def scaler_1_5(x,old_max,old_min):
    OldRange = (old_max - old_min)  
    if (OldRange == 0):
        return 1
    else:
        NewRange = (5 - 1)  
        return round((((x - 1) * NewRange) / OldRange) + 1)

In [337]:
# create dataframe for each employee and their category count
category_per_employee = pd.DataFrame(index=categories)
for p in persons:
    p_skills = df[df["person"] == p]
    # count categories for each employee
    category_count = p_skills["category"].value_counts()
    category_count = category_count.apply(lambda x:scaler_1_5(x,category_count.max(),category_count.min()))
    skill_values = np.array([])
    # add count for each category to an array and 0 if category wasn't found (and therefore is not part of resptive employee's skillset)
    for cat in categories:
        try:
            skill_values = np.append(skill_values,category_count[cat])
        except KeyError:
            skill_values = np.append(skill_values,0)
    # add employee data to dataframe
    p_skills_df = pd.DataFrame(index=categories,columns=[p],data=skill_values)
    category_per_employee = pd.concat([category_per_employee,p_skills_df],axis=1)

In [338]:
df_rated = pd.DataFrame()
for p in persons:
    # get employee
    df_subset = df[df["person"] == p].copy()
    # get skills of employee
    p_skills = df_subset["skill"].to_numpy()
    # initialize empty array for ratings
    ratings = np.zeros(len(p_skills))
    # fill array for each skill with respective category value 
    for i in range(0,len(p_skills)): 
        ratings[i] = category_per_employee.loc[category_dict[p_skills[i]]][p]
    # append ratings to subset and drop category column
    df_subset["rating"] = ratings
    df_subset.drop("category",axis=1,inplace=True)
    # append user to df
    df_rated = pd.concat([df_rated,df_subset],ignore_index=True)

In [339]:
df_rated

Unnamed: 0,person,skill,rating
0,12,Windows NT/2000/XP,4.0
1,12,MAC OS X,4.0
2,12,Windows 7,4.0
3,12,Windows 8,4.0
4,12,iOS,4.0
...,...,...,...
24581,1488,Deutsch,1.0
24582,1488,Spring-JPA,5.0
24583,1488,Maven,3.0
24584,1488,Subversion,3.0


## Comparing Different Recommender Systems

In [340]:
cv_data,holdout_data = scikit_train_test_split(df_rated, test_size=0.05)
reader = Reader(rating_scale=(1,5)) 
data = Dataset.load_from_df(cv_data,reader)
holdout = Dataset.load_from_df(holdout_data,reader)

In [341]:
NUM_TRIALS = 5
NUM_OUTER_SPLITS = 5

COMMENT: Hyperparameter optimization using GridSearchCV can not be incorporated in the cross validation. Doing this yields an error. Multiple work arounds have been unsuccefully tried. As it seems, surprise's GridSearchCV is not compatible with splitting functions such as KFold/LeaveOneOut etc. as well as cross_validate.

In [342]:
def nested_cv(algo,data):
    start = time.time()
    rmse = np.zeros((NUM_TRIALS, NUM_TRIALS))
    mae = np.zeros((NUM_TRIALS, NUM_TRIALS))
    mse = np.zeros((NUM_TRIALS, NUM_TRIALS))
    fit_times = np.zeros((NUM_TRIALS, NUM_TRIALS))
    test_times = np.zeros((NUM_TRIALS, NUM_TRIALS))

    # hier eventuell gridsearch rein? 
    for i in range(NUM_TRIALS):
        cv= LeaveOneOut(n_splits=NUM_OUTER_SPLITS,
                                random_state=36,
                                min_n_ratings=1)
                                # COMMENT: min_n_ratings -> what's the impact of this parameter? 
        cv_results = cross_validate(algo=algo,
                                    data=data,
                                    measures=["rmse","mae","mse"],
                                    cv=cv,
                                    n_jobs=8)
        rmse[i] = cv_results["test_rmse"]
        mae[i] = cv_results["test_mae"]
        mse[i] = cv_results["test_mse"]
        fit_times[i] = cv_results["fit_time"]
        test_times[i] = cv_results["test_time"]

    return mse, rmse, mae, fit_times, test_times

In [343]:
def add_result(results, name, mse, rmse, mae, fit_times, test_times):
    '''
    Function adding the results returned by nested_cv to a dataframe. 
    Results will be aggregated for better comparison. 
    Parameters: 

    Output:
    results DataFrame containing an additional row
    '''
    row = pd.DataFrame({
        "name":name,
        "mse_mean":mse.mean(), 
        "mse_std":mse.std(),
        "rmse_mean":rmse.mean(), 
        "rmse_std":rmse.std(),  
        "mae_mean":mae.mean(), 
        "mae_std":mae.std(), 
        "fit_time":fit_times.mean(),
        "test_time":test_times.mean()
        },index=[0])
    return pd.concat([results,row],ignore_index=True)

In [344]:
results = pd.DataFrame()

In [345]:
algo_list = [
                (BaselineOnly(),"Baseline"),
                (KNNBasic(),"k-NN"),
                (KNNWithMeans(),"Centered k-NN"),
                (SVD(),"SVD"),
                (SVDpp(),"SVD++"),
                (CoClustering(),"CoClustering"),
                (SlopeOne(),"SlopeOne"),
                (NMF(),"NMF")
            ]

In [346]:
for algo in algo_list:
    mse, rmse, mae, fit_times, test_times = nested_cv(algo[0],data)
    results = add_result(results,algo[1],mse,rmse,mae,fit_times,test_times)

In [347]:
results

Unnamed: 0,name,mse_mean,mse_std,rmse_mean,rmse_std,mae_mean,mae_std,fit_time,test_time
0,Baseline,1.235586,0.09911,1.110628,0.045722,0.905646,0.044856,0.007881,0.000799
1,k-NN,0.916038,0.082487,0.956144,0.042742,0.68812,0.031669,0.066126,0.038233
2,Centered k-NN,0.912171,0.09423,0.953785,0.049653,0.677288,0.041546,0.047881,0.027762
3,SVD,0.888929,0.091844,0.941483,0.050396,0.716209,0.040691,0.276458,0.003287
4,SVD++,0.679702,0.070315,0.823275,0.043817,0.594947,0.02896,3.686526,0.024308
5,CoClustering,1.086943,0.095587,1.041516,0.046766,0.770343,0.039558,0.523946,0.001567
6,SlopeOne,1.076973,0.120946,1.036056,0.059672,0.780094,0.045225,0.080931,0.022356
7,NMF,0.693287,0.098038,0.830423,0.060699,0.570171,0.035447,0.36208,0.001995


Pick out best algorithms and optimize parameters

In [348]:
param_grid = {
                "n_factors": [10, 15, 20,40],
                "n_epochs": [20, 40, 80, 120],
                "biased":[True,False],
                "random_state":[36]
            }
gs = GridSearchCV(NMF, param_grid, measures=["rmse"], cv=3)

gs.fit(data)
# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

0.60816638557169
{'n_factors': 40, 'n_epochs': 120, 'biased': False, 'random_state': 36}


In [349]:
algo = NMF(n_factors=10,n_epochs=120,biased=False,random_state=1)
trainset = data.build_full_trainset()
algo.fit(trainset)
algo.test(trainset.build_anti_testset())

[Prediction(uid=272, iid='Dependency Injection', r_ui=2.8846549066621, est=2.2302373569999254, details={'was_impossible': False}),
 Prediction(uid=272, iid='Implementierung', r_ui=2.8846549066621, est=2.87739849267839, details={'was_impossible': False}),
 Prediction(uid=272, iid='Gitlab', r_ui=2.8846549066621, est=3.0583279147326623, details={'was_impossible': False}),
 Prediction(uid=272, iid='CruiseControl', r_ui=2.8846549066621, est=2.7442236231595194, details={'was_impossible': False}),
 Prediction(uid=272, iid='Mediengestalter Digital & Print - Fachrichtung Gestaltung & Technik', r_ui=2.8846549066621, est=1.9280040362404511, details={'was_impossible': False}),
 Prediction(uid=272, iid='Prozessmanagement', r_ui=2.8846549066621, est=1.7630155861305865, details={'was_impossible': False}),
 Prediction(uid=272, iid='Teamviewer', r_ui=2.8846549066621, est=1.1982184932669682, details={'was_impossible': False}),
 Prediction(uid=272, iid='Continuous Integration (CI)', r_ui=2.8846549066621,

In [350]:
holdout_test = holdout_data.to_numpy()
algo.test(holdout_test)

[Prediction(uid=740, iid='Symfony', r_ui=1.0, est=1.2360322873728447, details={'was_impossible': False}),
 Prediction(uid=404, iid='JavaScript', r_ui=2.0, est=2.306234382867797, details={'was_impossible': False}),
 Prediction(uid=1295, iid='Open SSH', r_ui=5.0, est=4.253248542150006, details={'was_impossible': False}),
 Prediction(uid=884, iid='ITIL', r_ui=2.0, est=2.377583101710579, details={'was_impossible': False}),
 Prediction(uid=546, iid='Behaviour Driven Development (BDD)', r_ui=5.0, est=4.190755666695242, details={'was_impossible': False}),
 Prediction(uid=141, iid='UML', r_ui=1.0, est=1.4789521179549578, details={'was_impossible': False}),
 Prediction(uid=245, iid='LINQ', r_ui=5.0, est=4.598955085648132, details={'was_impossible': False}),
 Prediction(uid=557, iid='JSON', r_ui=4.0, est=3.904837867481369, details={'was_impossible': False}),
 Prediction(uid=785, iid='Test Driven Development (TDD)', r_ui=4.0, est=3.7012831994118223, details={'was_impossible': False}),
 Prediction

In [351]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [352]:
# top_n = get_top_n(predictions, n=10)

# # Print the recommended items for each user
# recommendations = pd.DataFrame()

# for uid, user_ratings in top_n.items():
#     recom = pd.DataFrame(index=[uid],data=[[iid for (iid, _) in user_ratings]])
#     #print([iid for (iid, _) in user_ratings])
#     recommendations = pd.concat([recommendations,recom])

# recommendations

In [353]:
train,test = surprise_train_test_split(holdout,test_size=0.99)
holdout_preds = algo.test(test)
accuracy.rmse(holdout_preds)

RMSE: 0.5961


0.5961168024885166