In [1]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.model_selection import train_test_split as scikit_train_test_split
from surprise import Reader, Dataset, accuracy
from surprise import KNNBasic, KNNWithMeans, SVD, CoClustering, SlopeOne, SVDpp, NMF, BaselineOnly
from surprise.model_selection import KFold, RepeatedKFold, cross_validate, GridSearchCV, LeaveOneOut
from surprise.model_selection import train_test_split as surprise_train_test_split
from sklearn.pipeline import Pipeline
import time

In [2]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";")

Declare various aiding variables

In [3]:
persons = df["person"].unique()
skills = sorted(df["skill"].unique())
categories = df["category"].unique()

In [4]:
employee_dict = {}
for n in range(1,len(persons)+1):
    employee_dict[n] = persons[n-1]

In [5]:
skills_to_category = pd.read_csv("mitarbeiterportal-skills_2022-06-21.csv",sep=";",header=None)
category_dict = dict(zip(skills_to_category[0],skills_to_category[1]))

Insert ratings for skills according to frequency of underlying categories in the skill protfolio on an employee: 

In [6]:
def scaler_1_5(x,old_max,old_min):
    OldRange = (old_max - old_min)  
    if (OldRange == 0):
        return 1
    else:
        NewRange = (5 - 1)  
        return round((((x - 1) * NewRange) / OldRange) + 1)

In [7]:
# create dataframe for each employee and their category count
category_per_employee = pd.DataFrame(index=categories)
for p in persons:
    p_skills = df[df["person"] == p]
    # count categories for each employee
    category_count = p_skills["category"].value_counts()
    category_count = category_count.apply(lambda x:scaler_1_5(x,category_count.max(),category_count.min()))
    skill_values = np.array([])
    # add count for each category to an array and 0 if category wasn't found (and therefore is not part of resptive employee's skillset)
    for cat in categories:
        try:
            skill_values = np.append(skill_values,category_count[cat])
        except KeyError:
            skill_values = np.append(skill_values,0)
    # add employee data to dataframe
    p_skills_df = pd.DataFrame(index=categories,columns=[p],data=skill_values)
    category_per_employee = pd.concat([category_per_employee,p_skills_df],axis=1)

In [8]:
df_rated = pd.DataFrame()
for p in persons:
    # get employee
    df_subset = df[df["person"] == p].copy()
    # get skills of employee
    p_skills = df_subset["skill"].to_numpy()
    # initialize empty array for ratings
    ratings = np.zeros(len(p_skills))
    # fill array for each skill with respective category value 
    for i in range(0,len(p_skills)): 
        ratings[i] = category_per_employee.loc[category_dict[p_skills[i]]][p]
    # append ratings to subset and drop category column
    df_subset["rating"] = ratings
    df_subset.drop("category",axis=1,inplace=True)
    # append user to df
    df_rated = pd.concat([df_rated,df_subset],ignore_index=True)

In [9]:
df_rated

Unnamed: 0,person,skill,rating
0,12,Windows NT/2000/XP,4.0
1,12,MAC OS X,4.0
2,12,Windows 7,4.0
3,12,Windows 8,4.0
4,12,iOS,4.0
...,...,...,...
24581,1488,Deutsch,1.0
24582,1488,Spring-JPA,5.0
24583,1488,Maven,3.0
24584,1488,Subversion,3.0


## Comparing Different Recommender Systems

Define holdout data: We want to take out 2 skills with high rating of 40% of users. Skills and users should be chosen randomly. 

In [268]:
perc_users = 40
n_skills = 2
sample_size = int(np.ceil((len(persons)/100)*perc_users))

random.seed(1)
sample = random.sample(population=persons.tolist(),k=sample_size)

min_rating = 4

In [279]:
holdout = pd.DataFrame()
user_sample = []
for person in sample: 
    df_subset = df_rated[(df_rated["person"] == person) & (df_rated["rating"] >= min_rating)]
    # try to get two random high ranking skills (if the employee doesn't have high ranking skills they will be skipped)
    try:
        holdout_employee = df_subset.sample(n=n_skills,random_state=1)
        user_sample.append(person)
        df_rated.drop(holdout_employee.index,inplace=True)
        holdout = pd.concat([holdout,holdout_employee],ignore_index=1)
    except ValueError:
        pass 
holdout

Unnamed: 0,person,skill,rating
0,233,.NET Framework,5.0
1,233,Silverlight,5.0
2,921,PHP,5.0
3,921,Groovy,4.0
4,883,JavaScript,4.0
...,...,...,...
145,491,Dependency Injection,4.0
146,888,UltraDev,5.0
147,888,HTML5,4.0
148,153,Magento,5.0


In [272]:
reader = Reader(rating_scale=(1,5)) 
data = Dataset.load_from_df(df_rated,reader)

In [273]:
NUM_TRIALS = 3
NUM_SPLITS = 5

In [275]:
def gridsearch_cv(algo_class,algo_name,param_grid,data,best_model_dict,best_params_dict):
    start = time.time()
    rmse = np.zeros(NUM_TRIALS)
    mse = np.zeros(NUM_TRIALS)
    mae = np.zeros(NUM_TRIALS)
    models = []
    params = []

    # hier eventuell gridsearch rein? 
    for i in range(NUM_TRIALS):
        print("Running CV for model", algo_name ,"in Iteration:", i ,"at", time.time()-start)
        cv = LeaveOneOut(
                        n_splits=NUM_SPLITS,
                        random_state=i,
                        min_n_ratings=i
                    )
                        # COMMENT: min_n_ratings -> what's the impact of this parameter? 
        gs = GridSearchCV(
                        algo_class,
                        param_grid, 
                        measures=["rmse","mse","mae"], 
                        cv=cv,refit="rmse"
                    )
        gs.fit(data)
        rmse[i] = gs.best_score["rmse"]
        mse[i] = gs.best_score["mse"]
        mae[i] = gs.best_score["mae"]
        models.append(gs.best_estimator["rmse"])
        params.append(gs.best_params["rmse"])

    # hier noch erläutern, warum rmse als ausschlaggebene measure gewählt wird
    best_model_dict[algo_name] = models[np.argmin(rmse)]
    best_params_dict[algo_name] = params[np.argmin(rmse)]

    print("Total time: ", (time.time()-start), "sec.")
    return rmse, mse, mae 

In [276]:
def add_result(results, name, rmse, mse, mae):
    '''
    Function adding the results returned by nested_cv to a dataframe. 
    Results will be aggregated for better comparison. 
    Parameters: 

    Output:
    results DataFrame containing an additional row
    '''
    row = pd.DataFrame({
        "name":name,
        "rmse_mean":rmse.mean(), 
        "rmse_std":rmse.std(), 
        "mse_mean":mse.mean(), 
        "mse_std":mse.std(), 
        "mae_mean":mae.mean(), 
        "mae_std":mae.std()
        },index=[0])
    return pd.concat([results,row],ignore_index=True)

In [188]:
results = pd.DataFrame()
best_model_dict = {}
best_params_dict = {}

In [189]:
algo_list = [
                (BaselineOnly,"Baseline",{
                                                "verbose":[False]
                                    }),
                (KNNBasic,"k-NN",{
                                                "k": [20, 40, 60, 80],
                                                "min_k": [1, 5, 10, 20],
                                                "verbose":[False]
                                    }),
                (KNNWithMeans,"Centered k-NN",{
                                                "k": [20, 40, 60, 80],
                                                "min_k": [1, 5, 10, 20],
                                                "verbose":[False]
                                    }),
                (SVD,"SVD",{
                                                "n_factors": [20, 40, 60, 80, 100],
                                                "n_epochs": [10, 20, 40, 60],
                                                "biased":[True,False],
                                                "random_state":[1]
                                    }),
                (SVDpp,"SVD++",{
                                                "n_factors": [20, 40, 60, 80, 100],
                                                "n_epochs": [10, 20, 40, 60],
                                                "random_state":[1]
                                    }),
                (CoClustering,"CoClustering",{
                                                "n_cltr_u": [2, 3, 4],
                                                "n_cltr_i": [2, 3, 4],
                                                "n_epochs":[10, 20, 40, 60]
                                    }),
                (SlopeOne,"SlopeOne",{
                                    }),
                (NMF,"NMF",{
                                                "n_factors": [10, 15, 20, 40],
                                                "n_epochs": [20, 40, 80, 120],
                                                "biased":[True,False],
                                                "random_state":[1]
                                    })
            ]

In [190]:
for algo in algo_list:
    rmse, mse, mae = gridsearch_cv(algo[0],algo[1],algo[2],data,best_model_dict,best_params_dict)
    results = add_result(results,algo[1],rmse, mse, mae)

Running CV for model Baseline in Iteration: 0 at  0.0
Running CV for model Baseline in Iteration: 1 at  0.20434308052062988
Running CV for model Baseline in Iteration: 2 at  0.37715911865234375
Total time:  0.5441522598266602 sec.
Running CV for model k-NN in Iteration: 0 at  0.0
Running CV for model k-NN in Iteration: 1 at  6.3359129428863525
Running CV for model k-NN in Iteration: 2 at  12.098838329315186
Total time:  17.72901439666748 sec.
Running CV for model Centered k-NN in Iteration: 0 at  0.0
Running CV for model Centered k-NN in Iteration: 1 at  6.360764265060425
Running CV for model Centered k-NN in Iteration: 2 at  13.28922724723816
Total time:  19.365179300308228 sec.
Running CV for model SVD in Iteration: 0 at  0.0
Running CV for model SVD in Iteration: 1 at  28.520416736602783
Running CV for model SVD in Iteration: 2 at  56.90075922012329
Total time:  88.70752930641174 sec.
Running CV for model SVD++ in Iteration: 0 at  0.0


KeyboardInterrupt: 

In [187]:
best_params_dict

{'Baseline': {'verbose': False},
 'k-NN': {'k': 20, 'min_k': 1, 'verbose': False},
 'Centered k-NN': {'k': 20, 'min_k': 1, 'verbose': False},
 'SVD': {'n_factors': 40, 'n_epochs': 60, 'biased': False, 'random_state': 1},
 'SVD++': {'n_factors': 60, 'n_epochs': 60},
 'CoClustering': {'n_cltr_u': 4, 'n_cltr_i': 4, 'n_epochs': 40},
 'SlopeOne': {},
 'NMF': {'n_factors': 40, 'n_epochs': 120, 'biased': False, 'random_state': 1}}

Select best performing model and fit it on full data:

In [313]:
model = NMF(n_factors= 40, n_epochs = 120, biased = False, random_state= 1)
trainset = data.build_full_trainset()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.NMF at 0x1f4de2be380>

## Evaluation

Idea for evaluating the recoms: define holdout a bit differently. Pick random users (say 30% of all users) and remove a prior defined amount of skills (say 2) from their skillset. The skills should have a high rating (say 4 or 5). If these skills then show up as recommendations, we have an objective indicator that the recommendations are somewhat meaningful. 

Evaluation using holdout data: 

In [356]:
from collections import defaultdict

def get_top_n(predictions, n):
    """Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = {}
    
    for uid, iid, true_r, est, _ in predictions:
        try:
                top_n[uid].append((iid, est))
        except KeyError:
                top_n[uid] = [(iid, est)]

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

In [311]:
def get_recoms_for_employee(emp_id,n):
    df_subset = df_rated[df_rated["person"] == 233]
    recom_df = pd.DataFrame()
    recom_df["skill"] = skills
    recom_df["person"] = emp_id
    employee_ratings = np.zeros(len(skills))
    for i in range(0,len(skills)): 
        try:
            employee_ratings[i] = df_subset[df_subset["skill"] == skills[i]]["rating"]
        except ValueError:
            employee_ratings[i] = np.nan
    recom_df["rating"] = employee_ratings
    recom_df = recom_df[["person","skill","rating"]]
    #return recom_df.to_numpy()
    preds = model.test(recom_df.to_numpy())
    return get_top_n(preds,n)

In [315]:
preds = model.test(trainset.build_anti_testset())

In [373]:
top_n = get_top_n(preds, n=10)

# Print the recommended items for each user
recommendations = pd.DataFrame()

for uid, user_ratings in top_n.items():
    recom = pd.DataFrame(index=[uid],data=[[iid for (iid, _) in user_ratings]])
    #print([iid for (iid, _) in user_ratings])
    recommendations = pd.concat([recommendations,recom])

recommendations

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
12,StyleReport,Microsoft Enterprise Library,Smartedit,Security,SAP Commerce 6.x,Backoffice Konfiguration,Spartacus,Promotion / Drools Engine,Produktkonfigurator,Omni Commerce Connect (OCC) Rest APIs
13,Microsoft Enterprise Library,NHibernate,Kendo UI,ADO.NET,NUnit,DevExpress,Windows Communication Foundation (WCF),Infragistics,.NET Core,UX Strategie
15,Styled Components,NGRX,Browser Technologien,Browser Debugging,BEM - Block Element Modifier,ITCSS,Storybook,RxJS,SCSS,Angular Material
16,Angular Material,Storybook,ITCSS,Browser Debugging,NGRX,Node.js,Bootstrap,Jest,Redux,Angular (2 und höher)
17,Angular Material,BEM - Block Element Modifier,Bootstrap,Web Components,Express.js,Stylelint,Tailwind CSS,SCSS,Storybook,Jest
...,...,...,...,...,...,...,...,...,...,...
1467,TypoScript,Java,C#,Perl,Python,C/C++,Ruby,Markdown,Assembler,Pascal
1474,AngularJS,Webpack,Web Components,Microsoft Enterprise Library,Magento,RxJS,Redux,Express.js,Angular Material,Browser Debugging
1475,Windows NT/2000/XP,MAC OS X,Windows 7,Windows 8,Java EE,JUnit,Hibernate,JDBC,Log4J,Spring Boot
1479,ASP Generalist,Windows Forms,NHibernate,Windows Communication Foundation (WCF),Sharepoint Entwicklung,.NET Compact Framework,ASP.NET,LINQ,ASP.NET MVC-Framework,Silverlight


In [374]:
holdout_employees = sorted(holdout["person"].unique())

In [377]:
holdout.sort_values(by="person")

Unnamed: 0,person,skill,rating
104,12,Log4J,4.0
105,12,Team Foundation Server (TF Server),4.0
15,13,ASP.NET WebAPI,5.0
14,13,.NET Framework,5.0
129,21,CI/CD Entwicklung,4.0
...,...,...,...
95,1370,HTML,5.0
42,1377,Windows 7,5.0
43,1377,MAC OS X,5.0
29,1413,jQuery,4.0


In [375]:
recommendations.loc[holdout_employees]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
12,StyleReport,Microsoft Enterprise Library,Smartedit,Security,SAP Commerce 6.x,Backoffice Konfiguration,Spartacus,Promotion / Drools Engine,Produktkonfigurator,Omni Commerce Connect (OCC) Rest APIs
13,Microsoft Enterprise Library,NHibernate,Kendo UI,ADO.NET,NUnit,DevExpress,Windows Communication Foundation (WCF),Infragistics,.NET Core,UX Strategie
21,Angular Material,Stylelint,Castor,TestNG,Browser Debugging,Jakarta Commons,Xerces,TopLink,Oracle XDK,osCommerce
25,Tailwind CSS,osCommerce,Browser Debugging,Vue.js,xtCommerce,SCSS,Angular (2 und höher),Yii,Magento,NGRX
42,AngularJS,Jest,Web Components,Angular Material,Browser Debugging,xtCommerce,RxJS,Express.js,Redux,Stylelint
...,...,...,...,...,...,...,...,...,...,...
1330,JDBC,Log4J,StyleReport,Java ME,EJB 3,Castor,Velocity,Jakarta Commons,Hibernate,JSF
1341,StyleReport,Log4J,Spring-JPA,JDBC,Castor,EJB 3,TestNG,Hibernate,JSF,Xerces
1370,Magento,Web Components,TypeScript,HTML5,Browser Debugging,SCSS,Vue.js,Angular (2 und höher),Tailwind CSS,RxJS
1377,Windows Small Business Server,Microsoft Enterprise Library,Symbian,MAC OS 7/8/9,.NET Core,Novell,Windows Phone 7,iOS,Entity Framework,Windows Presentation Foundation (WPF)


In [358]:
get_recoms_for_employee(233,3)

{233: [('Silverlight', 4.9034593175791),
  ('Microsoft Enterprise Library', 4.864950150441453),
  ('Windows Identity Foundation (WIF)', 4.828327256730565)]}