To Do's: 

- Convert the two code blocks that work with lists to arrays -> better performance

Books: 

https://www.taylorfrancis.com/books/edit/10.1201/9780367631888/recommender-systems-pavan-kumar-vairachilai-sirisha-potluri-sachi-nandan-mohanty

https://beluga.sub.uni-hamburg.de/vufind/Record/1656091313?rank=1

In [None]:
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from surprise import Reader, Dataset
from surprise import KNNBasic, SVD, CoClustering, SlopeOne
from surprise.model_selection import KFold, RepeatedKFold, cross_validate, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import time

In [None]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";")
df 

In [None]:
data = df

In [None]:
# So later functions work, it makes sense to rename persons from 1 to maxno. 
# Create dictionary which matches the employee
persons = data["person"].unique()
#np.arange(1,len(data["person"].unique())+1)
skills = sorted(data["skill"].unique())

In [None]:
# Build one binary skill vector per person: 1 if the person has the skill,
# 0 otherwise. The vector follows the order of `skills`.
skills_per_person = []
for person in data["person"].unique():
    # Look up this person's skills once instead of re-filtering `data`
    # for every single skill.
    owned_skills = set(data.loc[data["person"] == person, "skill"])
    skills_per_person.append(
        [1 if skill in owned_skills else 0 for skill in skills]
    )


In [None]:
matrix = pd.DataFrame(index=persons,columns=skills,data=skills_per_person)

# Recommender 

## Nearest Neighbors Approach

In [None]:
data = df#[~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]

In [None]:
data.head(20)

In [None]:
# Number the distinct person ids 1..N in the order returned by
# `data["person"].unique()`.
employee_dict = dict(enumerate(data["person"].unique(), start=1))

In [None]:
# So later functions work, it makes sense to rename persons from 1 to maxno. 
# Create dictionary which matches the employee
persons = data["person"].unique()
#np.arange(1,len(data["person"].unique())+1)
skills = sorted(data["skill"].unique())

In [None]:
# Build one binary skill vector per person: 1 if the person has the skill,
# 0 otherwise. The vector follows the order of `skills`.
skills_per_person = []
for person in data["person"].unique():
    # Look up this person's skills once instead of re-filtering `data`
    # for every single skill.
    owned_skills = set(data.loc[data["person"] == person, "skill"])
    skills_per_person.append(
        [1 if skill in owned_skills else 0 for skill in skills]
    )


In [None]:
matrix = pd.DataFrame(index=persons,columns=skills,data=skills_per_person)

In [None]:
matrix

Employee-employee approach

Similarity between employees: because the data is dichotomous (binary), Jaccard similarity is used to compute the similarity between employees.

In [None]:
def jaccard_binary(x, y):
    """Return the Jaccard similarity of two binary vectors.

    The similarity is the number of positions set in both vectors divided by
    the number of positions set in at least one of them.

    Parameters:
    x, y = array-likes of equal length; non-zero entries count as "set"

    Output:
    float between 0 and 1. If neither vector has any set position the
    similarity is defined as 0.0 instead of dividing by zero.
    """
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    union_size = union.sum()
    # Two all-zero vectors share nothing; avoid the 0/0 -> NaN result.
    if union_size == 0:
        return 0.0
    return intersection.sum() / float(union_size)

In [None]:
def compute_jaccard_sim(data, employee):
    """Compute the Jaccard similarity between one employee and all others.

    Parameters:
    data = DataFrame with one row per employee (index) and one binary column
           per skill
    employee = index label of the employee to compare against the rest

    Output:
    DataFrame with a single row labelled `employee` and one column per other
    employee, holding the similarity to that employee.
    """
    employee_data = data.loc[employee]
    sim_data = data.drop(employee, axis=0)
    # Collect all similarities first and build the frame once; concatenating
    # inside the loop copies the growing frame on every iteration.
    similarities = {
        emp: [jaccard_binary(employee_data, sim_data.loc[emp])]
        for emp in sim_data.index
    }
    return pd.DataFrame(similarities, index=[employee])

In [None]:
def compute_n_nearest_neighbors(data, employee, n):
    """Return the labels of the n employees most similar to `employee`.

    Parameters:
    data = DataFrame with one row per employee and one binary column per skill
    employee = index label of the employee whose neighbours are wanted
    n = number of neighbours to return

    Output:
    list of index labels ordered from most to least similar. Ties keep the
    order in which the employees appear in `data`. If fewer than n other
    employees have a defined similarity, all of them are returned instead of
    raising an error.
    """
    similarities = compute_jaccard_sim(data, employee).iloc[0]
    # A stable sort keeps tied employees in their original order, which is
    # the order repeated idxmax() calls would pick them in.
    ranked = similarities.dropna().sort_values(ascending=False, kind="mergesort")
    return ranked.index[:max(n, 0)].to_list()

In [None]:
def predict_skills_for_employee(data, employee, n_neighbors):
    """Predict skill scores for one employee from their nearest neighbours.

    Skills the employee already has (non-zero entries) keep their value.
    Every skill with value 0 is replaced by the mean of that skill's column
    over the `n_neighbors` most similar employees.

    Parameters:
    data = DataFrame with one row per employee and one column per skill
    employee = index label of the employee to predict for
    n_neighbors = number of nearest neighbours to average over

    Output:
    1-D float array with one entry per column of `data`.
    """
    neighbors = compute_n_nearest_neighbors(data, employee, n_neighbors)
    values_of_employee = data.loc[employee].to_numpy()
    neighbors_matrix = data.loc[neighbors].to_numpy()
    # Unweighted column mean over the neighbours; a similarity-weighted
    # average might be more accurate.
    neighbor_means = neighbors_matrix.mean(axis=0)
    # Vectorised replacement for the element-wise np.append loop.
    predicted_values = np.where(values_of_employee == 0, neighbor_means, values_of_employee)
    return predicted_values.astype(float)

In [None]:
def predict_skills_for_matrix(data, n_neighbors):
    """Predict skill scores for every employee in `data`.

    Parameters:
    data = DataFrame with one row per employee and one column per skill
    n_neighbors = number of nearest neighbours used for each prediction

    Output:
    DataFrame with the same index and columns as `data`, holding the
    predicted values returned by predict_skills_for_employee for each row.
    """
    predictions = [
        predict_skills_for_employee(data, emp, n_neighbors) for emp in data.index
    ]
    if predictions:
        # Stack all rows at once; no placeholder row has to be removed.
        return_data = np.vstack(predictions)
    else:
        # No employees: keep a well-formed (0, n_skills) array.
        return_data = np.empty((0, len(data.columns)))
    return pd.DataFrame(index=data.index, columns=data.columns, data=return_data)

In [None]:
#predict_skills_for_matrix(matrix,10)

## Matrix Factorization

Create matrix with values between 1 and 5 for recommender trials

Values between 1 and 5 should not be arbitrary. As the weight for a skill's score, the frequency of the skill's category within the employee's skill set is used. For instance, if employee 12 has four skills in the category "programming language" and one skill in "databases", each programming skill will be rated with a correspondingly high score.

In [None]:
skills_to_category = pd.read_csv("mitarbeiterportal-skills_2022-06-21.csv",sep=";",header=None)
category_dict = dict(zip(skills_to_category[0],skills_to_category[1]))

In [None]:
def scaler_1_5(x, old_max, old_min):
    """Linearly rescale x from the range [old_min, old_max] to 1..5.

    Parameters:
    x = value to rescale
    old_max = largest value of the original range
    old_min = smallest value of the original range

    Output:
    rounded value between 1 and 5; 1 if the original range has zero width.
    """
    old_range = old_max - old_min
    if old_range == 0:
        return 1
    new_range = 5 - 1
    # Shift by old_min (not by a constant 1) so that old_min maps to 1 and
    # old_max maps to 5 for any input range.
    return round((((x - old_min) * new_range) / old_range) + 1)

In [None]:
# Create a DataFrame with one row per category and one column per employee.
# Each cell holds the employee's category count rescaled with scaler_1_5,
# or 0 if the category does not occur in the employee's skill set.
categories = data["category"].unique()
category_columns = {}
for p in persons:
    # count categories for this employee
    category_count = data.loc[data["person"] == p, "category"].value_counts()
    # Compute the range once per employee instead of once per element.
    count_min, count_max = category_count.min(), category_count.max()
    scaled_count = category_count.apply(lambda x: scaler_1_5(x, count_max, count_min))
    category_columns[p] = np.array(
        [scaled_count.get(cat, 0) for cat in categories], dtype=float
    )
# Build the frame once rather than concatenating one column per employee.
category_per_employee = pd.DataFrame(category_columns, index=categories)

In [None]:
# Build one weighted skill vector per person: the person's category score
# for every skill they have, 0 for every skill they do not have.
skills_per_person_weighted = []
for p in persons:
    # Look up this person's skills once instead of re-filtering `data`
    # for every single skill.
    owned_skills = set(data.loc[data["person"] == p, "skill"])
    person_scores = category_per_employee[p]
    skills_per_person_weighted.append(
        [
            person_scores[category_dict[skill]] if skill in owned_skills else 0
            for skill in skills
        ]
    )

In [None]:
matrix_mod = pd.DataFrame(index=persons,columns=skills,data=skills_per_person_weighted)
matrix_mod

https://www.kaggle.com/code/ibtesama/getting-started-with-a-movie-recommendation-system

In [None]:
from surprise import Reader, Dataset, SVD, accuracy
#from surprise.model_selection import cross_validate, train_test_split

In [None]:
matrix_mod[matrix_mod == 0].count().sum()/matrix_mod[matrix_mod != 0].count().sum()

11 times more 0 values than values with a rating.

In [None]:
# Map a 0-based position to each skill name, in sorted order.
skill_alphabetically = sorted(skills_to_category[0].to_list())
skills_dict = dict(enumerate(skill_alphabetically))

In [None]:
employee_dict_inv = {v: k for k, v in employee_dict.items()}

Create data structure suitable for surprise library:

In [None]:
# Build the (Employee, Skill, Rating) table expected by surprise.
# NOTE(review): assumes df has exactly three columns with the person id in
# column 0 and the skill in column 1 - confirm against the CSV.
# copy=True guarantees that the in-place writes below never touch df itself.
df_np = df.to_numpy(copy=True)

for row in df_np:
    # Replace the third column with the rating stored in matrix_mod for this
    # (person, skill) pair.
    row[2] = matrix_mod.at[row[0], row[1]]

df_rated = pd.DataFrame(df_np)
df_rated.columns = ["Employee","Skill","Rating"]
df_rated

In [None]:
df_category_rating = pd.concat([df_rated,df["category"]],axis=1)

In [None]:
reader = Reader()
svd_data = Dataset.load_from_df(df_rated,reader)

RMSE >= 0.75 -> bueno <br>
MAE? no sé

In [None]:
svd = SVD()
cv = cross_validate(svd, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
trainset, testset = train_test_split(svd_data, test_size=0.25)
svd.fit(trainset)

In [None]:
predictions = svd.test(testset)

In [None]:
def get_skills_of_employee(emp_id):
    """Plot the category counts of one employee and return their rows.

    Parameters:
    emp_id = value to match against the "Employee" column of
             df_category_rating

    Output:
    the rows of df_category_rating belonging to that employee; as a side
    effect a bar chart of the employee's category counts is drawn.
    """
    is_employee = df_category_rating["Employee"] == emp_id
    category_counts = df_category_rating[is_employee]["category"].value_counts()
    category_counts.plot(kind="bar",ylabel="Count",title="Skill Category Portfolio Employee "+str(emp_id))
    return df_category_rating[is_employee]

Manual prediction testing: input a skill that the employee already has and compare the prediction with the actual value.

In [None]:
# Raw ids passed to predict() must have the same type as the values in
# df_rated. A string id that does not match any raw id is treated as an
# unknown user, so the prediction would not be personalised.
# NOTE(review): assumes employee ids in df_rated are integers, as in
# get_skills_of_employee(12) - confirm.
employee_id = 1479
skill_id = ".NET Core"

# get a prediction for specific users and items.
pred = svd.predict(employee_id, skill_id, r_ui=5, verbose=True)

In [None]:
get_skills_of_employee(12)

HOW TO EVALUATE RECOMMENDATIONS? <BR>
HOW TO IMPLEMENT A NEAT WAY OF GETTING RECOMMENDATIONS / RECOMMENDING EMPLOYEES FOR CERTAIN SKILLS?

In [None]:
# Create item-based representation of DataFrame (cause we might need it later?)
df_item_based = df_category_rating.sort_values(by="Skill")[["Skill","Employee","Rating","category"]]
item_based_data = df_item_based[["Skill","Employee","Rating"]].to_numpy()

## Comparing Different Recommender Systems

First, let's pick some algorithms to include in our ensemble. We'll choose four.
-  Collaborative Filtering
-  Matrix Factorization
-  collaborative filtering with co-clustering
-  Collaborative Filtering based on the popular Slope One Algorithm

https://www.kaggle.com/code/robottums/hybrid-recommender-systems-with-surprise/notebook

In [None]:
from surprise import Reader, Dataset
from surprise import KNNBasic, KNNWithMeans, SVD, CoClustering, SlopeOne, SVDpp, NMF, BaselineOnly
from surprise.model_selection import KFold, RepeatedKFold, cross_validate, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
import time

Idea: Try to recreate the cross-validation setup from the link above and compare different recommendation techniques. 
Procedure: run cross-validation and record the KPIs (RMSE etc.), then build a comprehensive recommendation function for the best-performing recommender.

In [None]:
reader = Reader(rating_scale=(1,5))
svd_data = Dataset.load_from_df(df_rated,reader)

In [None]:
NUM_TRIALS = 5
NUM_OUTER_SPLITS = 5

In [None]:
def nested_cv(algo, data):
    """Run repeated shuffled K-fold cross-validation for one algorithm.

    The cross-validation is repeated NUM_TRIALS times with NUM_OUTER_SPLITS
    folds each. Every trial uses a different shuffle seed, so the trials
    evaluate different splits instead of repeating the same one.

    Parameters:
    algo = algorithm instance passed on to cross_validate
    data = dataset passed on to cross_validate

    Output:
    tuple (rmse, mae, fit_times, test_times) of arrays with shape
    (NUM_TRIALS, NUM_OUTER_SPLITS): one row per trial, one column per fold.
    """
    # One column per fold, not per trial.
    shape = (NUM_TRIALS, NUM_OUTER_SPLITS)
    rmse = np.zeros(shape)
    mae = np.zeros(shape)
    fit_times = np.zeros(shape)
    test_times = np.zeros(shape)

    for i in range(NUM_TRIALS):
        # Offset the seed per trial; trial 0 keeps the original seed 36.
        outer_cv = KFold(n_splits=NUM_OUTER_SPLITS,shuffle=True,random_state=36 + i)
        cv_results = cross_validate(algo=algo,data=data,measures=["rmse","mae"],cv=outer_cv,n_jobs=8)
        rmse[i] = cv_results["test_rmse"]
        mae[i] = cv_results["test_mae"]
        fit_times[i] = cv_results["fit_time"]
        test_times[i] = cv_results["test_time"]

    return rmse, mae, fit_times, test_times

In [None]:
def add_result(results, name, accs, baccs, fit_times, test_times):
    '''
    Append one aggregated row of evaluation results to a DataFrame.
    Parameters:
    results = DataFrame which the new row should be appended to
    name = string describing the estimator whose values are added
    accs = array of values summarised in the rmse_* columns
    baccs = array of values summarised in the mae_* columns
    fit_times = array of fitting times (only the mean is stored)
    test_times = array of testing times (only the mean is stored)
    Output:
    new DataFrame: results plus one additional row, with a fresh 0..n-1 index
    '''
    summary = {"name": name}
    # mean/std/min/max for both score arrays, in the same column order
    for prefix, scores in (("rmse", accs), ("mae", baccs)):
        for stat in ("mean", "std", "min", "max"):
            summary[f"{prefix}_{stat}"] = getattr(scores, stat)()
    summary["fit_time"] = fit_times.mean()
    summary["test_time"] = test_times.mean()
    row = pd.DataFrame(summary, index=[0])
    return pd.concat([results, row], ignore_index=True)

In [None]:
results = pd.DataFrame()

In [None]:
algo_list = [(BaselineOnly(),"Baseline"),
                (KNNBasic(),"k-NN"),
                (KNNWithMeans(),"Centered k-NN"),
                (SVD(),"SVD"),
                (SVDpp(),"SVD++"),
                (CoClustering(),"CoClustering"),
                (SlopeOne(),"SlopeOne"),
                (NMF(),"NMF")
            ]

In [None]:
for algo in algo_list:
    rmse, mae, fit_times, test_times = nested_cv(algo[0],svd_data)
    results = add_result(results,algo[1],rmse,mae,fit_times,test_times)

In [None]:
results

In [None]:
def get_best_params(algo, grid, data):
    """Grid-search an algorithm several times and return the best parameters.

    The grid search is repeated NUM_TRIALS times, each with a fresh
    RepeatedKFold splitter. The parameter set of the trial with the lowest
    RMSE is returned, rather than whatever the last trial happened to find.

    Parameters:
    algo = algorithm class handed to GridSearchCV as algo_class
    grid = parameter grid handed to GridSearchCV as param_grid
    data = dataset the grid search is fitted on

    Output:
    dict of the best parameters with respect to RMSE; an empty dict if
    NUM_TRIALS is 0.
    """
    best_params = {}
    best_rmse = float("inf")
    for _ in range(NUM_TRIALS):
        cv = RepeatedKFold(n_splits=5)
        model = GridSearchCV(algo_class=algo,param_grid=grid,cv=cv,measures=["rmse", "mae"],refit="rmse")
        model.fit(data)
        # Keep the best trial seen so far; lower RMSE is better.
        if model.best_score["rmse"] < best_rmse:
            best_rmse = model.best_score["rmse"]
            best_params = model.best_params["rmse"]
    return best_params

In [None]:
# algo = SVDpp
# grid = {"n_factors":[10,50,100],
#         "n_epochs":[10,20,40]
#         #,"init_mean":[0,0.5,1],
#         # "init_std_dev":[0.1,0.3]
#         }
# params = get_best_params(algo,grid,svd_data)

# algo = SVDpp(n_factors=params["n_factors"],n_epochs=params["n_epochs"])#,biased=params["biased"])

# rmse, mae, fit_times, test_times = nested_cv(algo,svd_data)
# results = add_result(results,"SVD++_best_params",rmse,mae,fit_times,test_times)

In [None]:
# algo = NMF
# grid = {"n_factors":[10,50,100],"n_epochs":[20,40,80],"biased":[True,False]}
# params = get_best_params(algo,grid,svd_data)

# algo = NMF(n_factors=params["n_factors"],n_epochs=params["n_epochs"],biased=params["biased"])

# rmse, mae, fit_times, test_times = nested_cv(algo,svd_data)
# results = add_result(results,"NMF_best_params",rmse,mae,fit_times,test_times)

In [None]:
results

To Do's: 
-  Explore sim_options of some recommenders
-  Think of useful param grid options
-  Silence the verbose output of KNNBasic
-  Choose 1 algorithm
 -  Think of an evaluation method 
 -  Work with actual recommendations (see which skills get recommended) and check whether they are sensible 

-  Binary problem? Can we apply this stuff to it? 

In [None]:
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Collect the n highest-estimated items for every user.

    Args:
        predictions(iterable of 5-tuples): Entries of the form
            (uid, iid, true_r, est, details), e.g. the list returned by the
            test method of an algorithm.
        n(int): Maximum number of recommendations kept per user. Default
            is 10.

    Returns:
    A defaultdict where keys are user (raw) ids and values are lists of
    tuples [(raw item id, rating estimation), ...] with at most n entries,
    ordered by descending estimation.
    """

    # Group the (item, estimate) pairs by user.
    per_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        per_user[uid].append((iid, est))

    # Rank each user's items by estimate and keep only the best n.
    for uid in per_user:
        ranked = sorted(per_user[uid], key=lambda pair: pair[1], reverse=True)
        per_user[uid] = ranked[:n]

    return per_user

Next steps: 
Define the model according to the best algorithm with the best parameters, try out the function above with predict and then output the top_n, possibly compare against the baseline function, and write a function that recommends things to NEW users.