In [1]:
import implicit
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.utils import nonzeros

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

import scipy.sparse

from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, GridSearchCV, cross_validate,KFold, RepeatedKFold

import time

from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, fbeta_score, precision_score
from sklearn.metrics import make_scorer
from sklearn.metrics import ndcg_score
from implicit.evaluation import train_test_split, precision_at_k, mean_average_precision_at_k, ndcg_at_k


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [3]:
data = df.drop(columns=["category"])
data

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS
...,...,...
24581,1488,Deutsch
24582,1488,Spring-JPA
24583,1488,Maven
24584,1488,Subversion


In [4]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [5]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preprocessing

In [6]:
from sklearn.model_selection import LeaveOneOut

In [15]:
data["person"] = data["person"].astype("category")
data["skill"] = data["skill"].astype("category")
data.dtypes

person    category
skill     category
dtype: object

In [16]:
class MatrixTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, confidence=1):
        self.confidence = confidence

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        matrix =     coo_matrix((np.ones(X.shape[0]),
                           (X.iloc[:,0].cat.codes.copy(),
                            X.iloc[:,1].cat.codes.copy()))) * self.confidence
        return matrix.reshape(matrix.shape[0], -1)

In [137]:
class ALSEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, factors=50,
                       regularization=0.01,
                       iterations=10,
                       filter_seen=True
                       ,random_state=1):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.filter_seen = filter_seen
        self.random_state = random_state

    def fit(self, X, y=None):
        self.model = AlternatingLeastSquares(factors=self.factors,
                                             regularization=self.regularization,
                                             iterations=self.iterations,
                                             dtype=np.float32,
                                             use_native=True,
                                             random_state=self.random_state)
        self.model.fit(X)
        if self.filter_seen:
            self.fit_X = X
        return self

    def predict(self, X, y=None):
        predictions = np.dot(self.model.item_factors, self.model.user_factors.T)
        return np.asarray(predictions.T.tolist())

    def fit_predict(self, X, y=None):
        self.model = self.fit(X)
        prediction = self.predict(X).reshape(1,-1)
        return prediction


    def score (self, X, K = 5 ):
        predicited_values =  self.predict(X).reshape(1,-1)
        actual_values = X.todense().reshape(1,-1)
        return ndcg_score(predicited_values,actual_values, k = K)

## Pipeline

In [22]:
def get_pipe(estimator):
    return Pipeline([('preprocessor',MatrixTransformer()),('estimator',estimator)]) #, ('scaler',StandardScaler()),('estimator',estimator)] 


In [23]:
NUM_TRIALS = 3
NUM_INNER_REPEATS = 3
NUM_INNER_SPLITS = 3
NUM_OUTER_SPLITS = 3


In [24]:
als_grid = [{"estimator__factors":[5,10]}] #"factors":[5,10,20,50,70],"iterations":[20,30, 50,70], 'regularization':[0.005,0.01,0.02] #"factors":[5],"iterations":[20]

In [25]:
ndcg_scorer = make_scorer(ndcg_score,k = 5)

In [121]:

def nested_cv(estimator,grid, data):
    
    start = time.time()
    ndcg = np.zeros((NUM_TRIALS, NUM_INNER_REPEATS))

    fit_times = np.zeros((NUM_TRIALS, NUM_INNER_REPEATS))
    test_times = np.zeros((NUM_TRIALS, NUM_INNER_REPEATS))


    scores_dict = {'ndcg_scorer': ndcg_scorer}    #{'recall': 'recall', 'precision': 'precision', 'f_beta_score': f_beta_score}  
    for i in range(NUM_TRIALS):
        print("Running Outer CV in Iteration: ", i ," at ", time.time()-start)
        pipe = get_pipe(estimator)
        inner_cv = RepeatedKFold(n_splits=NUM_INNER_SPLITS, n_repeats=NUM_INNER_REPEATS, random_state=i)
        outer_cv = KFold(n_splits=NUM_OUTER_SPLITS, shuffle=True,random_state=i)    
        clf = GridSearchCV(estimator=pipe, param_grid=grid, cv=inner_cv,n_jobs = 8) 
        #model = clf.fit(data)
        #return (model.best_score_)
        cv_result = cross_validate(clf, X=data,cv=outer_cv,n_jobs=8, error_score="raise")
        ##print(cv_result)
        ndcg[i] = cv_result["test_score"]
        fit_times[i] = cv_result["fit_time"]
        test_times[i] = cv_result["score_time"]
        #return cv_result
      
    print("Total time: ", (time.time()-start), "sec.")
    return ndcg, fit_times, test_times  

In [125]:
def add_result(results, name, score, fit_times, test_times):
    row = {
        "name":name,
        "ndcg_mean":score.mean(), 
        "ndcg_std":score.std(), 
        "ndcg_min":score.min(), 
        "ndcg_max":score.max(),  
        "fit_time":fit_times.mean(),
        "test_time":test_times.mean()
        }
    return results.append(row,ignore_index=True)

In [110]:
results = pd.DataFrame()

In [114]:
nested_cv(ALSEstimator(),als_grid, data)

Running Outer CV in Iteration:  0  at  0.0


{'fit_time': array([7.61570239, 7.40923524, 7.60569453]),
 'score_time': array([0.26226568, 0.3099606 , 0.27125788]),
 'test_score': array([0.84167866, 0.8419229 , 0.84373169])}

In [128]:
ndcg, fit_times, test_times  = nested_cv(ALSEstimator(),als_grid, data)


Running Outer CV in Iteration:  0  at  0.0
Running Outer CV in Iteration:  1  at  7.895333766937256
Running Outer CV in Iteration:  2  at  15.59433627128601
Total time:  23.469338178634644 sec.


In [129]:
results = add_result(results,"ALS",ndcg,fit_times, test_times)
results

  return results.append(row,ignore_index=True)


Unnamed: 0,name,ndcg_mean,ndcg_std,ndcg_min,ndcg_max,fit_time,test_time
0,ALS,0.842318,0.001867,0.839426,0.844938,8.604885,0.52989
1,ALS,0.842318,0.001867,0.839426,0.844938,7.359889,0.344442


## Validate this score by using a train test split

In [146]:
model= ALSEstimator(random_state=1,factors = 20).fit(MatrixTransformer().fit_transform(data))

100%|██████████| 10/10 [00:02<00:00,  3.97it/s]


In [163]:
model.score(MatrixTransformer().fit_transform(data))



0.41295217652240573

In [161]:
data_test  =data.loc[(data['person'] == 12)]
data_test

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS
...,...,...
73,12,SQL
74,12,SCRUM
75,12,JIRA
76,12,Maven


# Predict

In [46]:
predictions = model.predict(data_trans)
predictions


array([[-1.78520337e-01,  1.08718067e-01,  2.24071041e-01, ...,
        -8.85440633e-02,  1.65439725e-01, -1.80386275e-01],
       [ 1.56876817e-01,  2.03243852e-01,  1.01517630e+00, ...,
         2.66133938e-02,  1.13400564e-01, -2.10368857e-01],
       [ 7.28639811e-02, -2.42017601e-02,  2.88033858e-02, ...,
        -1.20175697e-01,  4.96238887e-01,  4.33641486e-02],
       ...,
       [ 2.11857948e-02,  2.43892055e-02, -3.45128439e-02, ...,
         2.38773059e-02, -4.22449149e-02, -1.97418290e-03],
       [-1.48500991e-03,  2.46375930e-02,  2.48813659e-01, ...,
         9.19075246e-05, -6.77527636e-02, -6.21256791e-02],
       [-9.95601863e-02, -1.11677252e-01,  1.47404715e-01, ...,
         4.86053963e-04,  1.87679410e-01,  4.28548642e-02]])

In [47]:
pd.DataFrame(predictions)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,725,726,727,728,729,730,731,732,733,734
0,-0.178520,0.108718,0.224071,-0.038702,0.057564,0.008159,0.073080,0.031055,-0.031406,-0.044925,...,-0.010852,0.026554,-0.168121,-0.027355,-0.045876,0.083657,0.129632,-0.088544,0.165440,-0.180386
1,0.156877,0.203244,1.015176,0.049720,-0.050562,0.328696,-0.026615,0.261277,0.147765,-0.132973,...,0.012772,0.024118,0.196213,-0.001549,-0.019073,0.040226,0.055873,0.026613,0.113401,-0.210369
2,0.072864,-0.024202,0.028803,0.541928,0.013695,0.007165,0.020571,-0.008604,-0.029858,-0.030395,...,-0.005196,-0.008994,0.004609,0.024406,-0.035380,0.007747,0.055278,-0.120176,0.496239,0.043364
3,0.029088,0.022769,0.040615,0.184255,-0.019882,-0.098494,-0.005141,-0.005792,-0.023737,-0.004261,...,-0.003284,0.001250,-0.123398,-0.067900,-0.016482,-0.005745,0.004510,0.022167,0.341542,-0.134164
4,0.000972,-0.000954,-0.008258,-0.003097,-0.003341,0.003243,-0.000799,0.000044,0.001180,0.004003,...,0.000560,-0.001645,0.004245,-0.004721,-0.003011,-0.004214,-0.006953,-0.002065,-0.006428,0.000637
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.005595,-0.000988,0.168804,0.098672,-0.046935,-0.035479,0.014566,0.023956,0.009776,-0.014878,...,0.003253,-0.002035,-0.072325,-0.017625,0.012741,-0.027905,-0.012128,0.027519,0.115076,0.041634
400,0.107946,-0.015805,-0.280251,0.011880,-0.012584,0.096051,-0.001967,0.022415,-0.011908,0.022287,...,0.005754,0.001763,0.101326,-0.004309,0.036517,0.047599,0.125811,0.049817,-0.024406,0.079597
401,0.021186,0.024389,-0.034513,0.010063,-0.016481,-0.007429,-0.018042,-0.008984,-0.017352,0.040140,...,0.008801,-0.018498,0.052300,0.008943,0.015926,-0.053727,0.046959,0.023877,-0.042245,-0.001974
402,-0.001485,0.024638,0.248814,-0.122919,-0.012562,0.016656,0.005324,-0.029662,-0.008647,0.019275,...,-0.000958,0.007764,-0.004088,-0.019204,-0.002277,0.005205,0.052645,0.000092,-0.067753,-0.062126


In [71]:
pred_user_matrix = predictions[0]
pred_user_matrix= pred_user_matrix.reshape((len(pred_user_matrix), -1))
pred_user_matrix = pd.DataFrame(pred_user_matrix).set_index(matrix.columns)


In [72]:
pred_user_matrix.rename(columns={0: "user_id"}, inplace=True)

In [73]:
pred_user_matrix.head(5)

Unnamed: 0,user_id
.NET Compact Framework,-0.17852
.NET Core,0.108718
.NET Framework,0.224071
3D-Modellierung,-0.038702
ABAP,0.057564


In [74]:
real_df = pd.DataFrame.sparse.from_spmatrix(data_trans)
real_df = pd.DataFrame(real_df.iloc[0,:]).set_index(matrix.columns)


In [75]:
real_df.rename(columns={0: "user_id"}, inplace=True)

In [76]:
real_df.head(5)

Unnamed: 0,user_id
.NET Compact Framework,0.0
.NET Core,0.0
.NET Framework,0.0
3D-Modellierung,0.0
ABAP,0.0


In [85]:
top_5_pred = pred_user_matrix.sort_values('user_id',ascending=False).head(40)
top_5_pred

Unnamed: 0,user_id
Continuous Integration (CI),1.277179
Windows 10,1.260977
SCRUM,1.236381
Design Pattern,1.215583
Englisch,1.198315
IntelliJ(Idea),1.14262
Dependency Injection,1.130275
Windows 8,1.099006
Windows 7,1.070235
Wasserfallmodel,1.048321


In [86]:
top_5_pred.merge(real_df, left_index=True, right_index=True)

Unnamed: 0,user_id_x,user_id_y
Continuous Integration (CI),1.277179,1.0
Windows 10,1.260977,1.0
SCRUM,1.236381,1.0
Design Pattern,1.215583,1.0
Englisch,1.198315,1.0
IntelliJ(Idea),1.14262,1.0
Dependency Injection,1.130275,1.0
Windows 8,1.099006,1.0
Windows 7,1.070235,1.0
Wasserfallmodel,1.048321,1.0


In [None]:
top_5_pred.index

Int64Index([509, 127, 184, 152, 605], dtype='int64')

In [None]:
ndcg_score(predictions[0:200].reshape(1,-1),real_df.to_numpy()[0:200].reshape(1,-1), k = 5)

0.5631921909482153

In [None]:
real_df.iloc[0]

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
730    0.0
731    0.0
732    0.0
733    0.0
734    0.0
Name: 0, Length: 735, dtype: Sparse[float64, 0]

In [None]:
real = data_trans.toarray()[0]