In [57]:
import implicit
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.utils import nonzeros

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

import scipy.sparse

from sklearn.model_selection import StratifiedKFold,RepeatedStratifiedKFold, GridSearchCV, cross_validate,KFold, RepeatedKFold

import time

from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, fbeta_score, precision_score
from sklearn.metrics import make_scorer
from sklearn.metrics import ndcg_score
from implicit.evaluation import train_test_split, precision_at_k, mean_average_precision_at_k, ndcg_at_k


In [429]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [430]:
data = df.drop(columns=["category"])
data

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS
...,...,...
24581,1488,Deutsch
24582,1488,Spring-JPA
24583,1488,Maven
24584,1488,Subversion


In [14]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [15]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preprocessing

In [None]:
from sklearn.model_selection import LeaveOneOut

In [489]:
class MatrixTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, confidence=1):
        self.confidence = confidence

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        matrix =     coo_matrix((np.ones(X.shape[0]),
                           (X.iloc[:,0].cat.codes.copy(),
                            X.iloc[:,1].cat.codes.copy()))) * self.confidence
        return matrix.reshape(matrix.shape[0], -1)

In [490]:
MatrixTransformer().fit_transform(data)

<404x735 sparse matrix of type '<class 'numpy.float64'>'
	with 24586 stored elements in COOrdinate format>

In [469]:
class ALSEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, factors=50,
                       regularization=0.01,
                       iterations=10,
                       filter_seen=True
                       ,random_state=1):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.filter_seen = filter_seen
        self.random_state = random_state

    def fit(self, X, y=None):
        self.model = AlternatingLeastSquares(factors=self.factors,
                                             regularization=self.regularization,
                                             iterations=self.iterations,
                                             dtype=np.float32,
                                             use_native=True,
                                             random_state=self.random_state)
        self.model.fit(X)
        if self.filter_seen:
            self.fit_X = X
        return self

    def predict(self, X, y=None):
        predictions = np.dot(self.model.item_factors, self.model.user_factors.T)
        return np.asarray(predictions.T.tolist())


    def score (self, X, K = 5 ):
        predicited_values =  self.predict(X).reshape(1,-1)
        actual_values = X.todense().reshape(1,-1)
        return ndcg_score(predicited_values,actual_values, k = K)

In [458]:
model = ALSEstimator() 
model.fit(MatrixTransformer().fit_transform(data))  

100%|██████████| 10/10 [00:00<00:00, 18.90it/s]


In [460]:
model.predict(MatrixTransformer().fit_transform(data)).shape

(404, 735)

In [394]:
data_trans = MatrixTransformer().fit_transform(data)

In [404]:
model.score(data_trans) #model.score(data_trans, actual = data_trans).shape



0.4931107771245103

In [432]:
def get_pipe(estimator):
    return Pipeline([('preprocessor',MatrixTransformer()),('estimator',estimator)]) #, ('scaler',StandardScaler()),('estimator',estimator)] 


In [None]:
NUM_TRIALS = 3
NUM_INNER_REPEATS = 3
NUM_INNER_SPLITS = 3
NUM_OUTER_SPLITS = 3


In [477]:
als_grid = [{"estimator__factors":[5,10]}] #"factors":[5,10,20,50,70],"iterations":[20,30, 50,70], 'regularization':[0.005,0.01,0.02] #"factors":[5],"iterations":[20]

In [479]:
ndcg_scorer = make_scorer(ndcg_score,k = 5)

In [484]:

def nested_cv(estimator,grid, data):
    
    start = time.time()
    ndcg = np.zeros((NUM_TRIALS, NUM_INNER_REPEATS))
    baccs = np.zeros((NUM_TRIALS, NUM_INNER_REPEATS))

    fit_times = np.zeros((NUM_TRIALS, NUM_INNER_REPEATS))
    test_times = np.zeros((NUM_TRIALS, NUM_INNER_REPEATS))


    scores_dict = {'balanced_accuracy': 'balanced_accuracy','ndcg_scorer': ndcg_scorer}    #{'recall': 'recall', 'precision': 'precision', 'f_beta_score': f_beta_score}  
    for i in range(NUM_TRIALS):
        print("Running Outer CV in Iteration: ", i ," at ", time.time()-start)
        pipe = get_pipe(estimator)
        inner_cv = RepeatedKFold(n_splits=NUM_INNER_SPLITS, n_repeats=NUM_INNER_REPEATS, random_state=i)
        outer_cv = KFold(n_splits=NUM_OUTER_SPLITS, shuffle=True,random_state=i)    
        clf = GridSearchCV(estimator=pipe, param_grid=grid, cv=inner_cv,n_jobs = 8) 
        cv_result = cross_validate(clf, X=data,cv=outer_cv,n_jobs=8, error_score="raise", scoring=(scores_dict))
        #print(cv_result)
        ndcg[i] = cv_result["test_ndcg_scorer"]
        baccs[i] = cv_result["test_balanced_accuracy"]
        fit_times[i] = cv_result["fit_time"]
        test_times[i] = cv_result["score_time"]

      
    #print("Total time: ", (time.time()-start), "sec.")
    return ndcg, fit_times, test_times  

In [486]:
data["person"] = data["person"].astype("category")
data["skill"] = data["skill"].astype("category")
data.dtypes

person    category
skill     category
dtype: object

In [491]:
pipe = get_pipe(ALSEstimator())
clf = GridSearchCV(estimator=pipe, param_grid=als_grid, cv=4,n_jobs = 8) 
clf.fit(MatrixTransformer().fit_transform(data).todense())


ValueError: 
All the 8 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
8 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\model_selection\_validation.py", line 684, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\pipeline.py", line 378, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\pipeline.py", line 336, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\pipeline.py", line 870, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\base.py", line 867, in fit_transform
    return self.fit(X, **fit_params).transform(X)
  File "C:\Users\Maxi\AppData\Local\Temp\ipykernel_13148\1159135902.py", line 10, in transform
AttributeError: 'matrix' object has no attribute 'iloc'


In [485]:
nested_cv(ALSEstimator(),als_grid, data)

Running Outer CV in Iteration:  0  at  0.0


TypeError: _PredictScorer._score() missing 1 required positional argument: 'y_true'

In [369]:
predictions = model.predict(data_trans)
predictions


array([[-0.18948469,  0.11264376,  0.18528737, ..., -0.09730112,
         0.01772469, -0.09322538],
       [ 0.15346731,  0.19557746,  0.8788178 , ...,  0.036847  ,
         0.02321132, -0.24054708],
       [ 0.1134166 , -0.06157881, -0.0516322 , ..., -0.07739883,
         0.69545168,  0.00709413],
       ...,
       [ 0.01830531,  0.03115963,  0.02530304, ...,  0.02026119,
        -0.08347127, -0.00381479],
       [ 0.02653292,  0.03438199,  0.27909735, ..., -0.0009796 ,
        -0.09671085, -0.03359281],
       [-0.15905787, -0.15058808, -0.0079931 , ..., -0.01091874,
        -0.04526868,  0.05513326]])

In [308]:
pd.DataFrame(predictions)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,725,726,727,728,729,730,731,732,733,734
0,-0.073526,0.144289,0.191058,0.015897,0.073527,0.019095,0.074503,0.068536,-0.023968,-0.090355,...,-0.003753,0.018121,-0.084292,-0.023120,-0.047046,0.088438,0.152842,-0.105557,0.164389,-0.168654
1,0.099023,0.138138,0.798364,0.125602,-0.066752,0.348457,-0.004008,0.238252,0.132941,-0.040247,...,0.017596,0.021406,0.119614,-0.052290,-0.006528,-0.003108,0.015997,0.006574,0.122355,-0.166696
2,0.109618,-0.018658,0.028172,0.540737,-0.006372,0.026175,0.016315,-0.009705,-0.001646,-0.004559,...,-0.005821,0.001209,0.069892,0.027462,-0.035815,-0.001063,0.085596,-0.085049,0.533438,0.067049
3,0.041768,0.045946,0.171301,0.225115,-0.006583,-0.022100,-0.045337,0.044472,0.023907,-0.044518,...,-0.010982,-0.001955,-0.082884,-0.004869,-0.018255,0.002946,-0.012330,0.042793,0.168491,-0.118454
4,0.001244,-0.000501,0.001695,0.004881,-0.001127,0.005462,0.002868,0.002162,0.003420,0.000227,...,-0.000558,-0.002607,-0.003707,-0.001993,-0.002395,-0.004612,-0.004545,-0.005479,-0.009079,-0.002421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,-0.031500,0.004080,0.189558,0.057372,-0.041814,-0.041251,-0.000178,0.036799,0.018174,-0.027027,...,0.006926,0.002320,-0.069216,-0.010059,0.022137,-0.015705,0.032220,0.019282,0.101371,0.071692
400,0.102788,0.002958,-0.265340,-0.019540,-0.024624,0.041432,-0.011955,0.049781,0.011720,0.022904,...,0.015424,-0.007964,0.087419,-0.023835,0.043293,0.034918,0.077573,0.045989,0.048777,0.033433
401,-0.002137,0.026373,0.015661,0.045812,-0.000960,-0.024344,-0.002382,0.006499,-0.005954,0.018316,...,0.004675,-0.015202,0.034479,0.007107,0.016877,-0.038673,0.010129,0.004319,-0.067210,-0.032545
402,0.016723,0.018324,0.246164,-0.087036,-0.003903,0.042848,0.020940,-0.027667,-0.019621,0.014968,...,-0.004527,0.005462,-0.012860,-0.007263,-0.005929,0.006043,0.039109,0.003078,-0.095308,-0.068618


In [325]:
pred_user_matrix = predictions[0]
pred_user_matrix= pred_user_matrix.reshape((len(pred_user_matrix), -1))
pred_user_matrix.T.shape


(1, 735)

In [None]:
data_trans

<404x735 sparse matrix of type '<class 'numpy.float64'>'
	with 24586 stored elements in COOrdinate format>

In [333]:
real_df = pd.DataFrame.sparse.from_spmatrix(data_trans)
real_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,725,726,727,728,729,730,731,732,733,734
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [348]:
real_user_matrix = 
real_user_matrix.reshape(1,-1).shape
pred_user_matrix.T.shape

(1, 735)

In [345]:
real_user_matrix.T.shape

(735,)

In [332]:
predictions[0].reshape(1,-1).shape

(1, 735)

In [357]:
ndcg_score(predictions[0:200].reshape(1,-1),real_df.to_numpy()[0:200].reshape(1,-1), k = 5)

0.5631921909482153

In [250]:
pred_df = pd.DataFrame.sparse.from_spmatrix(predictions)
top_5_pred = pred_df.iloc[0].sort_values(ascending=False).head(5)
top_5_pred

509    1.348854
127    1.211911
184    1.205164
152    1.192723
605    1.188198
Name: 0, dtype: Sparse[float32, 0]

In [251]:
top_5_pred.index

Int64Index([509, 127, 184, 152, 605], dtype='int64')

In [273]:
real_df.iloc[0]

0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
      ... 
730    0.0
731    0.0
732    0.0
733    0.0
734    0.0
Name: 0, Length: 735, dtype: Sparse[float64, 0]

In [290]:
real = data_trans.toarray()[0]

In [78]:
grid_search.best_params_

{'als__factors': 100, 'als__regularization': 0.01, 'matrix__confidence': 3}

In [79]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)



0.027397550037038743 {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 1}
0.028703561310685133 {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 3}
0.02949303470360908 {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 10}
0.028252080669355424 {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 40}
0.029082858603714307 {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 100}
0.02695363905330588 {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 1}
0.028064496211566158 {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 3}
0.026911975292936412 {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 10}
0.02727986466588129 {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 40}
0.027646557920765386 {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 100}
0.027432163725493765 {'als__fa