## Implicit Library

In [1]:
import implicit
import h5py
import pandas as pd
import numpy as np
import random
import statistics
from implicit.evaluation import train_test_split
from sklearn.metrics import ndcg_score
from numpy.random import permutation
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)
import scipy
from scipy.sparse import csr_matrix
import scipy.sparse
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import precision_at_k,mean_average_precision_at_k, ndcg_at_k
from sklearn.model_selection import KFold
from random import shuffle
import itertools
import copy


  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [2]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [3]:

data = df

In [4]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [5]:
data = df

In [6]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [7]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Normalizing the rows

In [8]:
data_skills  = matrix
data_skills 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In the next step we will compute the item-item relationships of our skills. Our final goal here is to construct a new item by item matrix containing the weights (relationships) between each of our skills where a perfect correlation equals 1 and no correlation at all equals 0.
<br>
In order to do so, we will first normalize the user vectors. The idea behind this approach is, that a user with many skills contributes less to any individual skill. For example if a user, that rules only 3 skills, rules a skill X it is more valueable than if a user, that rules 20 skills, that particular skill.


* First we caclulat the magnitude for every user

In [9]:
# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_skills).sum(axis=1))

In [10]:
magnitude

0       8.774964
1       8.602325
2       5.567764
3       5.567764
4       1.000000
         ...    
399     2.828427
400     4.358899
401     3.000000
402     2.645751
403    12.206556
Length: 404, dtype: float64

* Now we use a users magnitude to normalize the ratings of this corresponding user

In [11]:
data_skills_row_norm=  data_skills.divide(magnitude, axis='index')

In [12]:
data_skills_row_norm 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,0.116248,0.000000,0.000000,0.116248,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,0.179605,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.179605,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,0.377964,0.377964,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [13]:
from implicit.nearest_neighbours import normalize

In [14]:
data_skills_row_norm_csr = scipy.sparse.csr_matrix(data_skills_row_norm.values)


### Bm25

In [15]:
data_skills

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
data_skills_csr = scipy.sparse.csr_matrix(data_skills.values)

In [17]:
data_skills_csr_weighted = implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1=2, B=0.5)

In [18]:
pd.DataFrame.sparse.from_spmatrix(data_skills_csr_weighted, columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,1.433817,0.000000,0.000000,2.616681,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,2.489165,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,1.539067,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
402,0.0,5.106524,2.192339,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0


In [19]:
df = df.drop(columns=["category"])

In [20]:
data = df
data.head(5)

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS


## Baseline

In [21]:
most_common_skills = matrix.sum(axis=0).nlargest(5)	
for i in most_common_skills.index:
    print(i)

Englisch
MySQL
SCRUM
GIT
HTML


In [22]:
p_at_5_baseline_list  = []
for i in range(0,len(matrix)):
    user_series = matrix.loc[i]
    user_know_skills = user_series[user_series > 0]
    user_known_skills_vs_most_common = user_know_skills.index.isin(most_common_skills.index)
    matches = user_known_skills_vs_most_common.sum()
    results =matches/5
    p_at_5_baseline_list.append(results)
 
p_at_5_baseline = statistics.mean(p_at_5_baseline_list)
p_at_5_baseline


0.6891089108910892

Wie aussagekräftig ist p at k ? Ein recommender, der immmer die häufigsten Skills vorschlägt ist nicht zielführend, hat haben einen hohen p_at_5 score

# GridsearchCV

In [23]:
param_grid_bm25  = {'K1': [2,100], #2,20,50,100
                    'B': [ 0.8] # 0.4,0.8,1
                    }

In [24]:
def normalizer (data, param_grid):
    data_representations = ['Raw','Normalized', 'tfidf', 'bm25']    
    df_list = []
    data_list = []
    for representation in data_representations:
        df = pd.DataFrame(columns = ['data_representations', 'K1', 'B'])


        if representation == 'Normalized':
            data_representations = implicit.nearest_neighbours.normalize(data)
            data_list.append(data_representations)
            df["data_representations"] = [representation]
            df["K1"] = np.nan
            df["B"] = np.nan  
            df_list.append(df)

        elif representation == 'tfidf':
            data_representations = implicit.nearest_neighbours.tfidf_weight(data)
            data_list.append(data_representations)
            df["data_representations"] = [representation]
            df["K1"] = np.nan
            df["B"] = np.nan  
            df_list.append(df)

        elif representation == 'bm25':
            keys, values = zip(*param_grid.items())
            for c,v in enumerate(itertools.product(*values)):
                params = dict(zip(keys, v))
                
                #############
                # Initiate df for the parameter combination
                #############
                df = pd.DataFrame()
                df = pd.DataFrame(params, index =  [c])
                df["data_representations"] = [representation]
                df_list.append(df)

                #############
                # Add key value pair to the dictonary and initiate the bm25_weight function for every parameter combination
                #############
                params['X'] = data
                data_representations = implicit.nearest_neighbours.bm25_weight(**params)
                data_list.append(data_representations)

        else: 
            df["data_representations"] = [representation]
            df["K1"] = np.nan
            df["B"] = np.nan  
            df_list.append(df)
            data_list.append(data)
    return data_list, pd.concat(df_list).reset_index().drop(columns = ['index'])

In [25]:
daten, df = normalizer(data_skills_csr, param_grid_bm25)
df 


Unnamed: 0,data_representations,K1,B
0,Raw,,
1,Normalized,,
2,tfidf,,
3,bm25,2.0,0.8
4,bm25,100.0,0.8


In [26]:
def cross_validation_implicit_gs(data, algorithm, param_grid):
    df_list = []

    data_list, df = normalizer(data, param_grid_bm25)

    df_result = pd.DataFrame()
    for b,d in enumerate(data_list):
        df1 = pd.DataFrame()
        df1 = df.loc[b].to_frame().T
        data_rep = d

        keys, values = zip(*param_grid.items())
        for c,v in enumerate(itertools.product(*values)):
            
            #############
            # This for loop produces every possible combination of the hyperparameters within a dictonary
            #############
            params = dict(zip(keys, v))
            this_model = copy.deepcopy(algorithm)
            df2 = pd.DataFrame()
            df2 = pd.DataFrame(params, index =  [b])
    
            

            for k, v in params.items():
                #############
                # This loop unpacks the diconary by each parameter and initiates the model for each hpyerparameter of the corresponding dictonary
                #############
                setattr(this_model, k, v)

            ############
            # lists to store the results after each train_test_split under different random seed (c.p)
            ############
            p_train_test_results = []
            map_train_test_results = []
            ndcg_train_test_results = []


            for r_seed in range(0,5):
                #############
                # Initiate the train_test_split
                #############
                train_mat, test_mat = implicit.evaluation.train_test_split(data_rep, train_percentage =  0.8, random_state = r_seed)
                this_model.fit(train_mat)

                #############
                # measure evaluation metrixs for each possible hyperparameter combination
                #############
                p_at_k =  precision_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                map_at_k = mean_average_precision_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                var_ndcg_at_k = ndcg_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)


                #############
                # To list
                #############
                p_train_test_results.append(p_at_k)
                map_train_test_results.append(map_at_k)
                ndcg_train_test_results.append(var_ndcg_at_k)

            #############
            # Create columns to store the scores for each hyperparameter combination after 5 different train_test_splits
            #############
            df2["precision_at_k"] = np.mean(p_train_test_results)
            df2["map_at_k"]  = np.mean(map_train_test_results)
            df2["var_ndcg_at_k"] = np.mean(ndcg_train_test_results)

            df_result = df1.merge(df2, left_index=True, right_index=True)
            df_list.append(df_result)
    
    return pd.concat(df_list).reset_index().drop(columns = ['index'])

In [27]:
als_grid = {'factors': [30,40],#30,40,70,100,120
              'regularization': [0.01],#0.005,0.01, 0.02
              'iterations' : [15], #15,30,45
              'alpha': [1] #1,2,4
              }

In [28]:
results_als= cross_validation_implicit_gs(data_skills_csr, algorithm = implicit.als.AlternatingLeastSquares(random_state = 42),param_grid =  als_grid)
results_als

100%|██████████| 15/15 [00:00<00:00, 16.68it/s]
100%|██████████| 377/377 [00:00<00:00, 6287.24it/s]
100%|██████████| 377/377 [00:00<00:00, 5389.04it/s]
100%|██████████| 377/377 [00:00<00:00, 5987.82it/s]
100%|██████████| 15/15 [00:01<00:00, 14.54it/s]
100%|██████████| 378/378 [00:00<00:00, 5043.17it/s]
100%|██████████| 378/378 [00:00<00:00, 4912.12it/s]
100%|██████████| 378/378 [00:00<00:00, 4976.84it/s]
100%|██████████| 15/15 [00:01<00:00, 12.98it/s]
100%|██████████| 367/367 [00:00<00:00, 5649.65it/s]
100%|██████████| 367/367 [00:00<00:00, 6120.39it/s]
100%|██████████| 367/367 [00:00<00:00, 5481.13it/s]
100%|██████████| 15/15 [00:00<00:00, 16.03it/s]
100%|██████████| 379/379 [00:00<00:00, 7022.97it/s]
100%|██████████| 379/379 [00:00<00:00, 7293.07it/s]
100%|██████████| 379/379 [00:00<00:00, 7155.49it/s]
100%|██████████| 15/15 [00:00<00:00, 19.12it/s]
100%|██████████| 374/374 [00:00<00:00, 6930.16it/s]
100%|██████████| 374/374 [00:00<00:00, 4620.18it/s]
100%|██████████| 374/374 [00:00<

Unnamed: 0,data_representations,K1,B,factors,regularization,iterations,alpha,precision_at_k,map_at_k,var_ndcg_at_k
0,Raw,,,30,0.01,15,1,0.471631,0.351382,0.464517
1,Raw,,,40,0.01,15,1,0.43093,0.311137,0.426329
2,Normalized,,,30,0.01,15,1,0.32877,0.22391,0.323815
3,Normalized,,,40,0.01,15,1,0.287595,0.191139,0.287009
4,tfidf,,,30,0.01,15,1,0.443359,0.325264,0.437407
5,tfidf,,,40,0.01,15,1,0.414046,0.298294,0.409156
6,bm25,2.0,0.8,30,0.01,15,1,0.458723,0.34036,0.452665
7,bm25,2.0,0.8,40,0.01,15,1,0.429992,0.314611,0.424797
8,bm25,100.0,0.8,30,0.01,15,1,0.475496,0.356814,0.467044
9,bm25,100.0,0.8,40,0.01,15,1,0.444302,0.32873,0.439262


## Get the best model according to the ndcg score 

In [29]:
results_als.iloc[results_als.var_ndcg_at_k.idxmax()]

data_representations        bm25
K1                         100.0
B                            0.8
factors                       30
regularization              0.01
iterations                    15
alpha                          1
precision_at_k          0.475496
map_at_k                0.356814
var_ndcg_at_k           0.467044
Name: 8, dtype: object

## Initiate that model

In [30]:
model_best = implicit.als.AlternatingLeastSquares(factors = 30, regularization = 0.01, iterations = 15, alpha = 1, random_state = 42)

In [31]:
data_best = implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1 = 100, B = 0.8)
data_best = data_best.tocsr()

In [32]:
train_mat, test_mat = implicit.evaluation.train_test_split(data_best, train_percentage =  0.8, random_state = 42)

In [33]:
model_best.fit(train_mat)

100%|██████████| 15/15 [00:00<00:00, 19.91it/s]


### Do some recommendations

In [68]:
userid= [0]
skill_ids, scores  = model_best.recommend(userid, user_items= train_mat[userid], N = 20 , filter_already_liked_items=True)
recommendations_df = pd.DataFrame({"skill": matrix.columns[skill_ids].flatten(), "score": scores.flatten(), "already_liked": np.in1d(skill_ids, train_mat.tocsr()[userid].indices)})
recommendations_df = recommendations_df.rename(columns = {"already_liked": "already_liked_in_train"})
recommendations_df

  recommendations_df = pd.DataFrame({"skill": matrix.columns[skill_ids].flatten(), "score": scores.flatten(), "already_liked": np.in1d(skill_ids, train_mat.tocsr()[userid].indices)})


Unnamed: 0,skill,score,already_liked_in_train
0,REST,0.638979,False
1,SCRUM,0.615679,False
2,Dependency Injection,0.612335,False
3,Design Pattern,0.610703,False
4,Windows 7,0.527668,False
5,Türkisch,0.51032,False
6,Gitlab CI,0.488587,False
7,Spring-JPA,0.484995,False
8,MySQL,0.479,False
9,CSS,0.473972,False


In [70]:
data_best_user_id = pd.DataFrame.sparse.from_spmatrix(data = data_best.tocsr()[userid],columns=matrix.columns).T.reset_index().rename(columns = {"index": "skill",0: "rating"}).sort_values(by = "rating", ascending = False)
recommendations_df.merge(data_best_user_id, on = "skill", how = "left").rename(columns = {"rating": "rating_in_original"})

Unnamed: 0,skill,score,already_liked_in_train,rating_in_original
0,REST,0.638979,False,0.589291
1,SCRUM,0.615679,False,0.307475
2,Dependency Injection,0.612335,False,0.597647
3,Design Pattern,0.610703,False,0.911362
4,Windows 7,0.527668,False,0.658671
5,Türkisch,0.51032,False,0.0
6,Gitlab CI,0.488587,False,0.0
7,Spring-JPA,0.484995,False,0.0
8,MySQL,0.479,False,0.301614
9,CSS,0.473972,False,0.0


### Preprocessing function


In [259]:
pd.DataFrame.sparse.from_spmatrix(daten[2], columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
1,0.0,0.00000,1.547068,0.000000,0.000000,2.823361,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
2,0.0,0.00000,0.000000,2.089392,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,1.291885,0.0
3,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
4,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
400,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
401,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
402,0.0,3.60352,1.547068,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0


### GridsearchCV results

In [272]:
len(l)
l[4]
pd.DataFrame.sparse.from_spmatrix(l[0], columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0
1,0.0,0.00000,1.477067,0.000000,0.000000,2.695613,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0
2,0.0,0.00000,0.000000,2.312203,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,1.42965,0.0
3,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0
4,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0
400,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0
401,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0
402,0.0,4.37638,1.878873,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.00000,0.0


In [253]:
pd.DataFrame.sparse.from_spmatrix(implicit.nearest_neighbours.bm25_weigt(data_skills_csr, K1 = '20', B = 0.4), columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
1,0.0,0.00000,1.547068,0.000000,0.000000,2.823361,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
2,0.0,0.00000,0.000000,2.089392,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,1.291885,0.0
3,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
4,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
400,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
401,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
402,0.0,3.60352,1.547068,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0


### Single setup validation

In [281]:
daten =  implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1 = 20, B = 0.4)

In [282]:
model_als = implicit.als.AlternatingLeastSquares(factors = 30, random_state= 42, alpha= 1, regularization= 0.01, iterations= 15)

ndcg_train_test_results = []
for r_seed in range(0,5):
    train_mat, test_mat = implicit.evaluation.train_test_split(daten, train_percentage =  0.8, random_state = r_seed)
    model_als.fit(train_mat)


    #############
    # measure evaluation metrixs for each possible hyperparameter combination
    #############
    var_ndcg_at_k = ndcg_at_k(model_als, train_user_items=train_mat, test_user_items=test_mat, K=5)
    ndcg_train_test_results.append(var_ndcg_at_k)

print(np.mean(ndcg_train_test_results))
    

100%|██████████| 15/15 [00:01<00:00,  7.74it/s]
100%|██████████| 377/377 [00:00<00:00, 4010.88it/s]
100%|██████████| 15/15 [00:01<00:00,  7.69it/s]
100%|██████████| 378/378 [00:00<00:00, 4345.90it/s]
100%|██████████| 15/15 [00:01<00:00,  7.86it/s]
100%|██████████| 367/367 [00:00<00:00, 4170.56it/s]
100%|██████████| 15/15 [00:01<00:00,  7.87it/s]
100%|██████████| 379/379 [00:00<00:00, 3828.50it/s]
100%|██████████| 15/15 [00:02<00:00,  7.46it/s]
100%|██████████| 374/374 [00:00<00:00, 4453.40it/s]

0.44710373923827795



