## Implicit Library

In [1]:
import implicit
import h5py
import pandas as pd
import numpy as np
import random
import statistics
from implicit.evaluation import train_test_split
from sklearn.metrics import ndcg_score
from numpy.random import permutation
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)
import scipy
from scipy.sparse import csr_matrix
import scipy.sparse
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import precision_at_k,mean_average_precision_at_k, ndcg_at_k
from sklearn.model_selection import KFold
from random import shuffle
import itertools
import copy


  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [2]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [3]:

data = df

In [4]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [5]:
data = df

In [6]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [7]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Normalizing the rows

In [8]:
data_skills  = matrix
data_skills 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In the next step we will compute the item-item relationships of our skills. Our final goal here is to construct a new item by item matrix containing the weights (relationships) between each of our skills where a perfect correlation equals 1 and no correlation at all equals 0.
<br>
In order to do so, we will first normalize the user vectors. The idea behind this approach is, that a user with many skills contributes less to any individual skill. For example if a user, that rules only 3 skills, rules a skill X it is more valueable than if a user, that rules 20 skills, that particular skill.


* First we caclulat the magnitude for every user

In [9]:
# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_skills).sum(axis=1))

In [10]:
magnitude

0       8.774964
1       8.602325
2       5.567764
3       5.567764
4       1.000000
         ...    
399     2.828427
400     4.358899
401     3.000000
402     2.645751
403    12.206556
Length: 404, dtype: float64

* Now we use a users magnitude to normalize the ratings of this corresponding user

In [11]:
data_skills_row_norm=  data_skills.divide(magnitude, axis='index')

In [12]:
data_skills_row_norm 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,0.116248,0.000000,0.000000,0.116248,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,0.179605,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.179605,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,0.377964,0.377964,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [13]:
from implicit.nearest_neighbours import normalize

In [14]:
data_skills_row_norm_csr = scipy.sparse.csr_matrix(data_skills_row_norm.values)


### Bm25

In [15]:
data_skills

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
data_skills_csr = scipy.sparse.csr_matrix(data_skills.values)

In [17]:
data_skills_csr_weighted = implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1=2, B=0.5)

In [18]:
pd.DataFrame.sparse.from_spmatrix(data_skills_csr_weighted, columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,1.433817,0.000000,0.000000,2.616681,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,2.489165,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,1.539067,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
402,0.0,5.106524,2.192339,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0


In [19]:
data = df
data.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


## Baseline

In [20]:
most_common_skills = matrix.sum(axis=0).nlargest(5)	
for i in most_common_skills.index:
    print(i)

Englisch
MySQL
SCRUM
GIT
HTML


In [21]:
p_at_5_baseline_list  = []
for i in range(0,len(matrix)):
    user_series = matrix.loc[i]
    user_know_skills = user_series[user_series > 0]
    user_known_skills_vs_most_common = user_know_skills.index.isin(most_common_skills.index)
    matches = user_known_skills_vs_most_common.sum()
    results =matches/5
    p_at_5_baseline_list.append(results)
 
p_at_5_baseline = statistics.mean(p_at_5_baseline_list)
p_at_5_baseline


0.6891089108910892

Wie aussagekräftig ist p at k ? Ein recommender, der immmer die häufigsten Skills vorschlägt ist nicht zielführend, hat haben einen hohen p_at_5 score

# GridsearchCV

In [51]:
param_grid_bm25  = {'K1': [2,50,100], #2,20,50,100
                    'B': [ 0.4,0.8,1] # 0.4,0.8,1
                    }

In [23]:
def normalizer (data, param_grid):
    data_representations = ['Raw','Normalized', 'tfidf', 'bm25']    
    df_list = []
    data_list = []
    for representation in data_representations:
        df = pd.DataFrame(columns = ['data_representations', 'K1', 'B'])


        if representation == 'Normalized':
            data_representations = implicit.nearest_neighbours.normalize(data)
            data_list.append(data_representations)
            df["data_representations"] = [representation]
            df["K1"] = np.nan
            df["B"] = np.nan  
            df_list.append(df)

        elif representation == 'tfidf':
            data_representations = implicit.nearest_neighbours.tfidf_weight(data)
            data_list.append(data_representations)
            df["data_representations"] = [representation]
            df["K1"] = np.nan
            df["B"] = np.nan  
            df_list.append(df)

        elif representation == 'bm25':
            keys, values = zip(*param_grid.items())
            for c,v in enumerate(itertools.product(*values)):
                params = dict(zip(keys, v))
                
                #############
                # Initiate df for the parameter combination
                #############
                df = pd.DataFrame()
                df = pd.DataFrame(params, index =  [c])
                df["data_representations"] = [representation]
                df_list.append(df)

                #############
                # Add key value pair to the dictonary and initiate the bm25_weight function for every parameter combination
                #############
                params['X'] = data
                data_representations = implicit.nearest_neighbours.bm25_weight(**params)
                data_list.append(data_representations)

        else: 
            df["data_representations"] = [representation]
            df["K1"] = np.nan
            df["B"] = np.nan  
            df_list.append(df)
            data_list.append(data)
    return data_list, pd.concat(df_list).reset_index().drop(columns = ['index'])

In [24]:
daten, df = normalizer(data_skills_csr, param_grid_bm25)
df 


Unnamed: 0,data_representations,K1,B
0,Raw,,
1,Normalized,,
2,tfidf,,
3,bm25,2.0,0.8
4,bm25,100.0,0.8


In [25]:
def cross_validation_implicit_gs(data, algorithm, param_grid):
    df_list = []

    data_list, df = normalizer(data, param_grid_bm25)

    df_result = pd.DataFrame()
    for b,d in enumerate(data_list):
        df1 = pd.DataFrame()
        df1 = df.loc[b].to_frame().T
        data_rep = d

        keys, values = zip(*param_grid.items())
        for c,v in enumerate(itertools.product(*values)):
            
            #############
            # This for loop produces every possible combination of the hyperparameters within a dictonary
            #############
            params = dict(zip(keys, v))
            this_model = copy.deepcopy(algorithm)
            df2 = pd.DataFrame()
            df2 = pd.DataFrame(params, index =  [b])
    
            

            for k, v in params.items():
                #############
                # This loop unpacks the diconary by each parameter and initiates the model for each hpyerparameter of the corresponding dictonary
                #############
                setattr(this_model, k, v)

            ############
            # lists to store the results after each train_test_split under different random seed (c.p)
            ############
            p_train_test_results = []
            map_train_test_results = []
            ndcg_train_test_results = []


            for r_seed in range(0,5):
                #############
                # Initiate the train_test_split
                #############
                train_mat, test_mat = implicit.evaluation.train_test_split(data_rep, train_percentage =  0.8, random_state = r_seed)
                this_model.fit(train_mat)

                #############
                # measure evaluation metrixs for each possible hyperparameter combination
                #############
                p_at_k =  precision_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                map_at_k = mean_average_precision_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                var_ndcg_at_k = ndcg_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)


                #############
                # To list
                #############
                p_train_test_results.append(p_at_k)
                map_train_test_results.append(map_at_k)
                ndcg_train_test_results.append(var_ndcg_at_k)

            #############
            # Create columns to store the scores for each hyperparameter combination after 5 different train_test_splits
            #############
            df2["precision_at_k"] = np.mean(p_train_test_results)
            df2["map_at_k"]  = np.mean(map_train_test_results)
            df2["var_ndcg_at_k"] = np.mean(ndcg_train_test_results)

            df_result = df1.merge(df2, left_index=True, right_index=True)
            df_list.append(df_result)
    
    return pd.concat(df_list).reset_index().drop(columns = ['index'])

In [26]:
als_grid = {'factors': [30,40,70,100,120],
              'regularization': [0.005,0.01, 0.02],
              'iterations' : [15,30,45,80], 
              'alpha': [1,2,4] 
              }

## ALS

In [27]:
results_als= cross_validation_implicit_gs(data_skills_csr, algorithm = implicit.als.AlternatingLeastSquares(random_state = 42),param_grid =  als_grid)
results_als

100%|██████████| 15/15 [00:00<00:00, 19.39it/s]
100%|██████████| 377/377 [00:00<00:00, 6504.03it/s]
100%|██████████| 377/377 [00:00<00:00, 7117.85it/s]
100%|██████████| 377/377 [00:00<00:00, 6985.66it/s]
100%|██████████| 15/15 [00:00<00:00, 20.23it/s]
100%|██████████| 378/378 [00:00<00:00, 7564.77it/s]
100%|██████████| 378/378 [00:00<00:00, 7564.88it/s]
100%|██████████| 378/378 [00:00<00:00, 7564.73it/s]
100%|██████████| 15/15 [00:00<00:00, 21.53it/s]
100%|██████████| 367/367 [00:00<00:00, 7344.63it/s]
100%|██████████| 367/367 [00:00<00:00, 7650.72it/s]
100%|██████████| 367/367 [00:00<00:00, 7650.64it/s]
100%|██████████| 15/15 [00:00<00:00, 22.54it/s]
100%|██████████| 379/379 [00:00<00:00, 7584.89it/s]
100%|██████████| 379/379 [00:00<00:00, 7584.64it/s]
100%|██████████| 379/379 [00:00<00:00, 7900.84it/s]
100%|██████████| 15/15 [00:00<00:00, 22.17it/s]
100%|██████████| 374/374 [00:00<00:00, 7484.75it/s]
100%|██████████| 374/374 [00:00<00:00, 7637.41it/s]
100%|██████████| 374/374 [00:00<

Unnamed: 0,data_representations,K1,B,factors,regularization,iterations,alpha,precision_at_k,map_at_k,var_ndcg_at_k
0,Raw,,,30,0.01,15,1,0.471631,0.351382,0.464517
1,Raw,,,40,0.01,15,1,0.43093,0.311137,0.426329
2,Normalized,,,30,0.01,15,1,0.32877,0.22391,0.323815
3,Normalized,,,40,0.01,15,1,0.287595,0.191139,0.287009
4,tfidf,,,30,0.01,15,1,0.443359,0.325264,0.437407
5,tfidf,,,40,0.01,15,1,0.414046,0.298294,0.409156
6,bm25,2.0,0.8,30,0.01,15,1,0.458723,0.34036,0.452665
7,bm25,2.0,0.8,40,0.01,15,1,0.429992,0.314611,0.424797
8,bm25,100.0,0.8,30,0.01,15,1,0.475496,0.356814,0.467044
9,bm25,100.0,0.8,40,0.01,15,1,0.444302,0.32873,0.439262


In [None]:
results_als.to_csv('results_als.csv')

## LMF


In [None]:
lmf_grid = {'factors': [30,40,70,100,120],
              'regularization': [0.005,0.01, 0.02],
              'iterations' : [15,30,45,80],
              'alpha': [1,2,4] 
              }

In [None]:
results_lmf= cross_validation_implicit_gs(data_skills_csr, algorithm = implicit.lmf.LogisticMatrixFactorization(random_state = 42),param_grid =  lmf_grid)
results_lmf

In [None]:
results_lmf.to_csv('results_lmf.csv')

## Get the best model according to the ndcg score 

In [28]:
results_als.iloc[results_als.var_ndcg_at_k.idxmax()]

data_representations        bm25
K1                         100.0
B                            0.8
factors                       30
regularization              0.01
iterations                    15
alpha                          1
precision_at_k          0.475496
map_at_k                0.356814
var_ndcg_at_k           0.467044
Name: 8, dtype: object

## Initiate that model

In [29]:
model_best = implicit.als.AlternatingLeastSquares(factors = 30, regularization = 0.01, iterations = 15, alpha = 1, random_state = 42)

In [30]:
data_best = implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1 = 100, B = 0.8)
data_best = data_best.tocsr()

In [31]:
train_mat, test_mat = implicit.evaluation.train_test_split(data_best, train_percentage =  0.8, random_state = 42)

In [32]:
model_best.fit(train_mat)

100%|██████████| 15/15 [00:00<00:00, 23.02it/s]


### Do some recommendations

In [33]:
data_categories = data[["skill","category"]].drop_duplicates()
data_categories.head(5)

Unnamed: 0,skill,category
0,Windows NT/2000/XP,Betriebssystem
1,MAC OS X,Betriebssystem
2,Windows 7,Betriebssystem
3,Windows 8,Betriebssystem
4,iOS,Betriebssystem


In [34]:
userid= [1]

def do_recommendations (userid, user_items_mat_train,full_data_mat ,n = 10):
    df_list = []
    for u in userid:
        

        #############
        # Do recommendations for every user in the list. The known items are in the matrix the model was trained on
        #############
        skill_ids, scores  = model_best.recommend(u, user_items= user_items_mat_train[u], N = n , filter_already_liked_items=True)
        recommendations_df = pd.DataFrame({ "skill": matrix.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, user_items_mat_train.tocsr()[u].indices)})
        recommendations_df["user_id"] = [u] * len(recommendations_df)
        recommendations_df = recommendations_df.rename(columns = {"already_liked": "already_liked_in_train"})

        #############
        # Get all known skills for each user by indexing the full data matrix
        #############
        data_best_user_id = pd.DataFrame.sparse.from_spmatrix(data = full_data_mat.tocsr()[u],columns=matrix.columns).T.reset_index().rename(columns = {"index": "skill",0: "rating"}).sort_values(by = "rating", ascending = False)
        data_best_user_id["user_id"] = [u] * len(data_best_user_id)
      
        #############
        # Merge both df in order to see, if the recommended skills are the skills, which the user already knows, but were left out in the trainings data
        #############
        recommendations_df = recommendations_df.merge(data_best_user_id, on = ["skill","user_id"], how = "left").rename(columns = {"rating": "rating_in_original"})
        recommendations_df.set_index("user_id", inplace = True)

        #############
        # Merge the category information to the recommendations
        #############
        recommendations_df = recommendations_df.merge(data_categories, on = "skill", how = "left")
        df_list.append(recommendations_df)
    return pd.concat(df_list)

In [35]:
recommendations_df = do_recommendations(userid, train_mat,data_best,n = 10)
recommendations_df

Unnamed: 0,skill,score,already_liked_in_train,rating_in_original,category
0,LINQ,0.778314,False,1.634244,.NET Frameworks und Tools
1,NUnit,0.750925,False,1.710148,.NET Frameworks und Tools
2,Angular Generalist,0.712047,False,0.0,JavaScript Frameworks
3,ASP.NET MVC-Framework,0.635854,False,1.742536,.NET Frameworks und Tools
4,MongoDB,0.628998,False,0.0,Datenbank
5,Test Driven Development (TDD),0.566148,False,0.456205,Methoden und Praktiken
6,GIMP,0.565924,False,0.0,Grafik/Design
7,Windows Forms,0.56134,False,0.0,.NET Frameworks und Tools
8,PowerShell,0.553256,False,1.429121,Technik/Tools
9,SCRUM,0.543668,False,0.0,Projektmanagement / Vorgehensmodelle


### New user

In [36]:
skill_ids, scores  = model_best.recommend(0, user_items= train_mat[0], N = 5 , filter_already_liked_items=True, recalculate_user=True)
recommendations_df = pd.DataFrame({ "skill": matrix.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, train_mat.tocsr()[0].indices)})
recommendations_df = recommendations_df.rename(columns = {"already_liked": "already_liked_in_train"})
recommendations_df

Unnamed: 0,skill,score,already_liked_in_train
0,REST,0.631223,False
1,Dependency Injection,0.605594,False
2,SCRUM,0.60366,False
3,Design Pattern,0.599021,False
4,Windows 7,0.522964,False


In [37]:
# x.toarray()
# >> array([[ 1.,  1.,  1.,  1.,  1.],
#           [ 1.,  1.,  1.,  1.,  1.],
#           [ 1.,  1.,  1.,  1.,  1.]])
# # reshape is not implemented for csr_matrix but you can cheat and do it  yourself.
# x._shape = (4,5)
# # Update indptr to let it know we added a row with nothing in it. So just append the last
# # value in indptr to the end.
# # note that you are still copying the indptr array
# x.indptr = np.hstack((x.indptr,x.indptr[-1]))
# x.toarray()
# array([[ 1.,  1.,  1.,  1.,  1.],
#        [ 1.,  1.,  1.,  1.,  1.],
#        [ 1.,  1.,  1.,  1.,  1.],
#        [ 0.,  0.,  0.,  0.,  0.]])

In [38]:
new_id = 404
new_skills = {
                "Windows 11":1,
                "Englisch":1,
                "Chinesisch (Mandarin)":1,
                "Python":1,
                "MATLAB":1,
                "Java":1,
                "MySQL":1,
                "SQLBase":1,
                "Microsoft SQL Server":1,
                "Google Cloud Platform":1,
                "MongoDB":1,
                "JSON":1,
                "Docker":1,
                "Statische Codeanalyse": 1,
                "Power BI":1,
                "Postman":1,
                "PowerShell":1,
                "Github actions":1
            }

In [39]:
new_user_df = pd.DataFrame(new_skills, index = [new_id])
new_user_df

Unnamed: 0,Windows 11,Englisch,Chinesisch (Mandarin),Python,MATLAB,Java,MySQL,SQLBase,Microsoft SQL Server,Google Cloud Platform,MongoDB,JSON,Docker,Statische Codeanalyse,Power BI,Postman,PowerShell,Github actions
404,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [40]:
new_user_matrix = pd.concat([matrix,new_user_df]).sort_index().fillna(0)
new_user_matrix.tail(5)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
402,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
403,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


ebenfalls normalizen

In [41]:
new_user_matrix_csr = scipy.sparse.csr_matrix(new_user_matrix.values)
new_user_matrix_csr


<405x735 sparse matrix of type '<class 'numpy.float64'>'
	with 24187 stored elements in Compressed Sparse Row format>

In [42]:
skill_ids, scores  = model_best.recommend(404, user_items= new_user_matrix_csr[404], N = 5 , filter_already_liked_items=True, recalculate_user=True)
recommendations_df = pd.DataFrame({ "skill": new_user_matrix.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, new_user_matrix_csr.tocsr()[404].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,REST,0.23986,False
1,C#,0.228421,False
2,PHP,0.189112,False
3,C/C++,0.18593,False
4,KANBAN,0.183195,False


### Simularities

### Items

In [43]:
# Id and skills
dict_skills_id = {}
for c,skill in enumerate(matrix.columns):
        dict_skills_id[c] = skill

dict_skills_id


{0: '.NET Compact Framework',
 1: '.NET Core',
 2: '.NET Framework',
 3: '3D-Modellierung',
 4: 'ABAP',
 5: 'ADO.NET',
 6: 'AIX',
 7: 'ARIS',
 8: 'ARIS ITArchitect',
 9: 'AS400',
 10: 'ASP Generalist',
 11: 'ASP.NET',
 12: 'ASP.NET MVC-Framework',
 13: 'ASP.NET WebAPI',
 14: 'AWS',
 15: 'AWS Lambda',
 16: 'Abstract',
 17: 'Access',
 18: 'Accessibility / WCAG',
 19: 'Active Directory',
 20: 'ActiveX',
 21: 'Adobe CC',
 22: 'Adobe Flash',
 23: 'Adobe Illustrator',
 24: 'Adobe InDesign',
 25: 'Adobe Indesign',
 26: 'Adobe Photoshop',
 27: 'Adobe Premiere',
 28: 'Adobe XD',
 29: 'After Effects',
 30: 'Agile Methoden',
 31: 'Ajax',
 32: 'Alexa-Skills',
 33: 'Alpine',
 34: 'Analytics',
 35: 'Android',
 36: 'Android Studio',
 37: 'Anforderungsanalyse',
 38: 'Anforderungsmanagement',
 39: 'Angular (2 und höher)',
 40: 'Angular Generalist',
 41: 'Angular Material',
 42: 'Angular Theming',
 43: 'AngularJS',
 44: 'Animations (transition, @keyframes)',
 45: 'Ansible',
 46: 'Ant',
 47: 'Apache',
 4

In [44]:
skill_id, sim = model_best.similar_items(0, N=10)
# display the results using pandas for nicer formatting
simularity_df  = pd.DataFrame({"skill": matrix.columns[skill_id], "score": sim})
simularity_df.merge(data_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,category
0,.NET Compact Framework,1.0,.NET Frameworks und Tools
1,BizTalk,0.750393,.NET Frameworks und Tools
2,Infragistics,0.727576,.NET Frameworks und Tools
3,ADO.NET,0.645795,.NET Frameworks und Tools
4,Silverlight,0.626509,.NET Frameworks und Tools
5,CORBA,0.625713,Standards
6,Windows Communication Foundation (WCF),0.619321,.NET Frameworks und Tools
7,.NET Core,0.55209,.NET Frameworks und Tools
8,Team Foundation Server (TF Server),0.535807,"CI/CD, Build- und Versionskontrollsysteme"
9,Team Developer,0.534182,Umgebungen


### Users

In [45]:
new_skills = {
                "Windows 11":1,
                "Englisch":1,
                "Chinesisch (Mandarin)":1,
                "Python":1,
                "MATLAB":1,
                "Java":1,
                "MySQL":1,
                "SQLBase":1,
                "Microsoft SQL Server":1,
                "Google Cloud Platform":1,
                "MongoDB":1,
                "JSON":1,
                "Docker":1,
                "Statische Codeanalyse": 1,
                "Power BI":1,
                "Postman":1,
                "PowerShell":1,
                "Github actions":1
            }

In [46]:
new_user_id_list = list(new_skills.keys())

In [47]:
user_ids_, sim = model_best.similar_users(0, N=10)
# display the results using pandas for nicer formatting
simularity_df  = pd.DataFrame({"user_ids": matrix.index[user_ids_], "score": sim})
simularity_df

Unnamed: 0,user_ids,score
0,0,1.0
1,241,0.68059
2,249,0.631691
3,184,0.621767
4,48,0.602871
5,174,0.598954
6,362,0.57721
7,58,0.570617
8,55,0.542461
9,355,0.498745


In [48]:
## Compare the mst similar user to the new user
matrix[new_user_id_list].iloc[241].to_frame().T

Unnamed: 0,Windows 11,Englisch,Chinesisch (Mandarin),Python,MATLAB,Java,MySQL,SQLBase,Microsoft SQL Server,Google Cloud Platform,MongoDB,JSON,Docker,Statische Codeanalyse,Power BI,Postman,PowerShell,Github actions
241,0,1,0,1,0,1,1,0,1,0,0,1,0,1,0,0,0,0


### Preprocessing function


In [49]:
pd.DataFrame.sparse.from_spmatrix(daten[2], columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
1,0.0,0.00000,1.547068,0.000000,0.000000,2.823361,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
2,0.0,0.00000,0.000000,2.089392,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,1.291885,0.0
3,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
4,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
400,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
401,0.0,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0
402,0.0,3.60352,1.547068,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.0


### Single setup validation

In [None]:
daten =  implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1 = 20, B = 0.4)

In [None]:
model_als = implicit.als.AlternatingLeastSquares(factors = 30, random_state= 42, alpha= 1, regularization= 0.01, iterations= 15)

ndcg_train_test_results = []
for r_seed in range(0,5):
    train_mat, test_mat = implicit.evaluation.train_test_split(daten, train_percentage =  0.8, random_state = r_seed)
    model_als.fit(train_mat)


    #############
    # measure evaluation metrixs for each possible hyperparameter combination
    #############
    var_ndcg_at_k = ndcg_at_k(model_als, train_user_items=train_mat, test_user_items=test_mat, K=5)
    ndcg_train_test_results.append(var_ndcg_at_k)

print(np.mean(ndcg_train_test_results))
    

100%|██████████| 15/15 [00:01<00:00,  7.74it/s]
100%|██████████| 377/377 [00:00<00:00, 4010.88it/s]
100%|██████████| 15/15 [00:01<00:00,  7.69it/s]
100%|██████████| 378/378 [00:00<00:00, 4345.90it/s]
100%|██████████| 15/15 [00:01<00:00,  7.86it/s]
100%|██████████| 367/367 [00:00<00:00, 4170.56it/s]
100%|██████████| 15/15 [00:01<00:00,  7.87it/s]
100%|██████████| 379/379 [00:00<00:00, 3828.50it/s]
100%|██████████| 15/15 [00:02<00:00,  7.46it/s]
100%|██████████| 374/374 [00:00<00:00, 4453.40it/s]

0.44710373923827795



