## Implicit Library

In [1]:
import implicit
import h5py
import pandas as pd
import numpy as np
import random
import statistics
from implicit.evaluation import train_test_split
from sklearn.metrics import ndcg_score
from numpy.random import permutation
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)
import scipy
from scipy.sparse import csr_matrix
import scipy.sparse
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import precision_at_k,mean_average_precision_at_k, ndcg_at_k
from sklearn.model_selection import KFold
from random import shuffle


  from .autonotebook import tqdm as notebook_tqdm


## Preprocessing

In [2]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [3]:

data = df

In [4]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [5]:
data = df

In [6]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [7]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Normalizing the rows

In [8]:
data_skills  = matrix
data_skills 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In the next step we will compute the item-item relationships of our skills. Our final goal here is to construct a new item by item matrix containing the weights (relationships) between each of our skills where a perfect correlation equals 1 and no correlation at all equals 0.
<br>
In order to do so, we will first normalize the user vectors. The idea behind this approach is, that a user with many skills contributes less to any individual skill. For example if a user, that rules only 3 skills, rules a skill X it is more valueable than if a user, that rules 20 skills, that particular skill.


* First we caclulat the magnitude for every user

In [9]:
# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_skills).sum(axis=1))

In [10]:
magnitude

0       8.774964
1       8.602325
2       5.567764
3       5.567764
4       1.000000
         ...    
399     2.828427
400     4.358899
401     3.000000
402     2.645751
403    12.206556
Length: 404, dtype: float64

* Now we use a users magnitude to normalize the ratings of this corresponding user

In [11]:
data_skills_row_norm=  data_skills.divide(magnitude, axis='index')

In [12]:
data_skills_row_norm

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,0.116248,0.000000,0.000000,0.116248,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,0.179605,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.179605,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,0.377964,0.377964,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [13]:
data_skills_row_norm_csr = scipy.sparse.csr_matrix(data_skills_row_norm.values)


### Bm25

In [14]:
data_skills

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
data_skills_csr = scipy.sparse.csr_matrix(data_skills.values)

In [16]:
data_skills_csr_weighted = implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1=100, B=0.8)

In [17]:
pd.DataFrame.sparse.from_spmatrix(data_skills_csr_weighted, columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,1.302587,0.000000,0.000000,2.37719,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,3.378903,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2.089198,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,11.987697,5.146573,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [18]:
df = df.drop(columns=["category"])

In [19]:
data = df
data.head(5)

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS


## Baseline

In [20]:
most_common_skills = matrix.sum(axis=0).nlargest(5)	
for i in most_common_skills.index:
    print(i)

Englisch
MySQL
SCRUM
GIT
HTML


In [21]:
p_at_5_baseline_list  = []
for i in range(0,len(matrix)):
    user_series = matrix.loc[i]
    user_know_skills = user_series[user_series > 0]
    user_known_skills_vs_most_common = user_know_skills.index.isin(most_common_skills.index)
    matches = user_known_skills_vs_most_common.sum()
    results =matches/5
    p_at_5_baseline_list.append(results)
 
p_at_5_baseline = statistics.mean(p_at_5_baseline_list)
p_at_5_baseline


0.6891089108910892

Wie aussagekräftig ist p at k ? Ein recommender, der immmer die häufigsten Skills vorschlägt ist nicht zielführend, hat haben einen hohen p_at_5 score

# Testing different algorithms

## AlternatingLeastSquares

In [20]:
from implicit.evaluation import leave_k_out_split
from scipy.sparse import vstack

In [21]:
folds = 4

results = pd.DataFrame()
def cross_validation_implicit_als(data):
    results_over_all = pd.DataFrame(columns=["factors","alpha","regularization","iterations","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [5,10,20,50,70]
    alpha_list = [0.5,1.0, 1.5]
    regularization_list = [0.005,0.01,0.02]
    iterations_list =  [20,30, 50,70]

    for i in iterations_list:
        for f in factor_list:
            for a in alpha_list:
                for r in regularization_list:
                    avg_p_at_k_list = []
                    avg_map_at_k_list = []
                    avg_ndcg_at_k_list = []

                    #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                    # However p_at_k is not affected by the shuffle and stays consistent.
                    skf = KFold(n_splits= folds , shuffle=False) 

                    for r_seed in range(0,5):
                        # Lists to store the results for each random_seed
                        p_at_k_list = []
                        map_at_k_list = []
                        ndcg_at_k_list = []
                        model = implicit.als.AlternatingLeastSquares(factors = f, random_state= r_seed, alpha= a, regularization= r, iterations= i)

                        ###########################
                        # Split the data
                        ###########################
                        for train_index, test_index in skf.split(data):
                            X_train = data[train_index]
                            X_test = data[test_index]
                            
                            return X_train, X_test
    #                         kfold_data = scipy.sparse.vstack((X_train, X_test))
                            
    #                         train_mat, test_mat = implicit.evaluation.leave_k_out_split(ratings = kfold_data , K= 4 , random_state =1)
    #                         ###########################
    #                         # Shuffle manually
    #                         ###########################

    #                         index = np.arange(np.shape(train_mat)[0])
    #                         np.random.RandomState(seed = 1).shuffle(index)
                            
    #                         train_mat = train_mat[index, :] 
    #                         model.fit(train_mat)

    #                         #############
    #                         # Fifth Step: We predict on the scaled test data
    #                         #############
    #                         p_at_k = precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
    #                         map_at_k  = mean_average_precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
    #                         var_ndcg_at_k = ndcg_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)

    #                         # results to list
    #                         p_at_k_list.append(p_at_k)
    #                         map_at_k_list.append(map_at_k) 
    #                         ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


    #                     # store the results for each random_seed
    #                     # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
    #                     # Store the averages per random_seed in list. 
    #                     avg_p_at_k_list.append(statistics.mean(p_at_k_list))
    #                     avg_map_at_k_list.append(statistics.mean(map_at_k_list))
    #                     avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                    
    #                 row = {
    #                     "factors": f,
    #                     "alpha": a,
    #                     "regularization": r,
    #                     "iterations": i,
    #                     "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
    #                     "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
    #                     "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
    #                     }
    #                 results_per_run = pd.DataFrame([row])
    #                 results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    # return results_over_all
                
    #           #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


In [26]:
folds = 4

results = pd.DataFrame()
def cross_validation_implicit_als(data):
    results_over_all = pd.DataFrame(columns=["factors","alpha","regularization","iterations","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [20,50,70]
    alpha_list = [0.5,1.0, 1.5]
    regularization_list = [0.01,0.02]
    iterations_list =  [30, 50,70]

    for i in iterations_list:
        for f in factor_list:
            for a in alpha_list:
                for r in regularization_list:
                    avg_p_at_k_list = []
                    avg_map_at_k_list = []
                    avg_ndcg_at_k_list = []

                    #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                    # However p_at_k is not affected by the shuffle and stays consistent.
                     

                    for r_seed in range(0,5):
                        # Lists to store the results for each random_seed
                        p_at_k_list = []
                        map_at_k_list = []
                        ndcg_at_k_list = []
                        model = implicit.als.AlternatingLeastSquares#(factors = f, random_state= r_seed, alpha= a, regularization= r, iterations= i)

                        ##########################
                        #Split the data
                        ##########################

                        train_mat, test_mat = implicit.evaluation.leave_k_out_split(ratings = data , K= 4 , random_state =1)
                        ###########################
                        # Shuffle manually
                        ###########################

                        index = np.arange(np.shape(train_mat)[0])
                        np.random.RandomState(seed = 1).shuffle(index)
                        
                        train_mat = train_mat[index, :] 
                        model.fit(train_mat)

                        #############
                        # Fifth Step: We predict on the scaled test data
                        #############
                        p_at_k = precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                        map_at_k  = mean_average_precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                        var_ndcg_at_k = ndcg_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)

                        # results to list
                        p_at_k_list.append(p_at_k)
                        map_at_k_list.append(map_at_k) 
                        ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                    # store the results for each random_seed
                    # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                    # Store the averages per random_seed in list. 
                    avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                    avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                    avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                
                row = {
                    "factors": f,
                    "alpha": a,
                    "regularization": r,
                    "iterations": i,
                    "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                    "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                    "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                    }
                results_per_run = pd.DataFrame([row])
                results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all
            
            #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


In [None]:
folds = 4

results = pd.DataFrame()
def cross_validation_implicit_als(data):
    results_over_all = pd.DataFrame(columns=["factors","alpha","regularization","iterations","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [5,10,20,50,70]
    alpha_list = [0.5,1.0, 1.5]
    regularization_list = [0.005,0.01,0.02]
    iterations_list =  [20,30, 50,70]

    for i in iterations_list:
        for f in factor_list:
            for a in alpha_list:
                for r in regularization_list:
                    avg_p_at_k_list = []
                    avg_map_at_k_list = []
                    avg_ndcg_at_k_list = []

                    #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                    # However p_at_k is not affected by the shuffle and stays consistent.
                    skf = KFold(n_splits= folds , shuffle=False) 

                    for r_seed in range(0,5):
                        # Lists to store the results for each random_seed
                        p_at_k_list = []
                        map_at_k_list = []
                        ndcg_at_k_list = []
                        model = implicit.als.AlternatingLeastSquares(factors = f, random_state= r_seed, alpha= a, regularization= r, iterations= i)

                        ###########################
                        # Split the data
                        ###########################
                        for train_index, test_index in skf.split(data):
                            X_train = data[train_index]
                            X_test = data[test_index]
                            

                            
                            
                            ###########################
                            # Shuffle manually
                            ###########################

                            index = np.arange(np.shape(X_train)[0])
                            np.random.RandomState(seed = 1).shuffle(index)
                            
                            X_train = X_train[index, :] 
                            model.fit(X_train)

                            #############
                            # Fifth Step: We predict on the scaled test data
                            #############
                            p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                            map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                            var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                            # results to list
                            p_at_k_list.append(p_at_k)
                            map_at_k_list.append(map_at_k) 
                            ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                        # store the results for each random_seed
                        # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                        # Store the averages per random_seed in list. 
                        avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                        avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                        avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                    
                    row = {
                        "factors": f,
                        "alpha": a,
                        "regularization": r,
                        "iterations": i,
                        "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                        "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                        "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                        }
                    results_per_run = pd.DataFrame([row])
                    results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all
                
              #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


SyntaxError: invalid syntax (1449821823.py, line 72)

In [27]:
results_of_als_grid = cross_validation_implicit_als(data_skills_row_norm_csr)
results_of_als_grid

100%|██████████| 30/30 [00:03<00:00,  9.27it/s]
100%|██████████| 365/365 [00:00<00:00, 3686.91it/s]
100%|██████████| 365/365 [00:00<00:00, 4147.88it/s]
100%|██████████| 365/365 [00:00<00:00, 3883.05it/s]
100%|██████████| 30/30 [00:03<00:00,  7.76it/s]
100%|██████████| 365/365 [00:00<00:00, 2744.34it/s]
100%|██████████| 365/365 [00:00<00:00, 2786.42it/s]
100%|██████████| 365/365 [00:00<00:00, 3174.02it/s]
100%|██████████| 30/30 [00:03<00:00,  9.70it/s]
100%|██████████| 365/365 [00:00<00:00, 2588.52it/s]
100%|██████████| 365/365 [00:00<00:00, 3041.72it/s]
100%|██████████| 365/365 [00:00<00:00, 2991.84it/s]
100%|██████████| 30/30 [00:03<00:00,  8.87it/s]
100%|██████████| 365/365 [00:00<00:00, 4451.41it/s]
100%|██████████| 365/365 [00:00<00:00, 3443.31it/s]
100%|██████████| 365/365 [00:00<00:00, 4562.57it/s]
100%|██████████| 30/30 [00:03<00:00,  9.95it/s]
100%|██████████| 365/365 [00:00<00:00, 3802.16it/s]
100%|██████████| 365/365 [00:00<00:00, 4345.39it/s]
100%|██████████| 365/365 [00:00<

Unnamed: 0,factors,alpha,regularization,iterations,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,20,0.5,0.02,30,0.028767,0.014098,0.027426
1,20,1.0,0.02,30,0.036986,0.018151,0.03521
2,20,1.5,0.02,30,0.036301,0.016096,0.031528
3,50,0.5,0.02,30,0.022603,0.013276,0.023935
4,50,1.0,0.02,30,0.013699,0.0071,0.01359
5,50,1.5,0.02,30,0.019178,0.007694,0.016339
6,70,0.5,0.02,30,0.012329,0.006119,0.011945
7,70,1.0,0.02,30,0.017808,0.009167,0.017238
8,70,1.5,0.02,30,0.016438,0.008219,0.015727
9,20,0.5,0.02,50,0.034247,0.016176,0.032137


### Results to csv

In [31]:
results_of_als_grid.to_csv("results_of_als_grid.csv", index=False)

### Best Params

In [28]:
results_of_als_grid.p_at_k_mean.idxmax()
results_of_als_grid.iloc[results_of_als_grid.p_at_k_mean.idxmax()]

factors                 20
alpha                  1.0
regularization        0.02
iterations              30
p_at_k_mean       0.036986
map_at_k_mean     0.018151
ndcg_at_k_mean     0.03521
Name: 1, dtype: object

In [29]:
results_of_als_grid.iloc[results_of_als_grid.map_at_k_mean.idxmax()]

factors                 20
alpha                  1.0
regularization        0.02
iterations              30
p_at_k_mean       0.036986
map_at_k_mean     0.018151
ndcg_at_k_mean     0.03521
Name: 1, dtype: object

In [30]:
results_of_als_grid.iloc[results_of_als_grid.ndcg_at_k_mean.idxmax()]

factors                 20
alpha                  1.0
regularization        0.02
iterations              30
p_at_k_mean       0.036986
map_at_k_mean     0.018151
ndcg_at_k_mean     0.03521
Name: 1, dtype: object

## BayesianPersonalizedRanking

In [None]:

folds = 4

results = pd.DataFrame()
def cross_validation_implicit_bpr(data, algo):
    results_over_all = pd.DataFrame(columns=["factors","regularization","iterations","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [5,10,20,50,70]
    iteration_list = [20,30, 50,70]
    regularization_list = [0.005,0.01,0.02]


    for f in factor_list:
        for i in iteration_list:
            for r in regularization_list:
                avg_p_at_k_list = []
                avg_map_at_k_list = []
                avg_ndcg_at_k_list = []

                #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                # However p_at_k is not affected by the shuffle and stays consistent.
                skf = KFold(n_splits= folds , shuffle=False) 

                for r_seed in range(0,5):
                    # Lists to store the results for each random_seed
                    p_at_k_list = []
                    map_at_k_list = []
                    ndcg_at_k_list = []
                    model = implicit.bpr.BayesianPersonalizedRanking(factors = f, random_state= r_seed, regularization= r, iterations= i)

                    ###########################
                    # Split the data
                    ###########################
                    for train_index, test_index in skf.split(data):
                        X_train = data[train_index]
                        X_test = data[test_index]
                        
                        ###########################
                        # Shuffle manually
                        ###########################

                        index = np.arange(np.shape(X_train)[0])
                        np.random.RandomState(seed = 1).shuffle(index)
                        
                        X_train = X_train[index, :] 
                        model.fit(X_train)

                        #############
                        # Fifth Step: We predict on the scaled test data
                        #############
                        p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                        # results to list
                        p_at_k_list.append(p_at_k)
                        map_at_k_list.append(map_at_k) 
                        ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                    # store the results for each random_seed
                    # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                    # Store the averages per random_seed in list. 
                    avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                    avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                    avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                
                row = {
                    "factors": f,
                    "regularization": r,
                    "iterations": i,
                    "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                    "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                    "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                    }
                results_per_run = pd.DataFrame([row])
                results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all
                
                #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


In [None]:
results_of_bpr_grid = cross_validation_implicit_bpr(data_skills_row_norm_csr)
results_of_bpr_grid

100%|██████████| 100/100 [00:00<00:00, 106.31it/s, train_auc=75.46%, skipped=39.44%]
100%|██████████| 101/101 [00:00<00:00, 3050.51it/s]
100%|██████████| 101/101 [00:00<00:00, 2589.08it/s]
100%|██████████| 101/101 [00:00<00:00, 2678.05it/s]
100%|██████████| 100/100 [00:01<00:00, 89.78it/s, train_auc=79.00%, skipped=39.19%]
100%|██████████| 101/101 [00:00<00:00, 2936.78it/s]
100%|██████████| 101/101 [00:00<00:00, 2340.87it/s]
100%|██████████| 101/101 [00:00<00:00, 3054.30it/s]
100%|██████████| 100/100 [00:00<00:00, 119.20it/s, train_auc=78.19%, skipped=40.09%]
100%|██████████| 101/101 [00:00<00:00, 2941.16it/s]
100%|██████████| 101/101 [00:00<00:00, 2657.74it/s]
100%|██████████| 101/101 [00:00<00:00, 2494.11it/s]
100%|██████████| 100/100 [00:01<00:00, 92.10it/s, train_auc=79.09%, skipped=38.65%]
100%|██████████| 101/101 [00:00<00:00, 3323.46it/s]
100%|██████████| 101/101 [00:00<00:00, 3151.83it/s]
100%|██████████| 101/101 [00:00<00:00, 2920.32it/s]
100%|██████████| 100/100 [00:00<00:00,

Unnamed: 0,factors,regularization,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,1,0.005,0.424007,0.344604,0.420083
1,1,0.01,0.423791,0.344165,0.42029
2,1,0.02,0.424949,0.344927,0.421517
3,5,0.005,0.254302,0.186533,0.253958
4,5,0.01,0.25388,0.185932,0.253187
5,5,0.02,0.260535,0.192172,0.259911
6,10,0.005,0.225367,0.159818,0.225753
7,10,0.01,0.223653,0.160471,0.225264
8,10,0.02,0.233774,0.168269,0.234268
9,20,0.005,0.208391,0.144956,0.206501


### Results to Csv

In [None]:
results_of_bpr_grid.to_csv("results_of_bpr_grid.csv", index=False)

### Best Params

In [None]:
results_of_bpr_grid.p_at_k_mean.idxmax()
results_of_bpr_grid.iloc[results_of_bpr_grid.p_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_bpr_grid.iloc[results_of_bpr_grid.map_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_bpr_grid.iloc[results_of_bpr_grid.ndcg_at_k_mean.idxmax()]

factors                 50
epochs                  50
regularization        0.02
p_at_k_mean       0.292184
map_at_k_mean     0.203665
ndcg_at_k_mean    0.275885
Name: 44, dtype: object

## LogisticMatrixFactorization

In [None]:

folds = 4

results = pd.DataFrame()
def cross_validation_implicit_lmf(data):
    results_over_all = pd.DataFrame(columns=["factors","epochs","regularization","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [5,10,20,50,70]
    iteration_list = [20,30, 50,70]
    regularization_list = [0.005,0.01,0.02]


    for f in factor_list:
        for i in iteration_list:
            for r in regularization_list:
                avg_p_at_k_list = []
                avg_map_at_k_list = []
                avg_ndcg_at_k_list = []

                #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                # However p_at_k is not affected by the shuffle and stays consistent.
                skf = KFold(n_splits= folds , shuffle=False) 

                for r_seed in range(0,5):
                    # Lists to store the results for each random_seed
                    p_at_k_list = []
                    map_at_k_list = []
                    ndcg_at_k_list = []
                    model = implicit.lmf.LogisticMatrixFactorization(factors = f, random_state= r_seed, iterations = i, regularization= r)

                    ###########################
                    # Split the data
                    ###########################
                    for train_index, test_index in skf.split(data):
                        X_train = data[train_index]
                        X_test = data[test_index]
                        
                        ###########################
                        # Shuffle manually
                        ###########################

                        index = np.arange(np.shape(X_train)[0])
                        np.random.RandomState(seed = 1).shuffle(index)
                        
                        X_train = X_train[index, :] 
                        model.fit(X_train)

                        #############
                        # Fifth Step: We predict on the scaled test data
                        #############
                        p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                        # results to list
                        p_at_k_list.append(p_at_k)
                        map_at_k_list.append(map_at_k) 
                        ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                    # store the results for each random_seed
                    # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                    # Store the averages per random_seed in list. 
                    avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                    avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                    avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                
                row = {
                    "factors": f,
                    "epochs": i,
                    "regularization": r,
                    "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                    "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                    "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                    }
                results_per_run = pd.DataFrame([row])
                results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all

In [None]:
results_of_lmf_grid = cross_validation_implicit_lmf(data_skills_row_norm_csr)
results_of_lmf_grid

100%|██████████| 20/20 [00:00<00:00, 66.67it/s] 
100%|██████████| 101/101 [00:00<00:00, 1074.41it/s]
100%|██████████| 101/101 [00:00<00:00, 1312.25it/s]
100%|██████████| 101/101 [00:00<00:00, 1442.85it/s]
100%|██████████| 20/20 [00:00<00:00, 150.38it/s]
100%|██████████| 101/101 [00:00<00:00, 2463.56it/s]
100%|██████████| 101/101 [00:00<00:00, 2244.29it/s]
100%|██████████| 101/101 [00:00<00:00, 2526.01it/s]
100%|██████████| 20/20 [00:00<00:00, 166.67it/s]
100%|██████████| 101/101 [00:00<00:00, 2657.62it/s]
100%|██████████| 101/101 [00:00<00:00, 2805.48it/s]
100%|██████████| 101/101 [00:00<00:00, 2348.92it/s]
100%|██████████| 20/20 [00:00<00:00, 170.94it/s]
100%|██████████| 101/101 [00:00<00:00, 2657.97it/s]
100%|██████████| 101/101 [00:00<00:00, 2730.05it/s]
100%|██████████| 101/101 [00:00<00:00, 3060.39it/s]
100%|██████████| 20/20 [00:00<00:00, 162.61it/s]
100%|██████████| 101/101 [00:00<00:00, 2807.04it/s]
100%|██████████| 101/101 [00:00<00:00, 2295.72it/s]
100%|██████████| 101/101 [0

Unnamed: 0,factors,epochs,regularization,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,5,20,0.005,0.211345,0.132021,0.198403
1,5,20,0.01,0.226556,0.145399,0.214212
2,5,20,0.02,0.23175,0.147755,0.218211
3,5,30,0.005,0.19601,0.125619,0.18524
4,5,30,0.01,0.198243,0.126151,0.186596
5,5,30,0.02,0.214959,0.136981,0.202376
6,5,50,0.005,0.161072,0.099616,0.148457
7,5,50,0.01,0.171202,0.110067,0.1617
8,5,50,0.02,0.189409,0.121717,0.177791
9,5,70,0.005,0.142384,0.090614,0.134679


In [None]:
results_of_lmf_grid.to_csv("results_of_lmf_grid.csv", index=False)

In [None]:
results_of_lmf_grid.dtypes

factors            object
epochs             object
regularization    float64
p_at_k_mean       float64
map_at_k_mean     float64
ndcg_at_k_mean    float64
dtype: object

In [None]:
results_of_lmf_grid.describe()

Unnamed: 0,regularization,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
count,60.0,60.0,60.0,60.0
mean,0.011667,0.21607,0.140182,0.201202
std,0.006289,0.049113,0.037761,0.046637
min,0.005,0.129559,0.080903,0.117947
25%,0.005,0.173136,0.112629,0.163186
50%,0.01,0.213593,0.134124,0.199056
75%,0.02,0.250388,0.160673,0.233339
max,0.02,0.294134,0.205017,0.275885


### Best Params

In [None]:
results_of_lmf_grid.p_at_k_mean.idxmax()
results_of_lmf_grid.iloc[results_of_lmf_grid.p_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_lmf_grid.iloc[results_of_lmf_grid.map_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_lmf_grid.iloc[results_of_lmf_grid.ndcg_at_k_mean.idxmax()]

factors                 50
epochs                  50
regularization        0.02
p_at_k_mean       0.292184
map_at_k_mean     0.203665
ndcg_at_k_mean    0.275885
Name: 44, dtype: object

In [None]:
# train the model on a sparse matrix of user/item/confidence weights
train, test = train_test_split(data_skills_row_norm_csr, train_percentage =  0.8, random_state = 1)
model = implicit.lmf.LogisticMatrixFactorization(factors = 70, random_state=1,iterations = 50, regularization= 0.02)
model.fit(train)


100%|██████████| 50/50 [00:04<00:00, 10.86it/s]


In [None]:
test?

[1;31mType:[0m        csr_matrix
[1;31mString form:[0m
(0, 165)	0.11396057645963795
           (0, 242)	0.11396057645963795
           (0, 243)	0.11396057645963795
           (0 <...>  (403, 629)	0.08192319205190406
           (403, 643)	0.08192319205190406
           (403, 652)	0.08192319205190406
[1;31mFile:[0m        c:\users\maximus\envs\ds_ap\lib\site-packages\scipy\sparse\_csr.py
[1;31mDocstring:[0m  
Compressed Sparse Row matrix

This can be instantiated in several ways:
    csr_matrix(D)
        with a dense matrix or rank-2 ndarray D

    csr_matrix(S)
        with another sparse matrix S (equivalent to S.tocsr())

    csr_matrix((M, N), [dtype])
        to construct an empty matrix with shape (M, N)
        dtype is optional, defaulting to dtype='d'.

    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
        where ``data``, ``row_ind`` and ``col_ind`` satisfy the
        relationship ``a[row_ind[k], col_ind[k]] = data[k]``.

    csr_matrix((data, indices, ind

In [None]:
matrix.iloc[0].sum()


77

In [None]:
user_id  = 0
skill_ids, scores = model.recommend(user_id, data_skills_row_norm_csr[user_id], N = 10 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,OpenOffice / LibreOffice,0.580505,False
1,Subversion,0.305606,True
2,PowerShell,0.297782,False
3,JSON,0.220934,False
4,Windows NT/2000/XP,0.176941,True
5,Spanisch,0.072854,False
6,Wasserfallmodel,-0.120552,False
7,Android,-0.179944,True
8,Windows 10,-0.21522,True
9,JavaScript,-0.330638,True


In [None]:
recommendations_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,already_liked,category
0,OpenOffice / LibreOffice,0.580505,False,Produkterfahrung
1,Subversion,0.305606,True,"CI/CD, Build- und Versionskontrollsysteme"
2,PowerShell,0.297782,False,Technik/Tools
3,JSON,0.220934,False,Webentwicklung
4,Windows NT/2000/XP,0.176941,True,Betriebssystem
5,Spanisch,0.072854,False,Sprachen
6,Wasserfallmodel,-0.120552,False,Projektmanagement / Vorgehensmodelle
7,Android,-0.179944,True,Betriebssystem
8,Windows 10,-0.21522,True,Betriebssystem
9,JavaScript,-0.330638,True,Programmiersprachen / Scriptsprachen


In [None]:
dict_user_skills = {}
for value, name in zip(data_skills_row_norm.iloc[userid,:], data_skills_row_norm.columns):
    if value > 0:
        dict_user_skills[name] = value
    else:
        next

dict_user_skills

{'Android': 0.23570226039551587,
 'Apache': 0.23570226039551587,
 'GIT': 0.23570226039551587,
 'JavaScript': 0.23570226039551587,
 'Joomla': 0.23570226039551587,
 'Microsoft SQL Server': 0.23570226039551587,
 'MySQL': 0.23570226039551587,
 'Oracle': 0.23570226039551587,
 'PHP': 0.23570226039551587,
 'PhpStorm': 0.23570226039551587,
 'Pimcore': 0.23570226039551587,
 'SUSE Linux': 0.23570226039551587,
 'Subversion': 0.23570226039551587,
 'Symfony': 0.23570226039551587,
 'Ubuntu': 0.23570226039551587,
 'Windows 10': 0.23570226039551587,
 'Windows 7': 0.23570226039551587,
 'Windows NT/2000/XP': 0.23570226039551587}

### Baseline

In [None]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors = 1,random_state=1,regularization= 0.005) 

In [None]:
# train the model on a sparse matrix of user/item/confidence weights
model.fit(data_skills_row_norm_csr)

100%|██████████| 15/15 [00:00<00:00, 43.00it/s]


In [None]:
userid= 13
dict_user_skills = {}
for value, name in zip(data_skills_row_norm.iloc[userid,:], data_skills_row_norm.columns):
    if value > 0:
        dict_user_skills[name] = value
    else:
        next

dict_user_skills

{'Android': 0.23570226039551587,
 'Apache': 0.23570226039551587,
 'GIT': 0.23570226039551587,
 'JavaScript': 0.23570226039551587,
 'Joomla': 0.23570226039551587,
 'Microsoft SQL Server': 0.23570226039551587,
 'MySQL': 0.23570226039551587,
 'Oracle': 0.23570226039551587,
 'PHP': 0.23570226039551587,
 'PhpStorm': 0.23570226039551587,
 'Pimcore': 0.23570226039551587,
 'SUSE Linux': 0.23570226039551587,
 'Subversion': 0.23570226039551587,
 'Symfony': 0.23570226039551587,
 'Ubuntu': 0.23570226039551587,
 'Windows 10': 0.23570226039551587,
 'Windows 7': 0.23570226039551587,
 'Windows NT/2000/XP': 0.23570226039551587}

### Create a df that holds the recommendations and the scores 
* Die Scores sind in der implicip library equivalent zu der confidence. Die Confidence gibt an, wie häufig ein item x konsumiert wurde. Allerdings soll die Intensität, mit der dieser Effekt steigt, sukkzessive abnehmen, um den Einfluss eines Superfans (jmd der überdurchschnittkich viel von Item x konsumiert) zu schwächen. Hier sind parallelen zu den Motiven der Row-Normalisation zu erkennen. <br>
Ich habe im ersten Lauf die Werte meiner row-normalisation als confidence werte verwendet. <br>
    * Sollte nach angepasst werden. Hierzu: https://en.wikipedia.org/wiki/Okapi_BM25

Maybe it makes sense <br>
"
Confidence can be defined as the worth or the value we give to the interaction. For User A buying(a transaction event) item X we increase the interaction weight, while User A viewing item Z has lesser weight than the ‘interaction of buying’. https://towardsdatascience.com/alternating-least-square-for-implicit-dataset-with-code-8e7999277f4b " 

In [None]:
skill_ids, scores  = model.recommend(userid, data_skills_row_norm_csr[userid], N = 20 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,Englisch,0.109891,False
1,MySQL,0.108704,True
2,HTML,0.081065,False
3,JavaScript,0.072065,True
4,SQL,0.071913,False
5,GIT,0.06957,True
6,SCRUM,0.06155,False
7,Test Driven Development (TDD),0.056983,False
8,Java,0.056517,False
9,JIRA,0.056304,False


In [None]:
skill_and_categories = df[["skill","category"]].drop_duplicates()

KeyError: "['category'] not in index"

In [None]:
recommendations_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,already_liked,category
0,MySQL,0.44042,True,Datenbank
1,PHP,0.422259,True,Webentwicklung
2,Windows NT/2000/XP,0.38084,True,Betriebssystem
3,Apache,0.380116,True,Applikationsserver
4,TYPO3 Generalist,0.21658,False,TYPO3
5,PhpStorm,0.204278,True,Umgebungen
6,Windows 7,0.195201,True,Betriebssystem
7,Java,0.185855,False,Programmiersprachen / Scriptsprachen
8,Englisch,0.185376,False,Sprachen
9,C/C++,0.167171,False,Programmiersprachen / Scriptsprachen


In [None]:
# Id and skills
dict_skills_id = {}
for c,skill in enumerate(data_skills_row_norm.columns):
        dict_skills_id[c] = skill

dict_skills_id

{0: '.NET Compact Framework',
 1: '.NET Core',
 2: '.NET Framework',
 3: '3D-Modellierung',
 4: 'ABAP',
 5: 'ADO.NET',
 6: 'AIX',
 7: 'ARIS',
 8: 'ARIS ITArchitect',
 9: 'AS400',
 10: 'ASP Generalist',
 11: 'ASP.NET',
 12: 'ASP.NET MVC-Framework',
 13: 'ASP.NET WebAPI',
 14: 'AWS',
 15: 'AWS Lambda',
 16: 'Abstract',
 17: 'Access',
 18: 'Accessibility / WCAG',
 19: 'Active Directory',
 20: 'ActiveX',
 21: 'Adobe CC',
 22: 'Adobe Flash',
 23: 'Adobe Illustrator',
 24: 'Adobe InDesign',
 25: 'Adobe Indesign',
 26: 'Adobe Photoshop',
 27: 'Adobe Premiere',
 28: 'Adobe XD',
 29: 'After Effects',
 30: 'Agile Methoden',
 31: 'Ajax',
 32: 'Alexa-Skills',
 33: 'Alpine',
 34: 'Analytics',
 35: 'Android',
 36: 'Android Studio',
 37: 'Anforderungsanalyse',
 38: 'Anforderungsmanagement',
 39: 'Angular (2 und höher)',
 40: 'Angular Generalist',
 41: 'Angular Material',
 42: 'Angular Theming',
 43: 'AngularJS',
 44: 'Animations (transition, @keyframes)',
 45: 'Ansible',
 46: 'Ant',
 47: 'Apache',
 4

In [None]:
keys = [k for k, v in dict_skills_id.items() if v == 'Python']
print(keys)

[458]


### Display simular items

In [None]:
# find related item
skill_id, sim = model.similar_items(0, N=10)
# display the results using pandas for nicer formatting
simularity_df  = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_id], "score": sim})
simularity_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,category
0,.NET Compact Framework,1.0,.NET Frameworks und Tools
1,ADO.NET,0.674586,.NET Frameworks und Tools
2,DevExpress,0.635214,.NET Frameworks und Tools
3,Silverlight,0.632719,.NET Frameworks und Tools
4,Microsoft Enterprise Library,0.632117,.NET Frameworks und Tools
5,ASP Generalist,0.612088,.NET Frameworks und Tools
6,Windows Communication Foundation (WCF),0.599716,.NET Frameworks und Tools
7,Mono,0.578717,.NET Frameworks und Tools
8,Windows Forms,0.566209,.NET Frameworks und Tools
9,Exact,0.548891,ERP


# Add a Evaluation metric to the library

### APK
http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [None]:
###
# Recommendations for user_id = 13
###
recommendations_list = list(recommendations_df.skill)
known_skills_list = list(dict_user_skills.keys())
len(known_skills_list)

18

In [None]:
apk(known_skills_list,recommendations_list,k=20)

0.4013227513227513

## Train-Test split

In [47]:
# train the model on a sparse matrix of user/item/confidence weights
train, test = train_test_split(data_skills_csr_weighted, train_percentage =  0.8, random_state = 1)
model = implicit.als.AlternatingLeastSquares(factors = 70, random_state=1, regularization= 0.02,iterations = 70 )
model.fit(train)

100%|██████████| 70/70 [00:06<00:00, 10.86it/s]


In [34]:
test[0]

<1x735 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [35]:
matrix.iloc[0].sum()

77

In [None]:
user_id  = 3
skill_ids, scores = model.recommend(0, data_skills_row_norm_csr[user_id], N = 4 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,Englisch,0.436307,False
1,MySQL,0.431594,True
2,HTML,0.321858,False
3,JavaScript,0.286125,True


In [None]:
most_common_skills = matrix.sum(axis=0).nlargest(10)	
for i in most_common_skills.index:
    print(i)

Englisch
MySQL
SCRUM
GIT
HTML
JavaScript
JIRA
CSS
SQL
Java


In [36]:
results = precision_at_k(model, train_user_items=train, test_user_items=test, K=4)
results

100%|██████████| 378/378 [00:00<00:00, 4111.21it/s]


0.3347670250896057

In [37]:
results = mean_average_precision_at_k(model, train_user_items=train, test_user_items=test, K=5)
results

100%|██████████| 378/378 [00:00<00:00, 6003.73it/s]


0.21753747795414485

### Notiz
* COnfidence durch Multiplikation erhöhen
* Beste model testen und anhand eines users ausm test set nachvollziehen

The model was fitted on the whole dataset.
Try to do a train test / cross valdiation later on.
* see: https://gist.github.com/jbochi/2e8ddcc5939e70e5368326aa034a144e
* evaluation of the implicit lib: https://github.com/benfred/implicit/blob/main/implicit/evaluation.pyx

In [37]:
model = implicit.als.AlternatingLeastSquares(factors = 20, random_state= 1)

In [43]:
from scipy.sparse import coo_matrix


In [34]:
train, test = implicit.evaluation.leave_k_out_split(data_skills_csr_weighted, K=1, random_state=1)

In [35]:
model.fit(train)

100%|██████████| 15/15 [00:03<00:00,  4.59it/s]


In [36]:
implicit.evaluation.precision_at_k(model, train_user_items=train, test_user_items=test, K=5)

100%|██████████| 374/374 [00:00<00:00, 1490.06it/s]


0.3502673796791444

## Raw

In [156]:
model = implicit.als.AlternatingLeastSquares(factors = 20, random_state= 1)

In [143]:
train, test = implicit.evaluation.leave_k_out_split(data_skills_csr, K=4, random_state=1)

In [172]:
model.fit(train)

100%|██████████| 15/15 [00:02<00:00,  5.74it/s]


In [173]:
implicit.evaluation.train_test_split(data_skills_csr, train_percentage =  0.8, random_state = 1)

[1;31mDocstring:[0m
leave_k_out_split(ratings, int K=1, float train_only_size=0.0, random_state=None)
Implements the 'leave-k-out' split protocol for a ratings matrix. Default
    parameters will produce a 'leave-one-out' split.

    This will create two matrices, one where each eligible user (i.e. user with > K + 1
    ratings) will have a single rating held in the test set, and all other ratings held
    in the train set. Optionally, a percentage of users can be reserved to appear _only_
    in the train set. By default, all eligible users may appear in the test set.

    Parameters
    ----------
    ratings : csr_matrix
        The input ratings CSR matrix to be split.
    K : int
        The total number of samples to be 'left out' in the test set.
    train_only_size : float
        The size (as a fraction) of the users set that should appear *only* in the
        training matrix.
    random_state : int, None or RandomState
        The existing RandomState. If None, or an int, 

In [222]:
for i in range (0,2):
    model = implicit.als.AlternatingLeastSquares(factors = 20, random_state= 42)
    train, test = implicit.evaluation.leave_k_out_split(data_skills_csr, K =5, random_state = np.random.RandomState(seed=1))
    model.fit(train)
    print(model.item_factors[0][0])

100%|██████████| 15/15 [00:01<00:00,  7.53it/s]


0.003988262


100%|██████████| 15/15 [00:01<00:00,  7.93it/s]

0.008470657





In [187]:
model = implicit.als.AlternatingLeastSquares(factors = 20, random_state= 42)
train, test = implicit.evaluation.train_test_split(data_skills_csr, train_percentage =  0.8, random_state = 1)
train

<404x735 sparse matrix of type '<class 'numpy.int64'>'
	with 19341 stored elements in Compressed Sparse Row format>

In [219]:
user_0_train = train[0]
user_0_test = test[0]
print(f'Wert für user {0} bzgl des Skills {matrix.columns[165]}  im Trainset: {user_0_train[0,165]} und im Testset: {user_0_test[0,165]}')

Wert für user 0 bzgl des Skills Docker  im Trainset: 0 und im Testset: 1


In [206]:
user_0_test?

[1;31mType:[0m        csr_matrix
[1;31mString form:[0m
  (0, 165)	1
  (0, 242)	1
  (0, 243)	1
  (0, 268)	1
  (0, 276)	1
  (0, 288)	1
  (0, 294)	1
  (0, 334)	1
  (0, 345)	1
  (0, 378)	1
  (0, 638)	1
  (0, 649)	1
  (0, 669)	1
  (0, 709)	1
[1;31mFile:[0m        c:\users\maximus\envs\ds_ap\lib\site-packages\scipy\sparse\_csr.py
[1;31mDocstring:[0m  
Compressed Sparse Row matrix

This can be instantiated in several ways:
    csr_matrix(D)
        with a dense matrix or rank-2 ndarray D

    csr_matrix(S)
        with another sparse matrix S (equivalent to S.tocsr())

    csr_matrix((M, N), [dtype])
        to construct an empty matrix with shape (M, N)
        dtype is optional, defaulting to dtype='d'.

    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
        where ``data``, ``row_ind`` and ``col_ind`` satisfy the
        relationship ``a[row_ind[k], col_ind[k]] = data[k]``.

    csr_matrix((data, indices, indptr), [shape=(M, N)])
        is the standard CSR representatio

In [147]:
implicit.evaluation.precision_at_k(model, train_user_items=train, test_user_items=test, K=5)

100%|██████████| 365/365 [00:00<00:00, 3016.12it/s]


0.33493150684931505

In [82]:
data_skills_row_norm.columns[165]

'Docker'

In [84]:
user_id  = 0
skill_ids, scores = model.recommend(user_id, train[user_id], N = 10 , filter_already_liked_items=True)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, train[user_id].indices)})
recommendations_df


Unnamed: 0,skill,score,already_liked
0,Test Driven Development (TDD),1.040657,False
1,Docker,0.798348,False
2,CSS,0.553093,False
3,XML/XSL,0.55143,False
4,Python,0.540794,False
5,Windows Vista,0.539017,False
6,Tomcat,0.504025,False
7,MongoDB,0.498637,False
8,FileZilla,0.497881,False
9,SOAP,0.476021,False


In [85]:
recommendations_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

NameError: name 'skill_and_categories' is not defined

## Debugging

In [294]:
def cross_validation_implicit_als(data, algorithm, algo_name):
    results_over_all = pd.DataFrame(columns=["algo_name", "factors","alpha","regularization","iterations","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [20]
    alpha_list = [1.0]
    regularization_list = [0.01]
    iterations_list =  [15]

    for i in iterations_list:
        for f in factor_list:
            for a in alpha_list:
                for r in regularization_list:

                    # Storing the different scores for each algorithm initialization under a random_seed (c.p)
                    p_algo_results = [] 
                    map_algo_results = []     
                    ndcg_algo_results = []

                    for r_seed in range(0,5):
                        # Storing the different scores for each train_test_split under a random_seed (c.p)
                        p_train_test_results = []
                        map_train_test_results = []
                        ndcg_train_test_results = []
 
                        #############
                        # Initiate the model
                        #############
                        model = algorithm(factors = f, random_state= r_seed, alpha= a, regularization= r, iterations= i)

                        for r_seed in range(0,5):
                            #############
                            # Initiate the train_test_split
                            #############
                            train_mat, test_mat = implicit.evaluation.train_test_split(data, train_percentage =  0.8, random_state = r_seed)
                            model.fit(train_mat)

                            #############
                            # measure evaluation metrixs for each possible hyperparameter combination
                            #############
                            p_at_k = precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                            map_at_k  = mean_average_precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
                            var_ndcg_at_k = ndcg_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)

                            # results to list 
                            p_train_test_results.append(p_at_k)
                            map_train_test_results.append(map_at_k)
                            ndcg_train_test_results.append(var_ndcg_at_k)
                        
                        # Store the mean for for the different randomseeds in the train_test_split (c.p) in a list
                        p_algo_results.append(np.mean(p_train_test_results))
                        map_algo_results.append(np.mean(map_train_test_results))
                        ndcg_algo_results.append(np.mean(ndcg_train_test_results))

                        
                    row = {
                        "algo_name": algo_name,
                        "factors": f,
                        "alpha": a,
                        "regularization": r,
                        "iterations": i,
                        "p_at_k_mean": statistics.mean(p_algo_results),  
                        "map_at_k_mean":statistics.mean(map_algo_results), 
                        "ndcg_at_k_mean":statistics.mean(ndcg_algo_results),
                        }
                    results_per_run = pd.DataFrame([row])
                    results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all

In [296]:
als  = implicit.als.AlternatingLeastSquares
results_of_als_grid = cross_validation_implicit_als(data_skills_csr,algorithm = als, algo_name = "als")
results_of_als_grid


100%|██████████| 15/15 [00:01<00:00,  7.67it/s]
100%|██████████| 377/377 [00:00<00:00, 3185.92it/s]
100%|██████████| 377/377 [00:00<00:00, 3213.76it/s]
100%|██████████| 377/377 [00:00<00:00, 3422.82it/s]
100%|██████████| 15/15 [00:01<00:00,  8.38it/s]
100%|██████████| 378/378 [00:00<00:00, 2936.88it/s]
100%|██████████| 378/378 [00:00<00:00, 3323.81it/s]
100%|██████████| 378/378 [00:00<00:00, 3227.75it/s]
100%|██████████| 15/15 [00:01<00:00,  8.17it/s]
100%|██████████| 367/367 [00:00<00:00, 3188.20it/s]
100%|██████████| 367/367 [00:00<00:00, 3131.27it/s]
100%|██████████| 367/367 [00:00<00:00, 3007.80it/s]
100%|██████████| 15/15 [00:01<00:00,  8.19it/s]
100%|██████████| 379/379 [00:00<00:00, 2818.58it/s]
100%|██████████| 379/379 [00:00<00:00, 3158.14it/s]
100%|██████████| 379/379 [00:00<00:00, 3158.37it/s]
100%|██████████| 15/15 [00:01<00:00,  8.25it/s]
100%|██████████| 374/374 [00:00<00:00, 3206.82it/s]
100%|██████████| 374/374 [00:00<00:00, 3252.23it/s]
100%|██████████| 374/374 [00:00<

Unnamed: 0,algo_name,factors,alpha,regularization,iterations,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,als,20,1.0,0.01,15,0.52089,0.402772,0.511272


## Passing a class aas an arguemnt

In [273]:
def passing_a_class_as_argument(algo):
    model = algo(random_state= 42)
    train_mat, test_mat = implicit.evaluation.train_test_split(data_skills_csr, train_percentage =  0.8, random_state = 42)
    model.fit(train_mat)
    p_at_k = precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
    return p_at_k



In [274]:
model = implicit.als.AlternatingLeastSquares(random_state= 42)
train_mat, test_mat = implicit.evaluation.train_test_split(data_skills_csr, train_percentage =  0.8, random_state = 42)
model.fit(train_mat)
p_at_k = precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
p_at_k

100%|██████████| 15/15 [00:03<00:00,  4.57it/s]
100%|██████████| 371/371 [00:00<00:00, 3229.21it/s]


0.2318154937906564

In [272]:
passing_a_class_as_argument(implicit.als.AlternatingLeastSquares)

100%|██████████| 15/15 [00:04<00:00,  3.45it/s]
100%|██████████| 371/371 [00:00<00:00, 1515.81it/s]


0.2318154937906564

In [285]:
model = implicit.lmf.LogisticMatrixFactorization(random_state= 42)
train_mat, test_mat = implicit.evaluation.train_test_split(data_skills_csr, train_percentage =  0.8, random_state = 42)
model.fit(train_mat)
p_at_k = precision_at_k(model, train_user_items=train_mat, test_user_items=test_mat, K=5)
p_at_k

100%|██████████| 30/30 [00:02<00:00, 10.72it/s]
100%|██████████| 371/371 [00:00<00:00, 2178.60it/s]


0.39976345357776466

In [286]:
passing_a_class_as_argument(implicit.lmf.LogisticMatrixFactorization)

100%|██████████| 30/30 [00:01<00:00, 29.63it/s]
100%|██████████| 371/371 [00:00<00:00, 2608.74it/s]


0.39976345357776466

## Param grid

In [334]:
param_grid = {'num_factors': [10,20],
              'regularization': [0.0,1],
              'alpha': [1]}


In [335]:
keys, values = zip(*param_grid.items())
keys, values


(('num_factors', 'regularization', 'alpha'), ([10, 20], [0.0, 1], [1]))

In [306]:
import itertools
import copy

In [462]:

for v in itertools.product(*values):
        params = dict(zip(keys, v))
        this_model = copy.deepcopy(implicit.als.AlternatingLeastSquares())
        print_line = []
        print(params)
        
        for k, v in params.items():
            setattr(this_model, k, v)
            print(k,v) 
                   
        #     print_line.append((k, v))



{'num_factors': 10, 'regularization': 0.0, 'alpha': 1}
num_factors 10
regularization 0.0
alpha 1
{'num_factors': 10, 'regularization': 1, 'alpha': 1}
num_factors 10
regularization 1
alpha 1
{'num_factors': 20, 'regularization': 0.0, 'alpha': 1}
num_factors 20
regularization 0.0
alpha 1
{'num_factors': 20, 'regularization': 1, 'alpha': 1}
num_factors 20
regularization 1
alpha 1


# GridsearchCV

In [457]:
def cross_validation_implicit_gs(data, algorithm, param_grid):
    df_list = []


    keys, values = zip(*param_grid.items())
    for c,v in enumerate(itertools.product(*values)):
        
        #############
        # This for loop produces every possible combination of the hyperparameters within a dictonary
        #############
        params = dict(zip(keys, v))
        this_model = copy.deepcopy(algorithm)
        df = pd.DataFrame()
        df = pd.DataFrame(params, index =  [c])


        for k, v in params.items():
            #############
            # This loop unpacks the diconary by each parameter and initiates the model for each hpyerparameter of the corresponding dictonary
            #############
            setattr(this_model, k, v)

        #############
        # lists to store the results after each train_test_split under different random seed (c.p)
        #############
        p_train_test_results = []
        map_train_test_results = []
        ndcg_train_test_results = []

        for r_seed in range(0,5):
            #############
            # Initiate the train_test_split
            #############
            train_mat, test_mat = implicit.evaluation.train_test_split(data, train_percentage =  0.8, random_state = r_seed)
            this_model.fit(train_mat)

            #############
            # measure evaluation metrixs for each possible hyperparameter combination
            #############
            p_at_k =  precision_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)
            map_at_k = mean_average_precision_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)
            var_ndcg_at_k = ndcg_at_k(this_model, train_user_items=train_mat, test_user_items=test_mat, K=5)

            #############
            # To list
            #############
            p_train_test_results.append(p_at_k)
            map_train_test_results.append(map_at_k)
            ndcg_train_test_results.append(var_ndcg_at_k)

        #############
        # Create columns to store the scores for each hyperparameter combination after 5 different train_test_splits
        #############
        df["precision_at_k"] = np.mean(p_train_test_results)
        df["map_at_k"]  = np.mean(map_train_test_results)
        df["var_ndcg_at_k"] = np.mean(ndcg_train_test_results)

        df_list.append(df)
    return pd.concat(df_list) 

In [460]:
als_grid = {'factors': [30,40],
              'regularization': [0.01,0.02],
              'iterations' : [15],
              'alpha': [1]
              }

In [461]:
a = cross_validation_implicit_gs(data_skills_csr, algorithm = implicit.als.AlternatingLeastSquares(random_state = 42),param_grid =  als_grid)
a

100%|██████████| 15/15 [00:01<00:00,  7.53it/s]
100%|██████████| 377/377 [00:00<00:00, 2856.06it/s]
100%|██████████| 377/377 [00:00<00:00, 2751.86it/s]
100%|██████████| 377/377 [00:00<00:00, 3065.31it/s]
100%|██████████| 15/15 [00:02<00:00,  5.20it/s]
100%|██████████| 378/378 [00:00<00:00, 1488.17it/s]
100%|██████████| 378/378 [00:00<00:00, 1431.81it/s]
100%|██████████| 378/378 [00:00<00:00, 2643.30it/s]
100%|██████████| 15/15 [00:02<00:00,  6.59it/s]
100%|██████████| 367/367 [00:00<00:00, 2659.47it/s]
100%|██████████| 367/367 [00:00<00:00, 2237.79it/s]
100%|██████████| 367/367 [00:00<00:00, 2016.67it/s]
100%|██████████| 15/15 [00:02<00:00,  5.94it/s]
100%|██████████| 379/379 [00:00<00:00, 2493.40it/s]
100%|██████████| 379/379 [00:00<00:00, 2178.14it/s]
100%|██████████| 379/379 [00:00<00:00, 2595.92it/s]
100%|██████████| 15/15 [00:02<00:00,  5.14it/s]
100%|██████████| 374/374 [00:00<00:00, 2308.12it/s]
100%|██████████| 374/374 [00:00<00:00, 2460.54it/s]
100%|██████████| 374/374 [00:00<

Unnamed: 0,factors,regularization,iterations,alpha,precision_at_k,map_at_k,var_ndcg_at_k
0,30,0.01,15,1,0.471631,0.351482,0.464644
1,30,0.02,15,1,0.471748,0.351666,0.464781
2,40,0.01,15,1,0.4314,0.311605,0.426909
3,40,0.02,15,1,0.430695,0.311542,0.426497


### Single setup validation

In [456]:
model_als = implicit.als.AlternatingLeastSquares(factors = 30, random_state= 42, alpha= 1, regularization= 0.01, iterations= 15)

ndcg_train_test_results = []
for r_seed in range(0,5):
    train_mat, test_mat = implicit.evaluation.train_test_split(data_skills_csr, train_percentage =  0.8, random_state = r_seed)
    model_als.fit(train_mat)


    #############
    # measure evaluation metrixs for each possible hyperparameter combination
    #############
    var_ndcg_at_k = ndcg_at_k(model_als, train_user_items=train_mat, test_user_items=test_mat, K=5)
    ndcg_train_test_results.append(var_ndcg_at_k)

print(np.mean(ndcg_train_test_results))
    

100%|██████████| 15/15 [00:02<00:00,  5.92it/s]
100%|██████████| 377/377 [00:00<00:00, 2673.81it/s]
100%|██████████| 15/15 [00:02<00:00,  6.09it/s]
100%|██████████| 378/378 [00:00<00:00, 3123.63it/s]
100%|██████████| 15/15 [00:02<00:00,  6.80it/s]
100%|██████████| 367/367 [00:00<00:00, 2759.42it/s]
100%|██████████| 15/15 [00:02<00:00,  6.87it/s]
100%|██████████| 379/379 [00:00<00:00, 2937.94it/s]
100%|██████████| 15/15 [00:02<00:00,  6.49it/s]
100%|██████████| 374/374 [00:00<00:00, 2877.13it/s]

0.46464433423963697



