## Implicit Library

In [222]:
import implicit
import h5py
import pandas as pd
import numpy as np
import random
import statistics
from implicit.evaluation import train_test_split
from sklearn.metrics import ndcg_score
from numpy.random import permutation
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)
import scipy
from scipy.sparse import csr_matrix
import scipy.sparse
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import precision_at_k,mean_average_precision_at_k, ndcg_at_k
from sklearn.model_selection import KFold
from random import shuffle


## Preprocessing

In [2]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [3]:
data = df

In [4]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [5]:
data = df

In [6]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [7]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Normalizing the rows

In [8]:
data_skills  = matrix
data_skills 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In the next step we will compute the item-item relationships of our skills. Our final goal here is to construct a new item by item matrix containing the weights (relationships) between each of our skills where a perfect correlation equals 1 and no correlation at all equals 0.
<br>
In order to do so, we will first normalize the user vectors. The idea behind this approach is, that a user with many skills contributes less to any individual skill. For example if a user, that rules only 3 skills, rules a skill X it is more valueable than if a user, that rules 20 skills, that particular skill.


* First we caclulat the magnitude for every user

In [9]:
# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_skills).sum(axis=1))

In [10]:
magnitude

0       8.774964
1       8.602325
2       5.567764
3       5.567764
4       1.000000
         ...    
399     2.828427
400     4.358899
401     3.000000
402     2.645751
403    12.206556
Length: 404, dtype: float64

* Now we use a users magnitude to normalize the ratings of this corresponding user

In [11]:
data_skills_row_norm=  data_skills.divide(magnitude, axis='index')

In [12]:
data_skills_row_norm

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,0.116248,0.000000,0.000000,0.116248,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,0.179605,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.179605,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,0.377964,0.377964,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [27]:
data_skills_row_norm_csr = scipy.sparse.csr_matrix(data_skills_row_norm.values)


### Bm25

In [290]:
data_skills

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
data_skills_csr = scipy.sparse.csr_matrix(data_skills.values)

In [19]:
data_skills_csr_weighted = implicit.nearest_neighbours.bm25_weight(data_skills_csr, K1=100, B=0.8)

In [25]:
pd.DataFrame.sparse.from_spmatrix(data_skills_csr_weighted, columns=data_skills.columns)

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,1.302587,0.000000,0.000000,2.37719,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,3.378903,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,2.089198,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,11.987697,5.146573,0.000000,0.000000,0.00000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [13]:
df = df.drop(columns=["category"])

In [14]:
data = df
data.head(5)

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS


## Baseline

In [None]:
most_common_skills = matrix.sum(axis=0).nlargest(5)	
for i in most_common_skills.index:
    print(i)

Englisch
MySQL
SCRUM
GIT
HTML


In [None]:
p_at_5_baseline_list  = []
for i in range(0,len(matrix)):
    user_series = matrix.loc[i]
    user_know_skills = user_series[user_series > 0]
    user_known_skills_vs_most_common = user_know_skills.index.isin(most_common_skills.index)
    matches = user_known_skills_vs_most_common.sum()
    results =matches/5
    p_at_5_baseline_list.append(results)
 
p_at_5_baseline = statistics.mean(p_at_5_baseline_list)
p_at_5_baseline


0.6891089108910892

Wie aussagekräftig ist p at k ? Ein recommender, der immmer die häufigsten Skills vorschlägt ist nicht zielführend, hat haben einen hohen p_at_5 score

# Testing different algorithms

## AlternatingLeastSquares

In [None]:
folds = 4

results = pd.DataFrame()
def cross_validation_implicit_als(data):
    results_over_all = pd.DataFrame(columns=["factors","alpha","regularization","iterations","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [5,10,20,50,70]
    alpha_list = [0.5,1.0, 1.5]
    regularization_list = [0.005,0.01,0.02]
    iterations_list =  [20,30, 50,70]

    for i in iterations_list:
        for f in factor_list:
            for a in alpha_list:
                for r in regularization_list:
                    avg_p_at_k_list = []
                    avg_map_at_k_list = []
                    avg_ndcg_at_k_list = []

                    #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                    # However p_at_k is not affected by the shuffle and stays consistent.
                    skf = KFold(n_splits= folds , shuffle=False) 

                    for r_seed in range(0,5):
                        # Lists to store the results for each random_seed
                        p_at_k_list = []
                        map_at_k_list = []
                        ndcg_at_k_list = []
                        model = implicit.als.AlternatingLeastSquares(factors = f, random_state= r_seed, alpha= a, regularization= r, iterations= i)

                        ###########################
                        # Split the data
                        ###########################
                        for train_index, test_index in skf.split(data):
                            X_train = data[train_index]
                            X_test = data[test_index]
                            
                            ###########################
                            # Shuffle manually
                            ###########################

                            index = np.arange(np.shape(X_train)[0])
                            np.random.RandomState(seed = 1).shuffle(index)
                            
                            X_train = X_train[index, :] 
                            model.fit(X_train)

                            #############
                            # Fifth Step: We predict on the scaled test data
                            #############
                            p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                            map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                            var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                            # results to list
                            p_at_k_list.append(p_at_k)
                            map_at_k_list.append(map_at_k) 
                            ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                        # store the results for each random_seed
                        # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                        # Store the averages per random_seed in list. 
                        avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                        avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                        avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                    
                    row = {
                        "factors": f,
                        "alpha": a,
                        "regularization": r,
                        "iterations": i,
                        "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                        "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                        "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                        }
                    results_per_run = pd.DataFrame([row])
                    results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all
                
              #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


SyntaxError: invalid syntax (1449821823.py, line 72)

In [None]:
results_of_als_grid = cross_validation_implicit_als(data_skills_row_norm_csr)
results_of_als_grid

NameError: name 'cross_validation_implicit' is not defined

### Results to csv

In [None]:
results_of_als_grid.to_csv("results_of_als_grid.csv", index=False)

### Best Params

In [None]:
results_of_als_grid.p_at_k_mean.idxmax()
results_of_als_grid.iloc[results_of_als_grid.p_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_als_grid.iloc[results_of_als_grid.map_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_als_grid.iloc[results_of_als_grid.ndcg_at_k_mean.idxmax()]

factors                 50
epochs                  50
regularization        0.02
p_at_k_mean       0.292184
map_at_k_mean     0.203665
ndcg_at_k_mean    0.275885
Name: 44, dtype: object

## BayesianPersonalizedRanking

In [None]:

folds = 4

results = pd.DataFrame()
def cross_validation_implicit_bpr(data):
    results_over_all = pd.DataFrame(columns=["factors","regularization","iterations","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [5,10,20,50,70]
    iteration_list = [20,30, 50,70]
    regularization_list = [0.005,0.01,0.02]


    for f in factor_list:
        for i in iteration_list:
            for r in regularization_list:
                avg_p_at_k_list = []
                avg_map_at_k_list = []
                avg_ndcg_at_k_list = []

                #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                # However p_at_k is not affected by the shuffle and stays consistent.
                skf = KFold(n_splits= folds , shuffle=False) 

                for r_seed in range(0,5):
                    # Lists to store the results for each random_seed
                    p_at_k_list = []
                    map_at_k_list = []
                    ndcg_at_k_list = []
                    model = implicit.bpr.BayesianPersonalizedRanking(factors = f, random_state= r_seed, regularization= r, iterations= i)

                    ###########################
                    # Split the data
                    ###########################
                    for train_index, test_index in skf.split(data):
                        X_train = data[train_index]
                        X_test = data[test_index]
                        
                        ###########################
                        # Shuffle manually
                        ###########################

                        index = np.arange(np.shape(X_train)[0])
                        np.random.RandomState(seed = 1).shuffle(index)
                        
                        X_train = X_train[index, :] 
                        model.fit(X_train)

                        #############
                        # Fifth Step: We predict on the scaled test data
                        #############
                        p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                        # results to list
                        p_at_k_list.append(p_at_k)
                        map_at_k_list.append(map_at_k) 
                        ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                    # store the results for each random_seed
                    # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                    # Store the averages per random_seed in list. 
                    avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                    avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                    avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                
                row = {
                    "factors": f,
                    "regularization": r,
                    "iterations": i,
                    "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                    "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                    "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                    }
                results_per_run = pd.DataFrame([row])
                results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all
                
                #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


In [None]:
results_of_bpr_grid = cross_validation_implicit_bpr(data_skills_row_norm_csr)
results_of_bpr_grid

100%|██████████| 100/100 [00:00<00:00, 106.31it/s, train_auc=75.46%, skipped=39.44%]
100%|██████████| 101/101 [00:00<00:00, 3050.51it/s]
100%|██████████| 101/101 [00:00<00:00, 2589.08it/s]
100%|██████████| 101/101 [00:00<00:00, 2678.05it/s]
100%|██████████| 100/100 [00:01<00:00, 89.78it/s, train_auc=79.00%, skipped=39.19%]
100%|██████████| 101/101 [00:00<00:00, 2936.78it/s]
100%|██████████| 101/101 [00:00<00:00, 2340.87it/s]
100%|██████████| 101/101 [00:00<00:00, 3054.30it/s]
100%|██████████| 100/100 [00:00<00:00, 119.20it/s, train_auc=78.19%, skipped=40.09%]
100%|██████████| 101/101 [00:00<00:00, 2941.16it/s]
100%|██████████| 101/101 [00:00<00:00, 2657.74it/s]
100%|██████████| 101/101 [00:00<00:00, 2494.11it/s]
100%|██████████| 100/100 [00:01<00:00, 92.10it/s, train_auc=79.09%, skipped=38.65%]
100%|██████████| 101/101 [00:00<00:00, 3323.46it/s]
100%|██████████| 101/101 [00:00<00:00, 3151.83it/s]
100%|██████████| 101/101 [00:00<00:00, 2920.32it/s]
100%|██████████| 100/100 [00:00<00:00,

Unnamed: 0,factors,regularization,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,1,0.005,0.424007,0.344604,0.420083
1,1,0.01,0.423791,0.344165,0.42029
2,1,0.02,0.424949,0.344927,0.421517
3,5,0.005,0.254302,0.186533,0.253958
4,5,0.01,0.25388,0.185932,0.253187
5,5,0.02,0.260535,0.192172,0.259911
6,10,0.005,0.225367,0.159818,0.225753
7,10,0.01,0.223653,0.160471,0.225264
8,10,0.02,0.233774,0.168269,0.234268
9,20,0.005,0.208391,0.144956,0.206501


### Results to Csv

In [None]:
results_of_bpr_grid.to_csv("results_of_bpr_grid.csv", index=False)

### Best Params

In [None]:
results_of_bpr_grid.p_at_k_mean.idxmax()
results_of_bpr_grid.iloc[results_of_bpr_grid.p_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_bpr_grid.iloc[results_of_bpr_grid.map_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_bpr_grid.iloc[results_of_bpr_grid.ndcg_at_k_mean.idxmax()]

factors                 50
epochs                  50
regularization        0.02
p_at_k_mean       0.292184
map_at_k_mean     0.203665
ndcg_at_k_mean    0.275885
Name: 44, dtype: object

## LogisticMatrixFactorization

In [None]:

folds = 4

results = pd.DataFrame()
def cross_validation_implicit_lmf(data):
    results_over_all = pd.DataFrame(columns=["factors","epochs","regularization","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [5,10,20,50,70]
    iteration_list = [20,30, 50,70]
    regularization_list = [0.005,0.01,0.02]


    for f in factor_list:
        for i in iteration_list:
            for r in regularization_list:
                avg_p_at_k_list = []
                avg_map_at_k_list = []
                avg_ndcg_at_k_list = []

                #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                # However p_at_k is not affected by the shuffle and stays consistent.
                skf = KFold(n_splits= folds , shuffle=False) 

                for r_seed in range(0,5):
                    # Lists to store the results for each random_seed
                    p_at_k_list = []
                    map_at_k_list = []
                    ndcg_at_k_list = []
                    model = implicit.lmf.LogisticMatrixFactorization(factors = f, random_state= r_seed, iterations = i, regularization= r)

                    ###########################
                    # Split the data
                    ###########################
                    for train_index, test_index in skf.split(data):
                        X_train = data[train_index]
                        X_test = data[test_index]
                        
                        ###########################
                        # Shuffle manually
                        ###########################

                        index = np.arange(np.shape(X_train)[0])
                        np.random.RandomState(seed = 1).shuffle(index)
                        
                        X_train = X_train[index, :] 
                        model.fit(X_train)

                        #############
                        # Fifth Step: We predict on the scaled test data
                        #############
                        p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                        # results to list
                        p_at_k_list.append(p_at_k)
                        map_at_k_list.append(map_at_k) 
                        ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                    # store the results for each random_seed
                    # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                    # Store the averages per random_seed in list. 
                    avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                    avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                    avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                
                row = {
                    "factors": f,
                    "epochs": i,
                    "regularization": r,
                    "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                    "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                    "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                    }
                results_per_run = pd.DataFrame([row])
                results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all

In [None]:
results_of_lmf_grid = cross_validation_implicit_lmf(data_skills_row_norm_csr)
results_of_lmf_grid

100%|██████████| 20/20 [00:00<00:00, 66.67it/s] 
100%|██████████| 101/101 [00:00<00:00, 1074.41it/s]
100%|██████████| 101/101 [00:00<00:00, 1312.25it/s]
100%|██████████| 101/101 [00:00<00:00, 1442.85it/s]
100%|██████████| 20/20 [00:00<00:00, 150.38it/s]
100%|██████████| 101/101 [00:00<00:00, 2463.56it/s]
100%|██████████| 101/101 [00:00<00:00, 2244.29it/s]
100%|██████████| 101/101 [00:00<00:00, 2526.01it/s]
100%|██████████| 20/20 [00:00<00:00, 166.67it/s]
100%|██████████| 101/101 [00:00<00:00, 2657.62it/s]
100%|██████████| 101/101 [00:00<00:00, 2805.48it/s]
100%|██████████| 101/101 [00:00<00:00, 2348.92it/s]
100%|██████████| 20/20 [00:00<00:00, 170.94it/s]
100%|██████████| 101/101 [00:00<00:00, 2657.97it/s]
100%|██████████| 101/101 [00:00<00:00, 2730.05it/s]
100%|██████████| 101/101 [00:00<00:00, 3060.39it/s]
100%|██████████| 20/20 [00:00<00:00, 162.61it/s]
100%|██████████| 101/101 [00:00<00:00, 2807.04it/s]
100%|██████████| 101/101 [00:00<00:00, 2295.72it/s]
100%|██████████| 101/101 [0

Unnamed: 0,factors,epochs,regularization,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,5,20,0.005,0.211345,0.132021,0.198403
1,5,20,0.01,0.226556,0.145399,0.214212
2,5,20,0.02,0.23175,0.147755,0.218211
3,5,30,0.005,0.19601,0.125619,0.18524
4,5,30,0.01,0.198243,0.126151,0.186596
5,5,30,0.02,0.214959,0.136981,0.202376
6,5,50,0.005,0.161072,0.099616,0.148457
7,5,50,0.01,0.171202,0.110067,0.1617
8,5,50,0.02,0.189409,0.121717,0.177791
9,5,70,0.005,0.142384,0.090614,0.134679


In [None]:
results_of_lmf_grid.to_csv("results_of_lmf_grid.csv", index=False)

In [None]:
results_of_lmf_grid.dtypes

factors            object
epochs             object
regularization    float64
p_at_k_mean       float64
map_at_k_mean     float64
ndcg_at_k_mean    float64
dtype: object

In [None]:
results_of_lmf_grid.describe()

Unnamed: 0,regularization,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
count,60.0,60.0,60.0,60.0
mean,0.011667,0.21607,0.140182,0.201202
std,0.006289,0.049113,0.037761,0.046637
min,0.005,0.129559,0.080903,0.117947
25%,0.005,0.173136,0.112629,0.163186
50%,0.01,0.213593,0.134124,0.199056
75%,0.02,0.250388,0.160673,0.233339
max,0.02,0.294134,0.205017,0.275885


### Best Params

In [None]:
results_of_lmf_grid.p_at_k_mean.idxmax()
results_of_lmf_grid.iloc[results_of_lmf_grid.p_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_lmf_grid.iloc[results_of_lmf_grid.map_at_k_mean.idxmax()]

factors                 70
epochs                  70
regularization        0.02
p_at_k_mean       0.294134
map_at_k_mean     0.205017
ndcg_at_k_mean    0.274624
Name: 59, dtype: object

In [None]:
results_of_lmf_grid.iloc[results_of_lmf_grid.ndcg_at_k_mean.idxmax()]

factors                 50
epochs                  50
regularization        0.02
p_at_k_mean       0.292184
map_at_k_mean     0.203665
ndcg_at_k_mean    0.275885
Name: 44, dtype: object

In [None]:
# train the model on a sparse matrix of user/item/confidence weights
train, test = train_test_split(data_skills_row_norm_csr, train_percentage =  0.8, random_state = 1)
model = implicit.lmf.LogisticMatrixFactorization(factors = 70, random_state=1,iterations = 50, regularization= 0.02)
model.fit(train)


100%|██████████| 50/50 [00:04<00:00, 10.86it/s]


In [None]:
test?

[1;31mType:[0m        csr_matrix
[1;31mString form:[0m
(0, 165)	0.11396057645963795
           (0, 242)	0.11396057645963795
           (0, 243)	0.11396057645963795
           (0 <...>  (403, 629)	0.08192319205190406
           (403, 643)	0.08192319205190406
           (403, 652)	0.08192319205190406
[1;31mFile:[0m        c:\users\maximus\envs\ds_ap\lib\site-packages\scipy\sparse\_csr.py
[1;31mDocstring:[0m  
Compressed Sparse Row matrix

This can be instantiated in several ways:
    csr_matrix(D)
        with a dense matrix or rank-2 ndarray D

    csr_matrix(S)
        with another sparse matrix S (equivalent to S.tocsr())

    csr_matrix((M, N), [dtype])
        to construct an empty matrix with shape (M, N)
        dtype is optional, defaulting to dtype='d'.

    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
        where ``data``, ``row_ind`` and ``col_ind`` satisfy the
        relationship ``a[row_ind[k], col_ind[k]] = data[k]``.

    csr_matrix((data, indices, ind

In [None]:
matrix.iloc[0].sum()


77

In [None]:
user_id  = 0
skill_ids, scores = model.recommend(user_id, data_skills_row_norm_csr[user_id], N = 10 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,OpenOffice / LibreOffice,0.580505,False
1,Subversion,0.305606,True
2,PowerShell,0.297782,False
3,JSON,0.220934,False
4,Windows NT/2000/XP,0.176941,True
5,Spanisch,0.072854,False
6,Wasserfallmodel,-0.120552,False
7,Android,-0.179944,True
8,Windows 10,-0.21522,True
9,JavaScript,-0.330638,True


In [None]:
recommendations_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,already_liked,category
0,OpenOffice / LibreOffice,0.580505,False,Produkterfahrung
1,Subversion,0.305606,True,"CI/CD, Build- und Versionskontrollsysteme"
2,PowerShell,0.297782,False,Technik/Tools
3,JSON,0.220934,False,Webentwicklung
4,Windows NT/2000/XP,0.176941,True,Betriebssystem
5,Spanisch,0.072854,False,Sprachen
6,Wasserfallmodel,-0.120552,False,Projektmanagement / Vorgehensmodelle
7,Android,-0.179944,True,Betriebssystem
8,Windows 10,-0.21522,True,Betriebssystem
9,JavaScript,-0.330638,True,Programmiersprachen / Scriptsprachen


In [None]:
dict_user_skills = {}
for value, name in zip(data_skills_row_norm.iloc[userid,:], data_skills_row_norm.columns):
    if value > 0:
        dict_user_skills[name] = value
    else:
        next

dict_user_skills

{'Android': 0.23570226039551587,
 'Apache': 0.23570226039551587,
 'GIT': 0.23570226039551587,
 'JavaScript': 0.23570226039551587,
 'Joomla': 0.23570226039551587,
 'Microsoft SQL Server': 0.23570226039551587,
 'MySQL': 0.23570226039551587,
 'Oracle': 0.23570226039551587,
 'PHP': 0.23570226039551587,
 'PhpStorm': 0.23570226039551587,
 'Pimcore': 0.23570226039551587,
 'SUSE Linux': 0.23570226039551587,
 'Subversion': 0.23570226039551587,
 'Symfony': 0.23570226039551587,
 'Ubuntu': 0.23570226039551587,
 'Windows 10': 0.23570226039551587,
 'Windows 7': 0.23570226039551587,
 'Windows NT/2000/XP': 0.23570226039551587}

### Baseline

In [None]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors = 1,random_state=1,regularization= 0.005) 

In [None]:
# train the model on a sparse matrix of user/item/confidence weights
model.fit(data_skills_row_norm_csr)

100%|██████████| 15/15 [00:00<00:00, 43.00it/s]


In [None]:
userid= 13
dict_user_skills = {}
for value, name in zip(data_skills_row_norm.iloc[userid,:], data_skills_row_norm.columns):
    if value > 0:
        dict_user_skills[name] = value
    else:
        next

dict_user_skills

{'Android': 0.23570226039551587,
 'Apache': 0.23570226039551587,
 'GIT': 0.23570226039551587,
 'JavaScript': 0.23570226039551587,
 'Joomla': 0.23570226039551587,
 'Microsoft SQL Server': 0.23570226039551587,
 'MySQL': 0.23570226039551587,
 'Oracle': 0.23570226039551587,
 'PHP': 0.23570226039551587,
 'PhpStorm': 0.23570226039551587,
 'Pimcore': 0.23570226039551587,
 'SUSE Linux': 0.23570226039551587,
 'Subversion': 0.23570226039551587,
 'Symfony': 0.23570226039551587,
 'Ubuntu': 0.23570226039551587,
 'Windows 10': 0.23570226039551587,
 'Windows 7': 0.23570226039551587,
 'Windows NT/2000/XP': 0.23570226039551587}

### Create a df that holds the recommendations and the scores 
* Die Scores sind in der implicip library equivalent zu der confidence. Die Confidence gibt an, wie häufig ein item x konsumiert wurde. Allerdings soll die Intensität, mit der dieser Effekt steigt, sukkzessive abnehmen, um den Einfluss eines Superfans (jmd der überdurchschnittkich viel von Item x konsumiert) zu schwächen. Hier sind parallelen zu den Motiven der Row-Normalisation zu erkennen. <br>
Ich habe im ersten Lauf die Werte meiner row-normalisation als confidence werte verwendet. <br>
    * Sollte nach angepasst werden. Hierzu: https://en.wikipedia.org/wiki/Okapi_BM25

Maybe it makes sense <br>
"
Confidence can be defined as the worth or the value we give to the interaction. For User A buying(a transaction event) item X we increase the interaction weight, while User A viewing item Z has lesser weight than the ‘interaction of buying’. https://towardsdatascience.com/alternating-least-square-for-implicit-dataset-with-code-8e7999277f4b " 

In [None]:
skill_ids, scores  = model.recommend(userid, data_skills_row_norm_csr[userid], N = 20 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,Englisch,0.109891,False
1,MySQL,0.108704,True
2,HTML,0.081065,False
3,JavaScript,0.072065,True
4,SQL,0.071913,False
5,GIT,0.06957,True
6,SCRUM,0.06155,False
7,Test Driven Development (TDD),0.056983,False
8,Java,0.056517,False
9,JIRA,0.056304,False


In [None]:
skill_and_categories = df[["skill","category"]].drop_duplicates()

KeyError: "['category'] not in index"

In [None]:
recommendations_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,already_liked,category
0,MySQL,0.44042,True,Datenbank
1,PHP,0.422259,True,Webentwicklung
2,Windows NT/2000/XP,0.38084,True,Betriebssystem
3,Apache,0.380116,True,Applikationsserver
4,TYPO3 Generalist,0.21658,False,TYPO3
5,PhpStorm,0.204278,True,Umgebungen
6,Windows 7,0.195201,True,Betriebssystem
7,Java,0.185855,False,Programmiersprachen / Scriptsprachen
8,Englisch,0.185376,False,Sprachen
9,C/C++,0.167171,False,Programmiersprachen / Scriptsprachen


In [None]:
# Id and skills
dict_skills_id = {}
for c,skill in enumerate(data_skills_row_norm.columns):
        dict_skills_id[c] = skill

dict_skills_id

{0: '.NET Compact Framework',
 1: '.NET Core',
 2: '.NET Framework',
 3: '3D-Modellierung',
 4: 'ABAP',
 5: 'ADO.NET',
 6: 'AIX',
 7: 'ARIS',
 8: 'ARIS ITArchitect',
 9: 'AS400',
 10: 'ASP Generalist',
 11: 'ASP.NET',
 12: 'ASP.NET MVC-Framework',
 13: 'ASP.NET WebAPI',
 14: 'AWS',
 15: 'AWS Lambda',
 16: 'Abstract',
 17: 'Access',
 18: 'Accessibility / WCAG',
 19: 'Active Directory',
 20: 'ActiveX',
 21: 'Adobe CC',
 22: 'Adobe Flash',
 23: 'Adobe Illustrator',
 24: 'Adobe InDesign',
 25: 'Adobe Indesign',
 26: 'Adobe Photoshop',
 27: 'Adobe Premiere',
 28: 'Adobe XD',
 29: 'After Effects',
 30: 'Agile Methoden',
 31: 'Ajax',
 32: 'Alexa-Skills',
 33: 'Alpine',
 34: 'Analytics',
 35: 'Android',
 36: 'Android Studio',
 37: 'Anforderungsanalyse',
 38: 'Anforderungsmanagement',
 39: 'Angular (2 und höher)',
 40: 'Angular Generalist',
 41: 'Angular Material',
 42: 'Angular Theming',
 43: 'AngularJS',
 44: 'Animations (transition, @keyframes)',
 45: 'Ansible',
 46: 'Ant',
 47: 'Apache',
 4

In [None]:
keys = [k for k, v in dict_skills_id.items() if v == 'Python']
print(keys)

[458]


### Display simular items

In [None]:
# find related item
skill_id, sim = model.similar_items(0, N=10)
# display the results using pandas for nicer formatting
simularity_df  = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_id], "score": sim})
simularity_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,category
0,.NET Compact Framework,1.0,.NET Frameworks und Tools
1,ADO.NET,0.674586,.NET Frameworks und Tools
2,DevExpress,0.635214,.NET Frameworks und Tools
3,Silverlight,0.632719,.NET Frameworks und Tools
4,Microsoft Enterprise Library,0.632117,.NET Frameworks und Tools
5,ASP Generalist,0.612088,.NET Frameworks und Tools
6,Windows Communication Foundation (WCF),0.599716,.NET Frameworks und Tools
7,Mono,0.578717,.NET Frameworks und Tools
8,Windows Forms,0.566209,.NET Frameworks und Tools
9,Exact,0.548891,ERP


# Add a Evaluation metric to the library

### APK
http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [None]:
###
# Recommendations for user_id = 13
###
recommendations_list = list(recommendations_df.skill)
known_skills_list = list(dict_user_skills.keys())
len(known_skills_list)

18

In [None]:
apk(known_skills_list,recommendations_list,k=20)

0.4013227513227513

## Train-Test split

In [47]:
# train the model on a sparse matrix of user/item/confidence weights
train, test = train_test_split(data_skills_csr_weighted, train_percentage =  0.8, random_state = 1)
model = implicit.als.AlternatingLeastSquares(factors = 70, random_state=1, regularization= 0.02,iterations = 70 )
model.fit(train)

100%|██████████| 70/70 [00:06<00:00, 10.86it/s]


In [34]:
test[0]

<1x735 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [35]:
matrix.iloc[0].sum()

77

In [None]:
user_id  = 3
skill_ids, scores = model.recommend(0, data_skills_row_norm_csr[user_id], N = 4 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,Englisch,0.436307,False
1,MySQL,0.431594,True
2,HTML,0.321858,False
3,JavaScript,0.286125,True


In [None]:
most_common_skills = matrix.sum(axis=0).nlargest(10)	
for i in most_common_skills.index:
    print(i)

Englisch
MySQL
SCRUM
GIT
HTML
JavaScript
JIRA
CSS
SQL
Java


In [36]:
results = precision_at_k(model, train_user_items=train, test_user_items=test, K=4)
results

100%|██████████| 378/378 [00:00<00:00, 4111.21it/s]


0.3347670250896057

In [37]:
results = mean_average_precision_at_k(model, train_user_items=train, test_user_items=test, K=5)
results

100%|██████████| 378/378 [00:00<00:00, 6003.73it/s]


0.21753747795414485

In [None]:
mean_average_precision_at_k?

[1;31mDocstring:[0m
mean_average_precision_at_k(model, train_user_items, test_user_items, int K=10, show_progress=True, int num_threads=1)
Calculates MAP@K for a given trained model

   Parameters
   ----------
   model : RecommenderBase
       The fitted recommendation model to test
   train_user_items : csr_matrix
       Sparse matrix of user by item that contains elements that were used in training the model
   test_user_items : csr_matrix
       Sparse matrix of user by item that contains withheld elements to test on
   K : int
       Number of items to test on
   show_progress : bool, optional
       Whether to show a progress bar
   num_threads : int, optional
       The number of threads to use for testing. Specifying 0 means to default
       to the number of cores on the machine. Note: aside from the ALS and BPR
       models, setting this to more than 1 will likely hurt performance rather than
       help.

   Returns
   -------
   float
       the calculated MAP@k
   
[1;

### Notiz
* COnfidence durch Multiplikation erhöhen
* Beste model testen und anhand eines users ausm test set nachvollziehen

The model was fitted on the whole dataset.
Try to do a train test / cross valdiation later on.
* see: https://gist.github.com/jbochi/2e8ddcc5939e70e5368326aa034a144e
* evaluation of the implicit lib: https://github.com/benfred/implicit/blob/main/implicit/evaluation.pyx

In [42]:
data  = data.rename(columns = {'person' : 'user' ,"skill":"repo"})

data['user'] = data['user'].astype("category")
data['repo'] = data['repo'].astype("category")



In [129]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
from implicit.utils import nonzeros

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class UtilityMatrixTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, confidence=40):
        self.confidence = confidence

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return coo_matrix((np.ones(X.shape[0]),
                           (X['repo'].cat.codes.copy(),
                            X['user'].cat.codes.copy()))) * self.confidence






In [135]:
class ALSEstimator(BaseEstimator, TransformerMixin):
    def __init__(self, factors=50,
                       regularization=0.01,
                       iterations=10,
                       filter_seen=True):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.filter_seen = filter_seen

    def fit(self, X, y=None):
        self.model = AlternatingLeastSquares(factors=self.factors,
                                             regularization=self.regularization,
                                             iterations=self.iterations,
                                             dtype=np.float32,
                                             use_native=True)
        self.model.fit(X)
        if self.filter_seen:
            self.fit_X = X
        return self
    
    def test(self,X):
        print("test")
    # def predict(self, X, y=None):
    #     # predictions = np.dot(self.model.item_factors, self.model.user_factors.T)
    #     # if self.filter_seen:
    #     #     predictions[self.fit_X.nonzero()] = -99
    #     return self.X #model.item_factors #predictions

In [None]:
#https://gist.github.com/mblondel/7337391

def dcg_score(y_true, y_score, k=10, gains="exponential"):
    """Discounted cumulative gain (DCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    DCG @k : float
    """
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    if gains == "exponential":
        gains = 2 ** y_true - 1
    elif gains == "linear":
        gains = y_true
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)


def ndcg_score(y_true, y_score, k=10, gains="exponential"):
    """Normalized discounted cumulative gain (NDCG) at rank k
    Parameters
    ----------
    y_true : array-like, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array-like, shape = [n_samples]
        Predicted scores.
    k : int
        Rank.
    gains : str
        Whether gains should be "exponential" (default) or "linear".
    Returns
    -------
    NDCG @k : float
    """
    best = dcg_score(y_true, y_true, k, gains)
    if best == 0:
        return 0    
    actual = dcg_score(y_true, y_score, k, gains)
    return actual / best

def get_col(Y, col):
    return np.squeeze(np.asarray(Y[:,col]))

def ndcg_score_matrix(Y_true, Y_score, k=10, gains="exponential"):
    score = 0.0
    n_users = Y_true.shape[1]
    for u in range(n_users):
        s = ndcg_score(get_col(Y_true, u), get_col(Y_score, u))
        score += s
    return score / n_users

from sklearn.model_selection import PredefinedSplit

class LeavePOutByGroup():
    def __init__(self, X, p=5, n_splits=2):
        self.X = X
        self.p = p
        self.n_splits = n_splits
        test_fold = self.X.groupby("user").cumcount().apply(lambda x: int(x / p) if x < (n_splits * p) else -1)
        self.s = PredefinedSplit(test_fold)

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X=None, y=None, groups=None):
        return self.s.split()

def ndcg_scorer(estimator, X_test):
    truth = UtilityMatrixTransformer(confidence=1).fit_transform(X_test).todense()
    predictions = estimator.predict(X_test)
    return ndcg_score_matrix(truth, predictions, k=10)

from sklearn.model_selection import cross_val_score

from sklearn.model_selection import GridSearchCV



In [66]:
test

<404x735 sparse matrix of type '<class 'numpy.float64'>'
	with 4828 stored elements in Compressed Sparse Row format>

In [67]:
from scipy.sparse import coo_matrix

In [69]:
model = ALSEstimator()

In [71]:
model

In [118]:
model.fit(X =data_skills_csr_weighted)

100%|██████████| 10/10 [00:00<00:00, 10.34it/s]


In [278]:
class ALSEstimator9(BaseEstimator, TransformerMixin):
    def __init__(self, factors=50,
                       regularization=0.01,
                       iterations=10,
                       filter_seen=True):
        self.factors = factors
        self.regularization = regularization
        self.iterations = iterations
        self.filter_seen = filter_seen

    def fit(self, X, y=None):
        self.model = implicit.als.AlternatingLeastSquares(factors=self.factors,
                                             regularization=self.regularization,
                                             iterations=self.iterations,
                                             dtype=np.float32,
                                             use_native=True)
        self.model.fit(X)
        if self.filter_seen:
            self.fit_X = X
        return self
    
    def test(self,X):
        print("test 7")

    def predict(self, X, y=None):
        predictions = np.dot(self.model.item_factors, self.model.user_factors.T)
        try:
            if self.filter_seen:
                predictions[self.fit_X.nonzero()] = -99
        except:
            pass
        return predictions 

In [236]:
def ndcg_scorer(estimator, X_test):
    truth = UtilityMatrixTransformer(confidence=1).fit_transform(X_test).todense()
    predictions = estimator.predict(X_test)
    return ndcg_score_matrix(truth, predictions, k=10)

In [279]:
model = ALSEstimator9()

In [282]:
model.fit(X =data_skills_csr_weighted)

100%|██████████| 10/10 [00:00<00:00, 19.78it/s]


In [283]:
model.test(X =data_skills_csr_weighted)

test 7


In [284]:
truev = UtilityMatrixTransformer(confidence=1).fit_transform(data).todense()
truev

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [285]:
model.predict(truev)

array([[ 0.03091457,  0.14247568,  0.03703462, ...,  0.08758902,
         0.18396544,  0.06955931],
       [ 0.10884745,  0.3343123 , -0.06991565, ...,  0.0751308 ,
         0.6773356 , -0.22305462],
       [ 0.19654448,  1.0405846 ,  0.13063546, ...,  0.02456833,
         0.8063447 , -0.24396434],
       ...,
       [-0.04881241,  0.05401535,  0.00422441, ...,  0.17349924,
         0.05439551, -0.12875067],
       [ 0.23767798,  0.10038684,  0.59019375, ..., -0.10556521,
        -0.11631756,  0.20595044],
       [-0.16228567, -0.15548603,  0.03777868, ..., -0.08817779,
        -0.1232048 ,  0.208762  ]], dtype=float32)

In [286]:
ndcg_scorer(model, data)

ValueError: Only ('multilabel-indicator', 'continuous-multioutput', 'multiclass-multioutput') formats are supported. Got multiclass instead

In [156]:
data_skills_csr_weighted.tocsr()[0]

<1x735 sparse matrix of type '<class 'numpy.float64'>'
	with 77 stored elements in Compressed Sparse Row format>

In [57]:
UtilityMatrixTransformer(confidence=1).fit_transform(data).todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 1., 0.],
        [0., 1., 0., ..., 0., 1., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 1., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [287]:
rec_pipeline = Pipeline([
        ('matrix', UtilityMatrixTransformer()),
        ('als', ALSEstimator9()),
])

param_grid = [
    {
        'matrix__confidence': [1, 3, 10, 40, 100],
        'als__factors': [20, 50, 100],
        'als__regularization': [1e-2, 1e-3, 1e-4],
    }
]

shuffled_train_set = data.reindex(np.random.permutation(data.index)).sort_values("user")
grid_search = GridSearchCV(rec_pipeline, param_grid,
                           cv=4,
                           scoring=ndcg_scorer, verbose=1)
grid_search.fit(shuffled_train_set)



Fitting 4 folds for each of 45 candidates, totalling 180 fits


100%|██████████| 10/10 [00:00<00:00, 26.47it/s]
Traceback (most recent call last):
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\model_selection\_validation.py", line 765, in _score
    scores = scorer(estimator, X_test)
  File "C:\Users\Maxi\AppData\Local\Temp\ipykernel_17124\3684545755.py", line 4, in ndcg_scorer
    return ndcg_score_matrix(truth, predictions, k=10)
  File "C:\Users\Maxi\AppData\Local\Temp\ipykernel_17124\1169761609.py", line 111, in ndcg_score_matrix
    s = ndcg_score(get_col(Y_true, u), get_col(Y_score, u))
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\metrics\_ranking.py", line 1638, in ndcg_score
    check_consistent_length(y_true, y_score, sample_weight)
  File "c:\Users\Maxi\Desktop\virtual_env\ds_ap\lib\site-packages\sklearn\utils\validation.py", line 387, in check_consistent_length
    raise ValueError(
ValueError: Found input variables with inconsistent numbers of samples: [735, 404]

100%|██████████|

In [288]:
grid_search.best_params_



{'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 1}

In [289]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

nan {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 1}
nan {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 3}
nan {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 10}
nan {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 40}
nan {'als__factors': 20, 'als__regularization': 0.01, 'matrix__confidence': 100}
nan {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 1}
nan {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 3}
nan {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 10}
nan {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 40}
nan {'als__factors': 20, 'als__regularization': 0.001, 'matrix__confidence': 100}
nan {'als__factors': 20, 'als__regularization': 0.0001, 'matrix__confidence': 1}
nan {'als__factors': 20, 'als__regularization': 0.0001, 'matrix__confidence': 3}
nan {'als__factors': 20, 'als__regu

In [None]:
folds = 4

results = pd.DataFrame()
def cross_validation_implicit_als(data):
    results_over_all = pd.DataFrame(columns=["factors","regularization","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [20]
    regularization_list = [0.01]

    for f in factor_list:
        for r in regularization_list:
            avg_p_at_k_list = []
            avg_map_at_k_list = []
            avg_ndcg_at_k_list = []

            #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
            # However p_at_k is not affected by the shuffle and stays consistent.
            skf = KFold(n_splits= folds , shuffle=False) 

            for r_seed in range(0,5):
                # Lists to store the results for each random_seed
                p_at_k_list = []
                map_at_k_list = []
                ndcg_at_k_list = []
                model = implicit.als.AlternatingLeastSquares(factors = f, random_state= r_seed, regularization= r)

                ###########################
                # Split the data
                ###########################
                for train_index, test_index in skf.split(data):
                    X_train = data[train_index]
                    X_test = data[test_index]
                    
                    ###########################
                    # Shuffle manually
                    ###########################

                    index = np.arange(np.shape(X_train)[0])
                    np.random.RandomState(seed = 1).shuffle(index)
                    
                    X_train = X_train[index, :] 
                    model.fit(X_train)

                    #############
                    # Fifth Step: We predict on the scaled test data
                    #############
                    p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                    map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                    var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                    # results to list
                    p_at_k_list.append(p_at_k)
                    map_at_k_list.append(map_at_k) 
                    ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                # store the results for each random_seed
                # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                # Store the averages per random_seed in list. 
                avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

            
            row = {
                "factors": f,
                "regularization": r,
                "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                }
            results_per_run = pd.DataFrame([row])
            results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all
                
              #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


In [220]:
matrix_csr = scipy.sparse.csr_matrix(matrix.values)

In [None]:
cross_validation_implicit_als(matrix_csr)

100%|██████████| 15/15 [00:00<00:00, 29.49it/s]
100%|██████████| 81/81 [00:00<00:00, 2456.11it/s]
100%|██████████| 81/81 [00:00<00:00, 6234.99it/s]
100%|██████████| 81/81 [00:00<00:00, 6235.34it/s]
100%|██████████| 15/15 [00:00<00:00, 30.95it/s]
100%|██████████| 81/81 [00:00<00:00, 5403.57it/s]
100%|██████████| 81/81 [00:00<00:00, 5789.39it/s]
100%|██████████| 81/81 [00:00<00:00, 6234.99it/s]
100%|██████████| 15/15 [00:00<00:00, 28.32it/s]
100%|██████████| 81/81 [00:00<00:00, 6234.65it/s]
100%|██████████| 81/81 [00:00<00:00, 5789.09it/s]
100%|██████████| 81/81 [00:00<00:00, 4767.32it/s]
100%|██████████| 15/15 [00:00<00:00, 29.78it/s]
100%|██████████| 81/81 [00:00<00:00, 8105.23it/s]
100%|██████████| 81/81 [00:00<00:00, 6754.38it/s]
100%|██████████| 81/81 [00:00<00:00, 7368.64it/s]
100%|██████████| 15/15 [00:00<00:00, 28.32it/s]


ModelFitError: NaN encountered in factors