## Implicit Library

In [99]:
import implicit
import h5py
import pandas as pd
import numpy as np
import random
import statistics
from implicit.evaluation import train_test_split
from sklearn.metrics import ndcg_score
from numpy.random import permutation
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)
import scipy
from scipy.sparse import csr_matrix

## Preprocessing

In [2]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [3]:
data = df

In [4]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [5]:
data = df

In [6]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [7]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Normalizing the rows

In [8]:
data_skills  = matrix
data_skills 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In the next step we will compute the item-item relationships of our skills. Our final goal here is to construct a new item by item matrix containing the weights (relationships) between each of our skills where a perfect correlation equals 1 and no correlation at all equals 0.
<br>
In order to do so, we will first normalize the user vectors. The idea behind this approach is, that a user with many skills contributes less to any individual skill. For example if a user, that rules only 3 skills, rules a skill X it is more valueable than if a user, that rules 20 skills, that particular skill.


* First we caclulat the magnitude for every user

In [9]:
# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_skills).sum(axis=1))

In [10]:
magnitude

0       8.774964
1       8.602325
2       5.567764
3       5.567764
4       1.000000
         ...    
399     2.828427
400     4.358899
401     3.000000
402     2.645751
403    12.206556
Length: 404, dtype: float64

* Now we use a users magnitude to normalize the ratings of this corresponding user

In [11]:
data_skills_row_norm=  data_skills.divide(magnitude, axis='index')

In [12]:
data_skills_row_norm

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,0.116248,0.000000,0.000000,0.116248,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,0.179605,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.179605,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,0.377964,0.377964,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


### Create sparse Matrix representation for the implicit lib

In [13]:
data_skills_row_norm_csr = scipy.sparse.csr_matrix(data_skills_row_norm.values)


In [14]:
data_skills_row_norm_csr?

[1;31mType:[0m        csr_matrix
[1;31mString form:[0m
(0, 22)	0.11396057645963795
           (0, 30)	0.11396057645963795
           (0, 35)	0.11396057645963795
           (0, 4 <...>  (403, 713)	0.08192319205190406
           (403, 719)	0.08192319205190406
           (403, 730)	0.08192319205190406
[1;31mFile:[0m        c:\users\maximus\envs\ds_ap\lib\site-packages\scipy\sparse\_csr.py
[1;31mDocstring:[0m  
Compressed Sparse Row matrix

This can be instantiated in several ways:
    csr_matrix(D)
        with a dense matrix or rank-2 ndarray D

    csr_matrix(S)
        with another sparse matrix S (equivalent to S.tocsr())

    csr_matrix((M, N), [dtype])
        to construct an empty matrix with shape (M, N)
        dtype is optional, defaulting to dtype='d'.

    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
        where ``data``, ``row_ind`` and ``col_ind`` satisfy the
        relationship ``a[row_ind[k], col_ind[k]] = data[k]``.

    csr_matrix((data, indices, ind

In [15]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=64) 

In [100]:
# train the model on a sparse matrix of user/item/confidence weights
model.fit(data_skills_row_norm_csr)

100%|██████████| 15/15 [00:02<00:00,  6.74it/s]


In [18]:
userid= 13
dict_user_skills = {}
for value, name in zip(data_skills_row_norm.iloc[userid,:], data_skills_row_norm.columns):
    if value > 0:
        dict_user_skills[name] = value
    else:
        next

dict_user_skills

{'Android': 0.23570226039551587,
 'Apache': 0.23570226039551587,
 'GIT': 0.23570226039551587,
 'JavaScript': 0.23570226039551587,
 'Joomla': 0.23570226039551587,
 'Microsoft SQL Server': 0.23570226039551587,
 'MySQL': 0.23570226039551587,
 'Oracle': 0.23570226039551587,
 'PHP': 0.23570226039551587,
 'PhpStorm': 0.23570226039551587,
 'Pimcore': 0.23570226039551587,
 'SUSE Linux': 0.23570226039551587,
 'Subversion': 0.23570226039551587,
 'Symfony': 0.23570226039551587,
 'Ubuntu': 0.23570226039551587,
 'Windows 10': 0.23570226039551587,
 'Windows 7': 0.23570226039551587,
 'Windows NT/2000/XP': 0.23570226039551587}

### Create a df that holds the recommendations and the scores 
* Die Scores sind in der implicip library equivalent zu der confidence. Die Confidence gibt an, wie häufig ein item x konsumiert wurde. Allerdings soll die Intensität, mit der dieser Effekt steigt, sukkzessive abnehmen, um den Einfluss eines Superfans (jmd der überdurchschnittkich viel von Item x konsumiert) zu schwächen. Hier sind parallelen zu den Motiven der Row-Normalisation zu erkennen. <br>
Ich habe im ersten Lauf die Werte meiner row-normalisation als confidence werte verwendet. <br>
    * Sollte nach angepasst werden. Hierzu: https://en.wikipedia.org/wiki/Okapi_BM25

Maybe it makes sense <br>
"
Confidence can be defined as the worth or the value we give to the interaction. For User A buying(a transaction event) item X we increase the interaction weight, while User A viewing item Z has lesser weight than the ‘interaction of buying’. https://towardsdatascience.com/alternating-least-square-for-implicit-dataset-with-code-8e7999277f4b " 

In [101]:
skill_ids, scores  = model.recommend(userid, data_skills_row_norm_csr[userid], N = 20 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,PHP,1.228633,True
1,Windows 10,1.056569,True
2,Microsoft SQL Server,0.923457,True
3,PhpStorm,0.893593,True
4,Windows NT/2000/XP,0.796476,True
5,GIT,0.754838,True
6,Windows 7,0.742885,True
7,MySQL,0.707243,True
8,Apache,0.661934,True
9,Android,0.641628,True


In [20]:
skill_and_categories = df[["skill","category"]].drop_duplicates()

In [21]:
recommendations_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,already_liked,category
0,PHP,1.206353,True,Webentwicklung
1,Microsoft SQL Server,1.063125,True,Datenbank
2,Windows 10,1.034622,True,Betriebssystem
3,Windows NT/2000/XP,0.867914,True,Betriebssystem
4,MySQL,0.817383,True,Datenbank
5,PhpStorm,0.699708,True,Umgebungen
6,Android,0.675158,True,Betriebssystem
7,Windows 7,0.648864,True,Betriebssystem
8,Apache,0.573233,True,Applikationsserver
9,SUSE Linux,0.55121,True,Betriebssystem


In [22]:
# Id and skills
dict_skills_id = {}
for c,skill in enumerate(data_skills_row_norm.columns):
        dict_skills_id[c] = skill

dict_skills_id

{0: '.NET Compact Framework',
 1: '.NET Core',
 2: '.NET Framework',
 3: '3D-Modellierung',
 4: 'ABAP',
 5: 'ADO.NET',
 6: 'AIX',
 7: 'ARIS',
 8: 'ARIS ITArchitect',
 9: 'AS400',
 10: 'ASP Generalist',
 11: 'ASP.NET',
 12: 'ASP.NET MVC-Framework',
 13: 'ASP.NET WebAPI',
 14: 'AWS',
 15: 'AWS Lambda',
 16: 'Abstract',
 17: 'Access',
 18: 'Accessibility / WCAG',
 19: 'Active Directory',
 20: 'ActiveX',
 21: 'Adobe CC',
 22: 'Adobe Flash',
 23: 'Adobe Illustrator',
 24: 'Adobe InDesign',
 25: 'Adobe Indesign',
 26: 'Adobe Photoshop',
 27: 'Adobe Premiere',
 28: 'Adobe XD',
 29: 'After Effects',
 30: 'Agile Methoden',
 31: 'Ajax',
 32: 'Alexa-Skills',
 33: 'Alpine',
 34: 'Analytics',
 35: 'Android',
 36: 'Android Studio',
 37: 'Anforderungsanalyse',
 38: 'Anforderungsmanagement',
 39: 'Angular (2 und höher)',
 40: 'Angular Generalist',
 41: 'Angular Material',
 42: 'Angular Theming',
 43: 'AngularJS',
 44: 'Animations (transition, @keyframes)',
 45: 'Ansible',
 46: 'Ant',
 47: 'Apache',
 4

In [23]:
keys = [k for k, v in dict_skills_id.items() if v == 'Python']
print(keys)

[458]


### Display simular items

In [24]:
# find related item
skill_id, sim = model.similar_items(0, N=10)
# display the results using pandas for nicer formatting
simularity_df  = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_id], "score": sim})
simularity_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,category
0,.NET Compact Framework,1.0,.NET Frameworks und Tools
1,Microsoft Enterprise Library,0.649056,.NET Frameworks und Tools
2,ADO.NET,0.610234,.NET Frameworks und Tools
3,Silverlight,0.604521,.NET Frameworks und Tools
4,Windows Communication Foundation (WCF),0.600455,.NET Frameworks und Tools
5,DevExpress,0.600387,.NET Frameworks und Tools
6,ASP Generalist,0.58962,.NET Frameworks und Tools
7,AS400,0.562319,Betriebssystem
8,BizTalk,0.549183,.NET Frameworks und Tools
9,Mono,0.547131,.NET Frameworks und Tools


# Add a Evaluation metric to the library

### APK
http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html

In [25]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [26]:
###
# Recommendations for user_id = 13
###
recommendations_list = list(recommendations_df.skill)
known_skills_list = list(dict_user_skills.keys())
len(known_skills_list)

18

In [27]:
apk(known_skills_list,recommendations_list,k=20)

0.8888888888888888

## Train-Test split

In [103]:
# train the model on a sparse matrix of user/item/confidence weights
train, test = train_test_split(data_skills_row_norm_csr, train_percentage =  0.8, random_state = 1)
model.fit(train)

100%|██████████| 15/15 [00:02<00:00,  7.08it/s]


In [29]:
from implicit.evaluation import precision_at_k,mean_average_precision_at_k, ndcg_at_k

In [104]:
results = precision_at_k(model, train_user_items=train, test_user_items=test, K=4)
results

100%|██████████| 378/378 [00:00<00:00, 3230.85it/s]


0.25591397849462366

In [112]:
results = mean_average_precision_at_k(model, train_user_items=train, test_user_items=test, K=5)
results

100%|██████████| 378/378 [00:00<00:00, 3634.46it/s]


0.16117136978248114

In [32]:
mean_average_precision_at_k?

[1;31mDocstring:[0m
mean_average_precision_at_k(model, train_user_items, test_user_items, int K=10, show_progress=True, int num_threads=1)
Calculates MAP@K for a given trained model

   Parameters
   ----------
   model : RecommenderBase
       The fitted recommendation model to test
   train_user_items : csr_matrix
       Sparse matrix of user by item that contains elements that were used in training the model
   test_user_items : csr_matrix
       Sparse matrix of user by item that contains withheld elements to test on
   K : int
       Number of items to test on
   show_progress : bool, optional
       Whether to show a progress bar
   num_threads : int, optional
       The number of threads to use for testing. Specifying 0 means to default
       to the number of cores on the machine. Note: aside from the ALS and BPR
       models, setting this to more than 1 will likely hurt performance rather than
       help.

   Returns
   -------
   float
       the calculated MAP@k
   
[1;

## Crossvalidation

In [33]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [34]:
df = df.drop(columns=["category"])

In [35]:
data = df
data.head(5)

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS


In [36]:
NUM_TRIALS = 5
NUM_OUTER_SPLITS = 5

In [148]:
from sklearn.model_selection import cross_validate,KFold
from sklearn.metrics import make_scorer
from random import shuffle


In [221]:
folds = 4

results = pd.DataFrame()
def cross_validation_implicit(data):
    results_over_all = pd.DataFrame(columns=["factors","alpha","regularization","p_at_k_mean","map_at_k_mean","ndcg_at_k_mean"])
    factor_list = [1,5,10,15,25,50]
    alpha_list = [0.5,1.0, 1.5]
    regularization_list = [0.005,0.01,0.02]


    for f in factor_list:
        for a in alpha_list:
            for r in regularization_list:
                avg_p_at_k_list = []
                avg_map_at_k_list = []
                avg_ndcg_at_k_list = []

                #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
                # However p_at_k is not affected by the shuffle and stays consistent.
                skf = KFold(n_splits= folds , shuffle=False) 

                for r_seed in range(0,5):
                    # Lists to store the results for each random_seed
                    p_at_k_list = []
                    map_at_k_list = []
                    ndcg_at_k_list = []
                    model = implicit.als.AlternatingLeastSquares(factors = f, random_state= r_seed, alpha= a, regularization= r)

                    ###########################
                    # Split the data
                    ###########################
                    for train_index, test_index in skf.split(data):
                        X_train = data[train_index]
                        X_test = data[test_index]
                        
                        ###########################
                        # Shuffle manually
                        ###########################

                        index = np.arange(np.shape(X_train)[0])
                        np.random.RandomState(seed = 1).shuffle(index)
                        
                        X_train = X_train[index, :] 
                        model.fit(X_train)

                        #############
                        # Fifth Step: We predict on the scaled test data
                        #############
                        p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
                        var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

                        # results to list
                        p_at_k_list.append(p_at_k)
                        map_at_k_list.append(map_at_k) 
                        ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


                    # store the results for each random_seed
                    # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
                    # Store the averages per random_seed in list. 
                    avg_p_at_k_list.append(statistics.mean(p_at_k_list))
                    avg_map_at_k_list.append(statistics.mean(map_at_k_list))
                    avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))

                
                row = {
                    "factors": f,
                    "alpha": a,
                    "regularization": r,
                    "p_at_k_mean": statistics.mean(avg_p_at_k_list),  
                    "map_at_k_mean":statistics.mean(avg_map_at_k_list), 
                    "ndcg_at_k_mean":statistics.mean(avg_ndcg_at_k_list),
                    }
                results_per_run = pd.DataFrame([row])
                results_over_all = pd.concat([results_over_all, results_per_run], ignore_index=True)
    return results_over_all
                
              #  return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


In [222]:
results_of_als_grid = cross_validation_implicit(data_skills_row_norm_csr)
results_of_als_grid

100%|██████████| 15/15 [00:00<00:00, 105.74it/s]
100%|██████████| 101/101 [00:00<00:00, 3741.08it/s]
100%|██████████| 101/101 [00:00<00:00, 2804.20it/s]
100%|██████████| 101/101 [00:00<00:00, 2295.62it/s]
100%|██████████| 15/15 [00:00<00:00, 192.31it/s]
100%|██████████| 101/101 [00:00<00:00, 3257.12it/s]
100%|██████████| 101/101 [00:00<00:00, 3156.43it/s]
100%|██████████| 101/101 [00:00<00:00, 3482.15it/s]
100%|██████████| 15/15 [00:00<00:00, 159.58it/s]
100%|██████████| 101/101 [00:00<00:00, 1985.06it/s]
100%|██████████| 101/101 [00:00<00:00, 2657.89it/s]
100%|██████████| 101/101 [00:00<00:00, 2729.66it/s]
100%|██████████| 15/15 [00:00<00:00, 156.26it/s]
100%|██████████| 101/101 [00:00<00:00, 2463.55it/s]
100%|██████████| 101/101 [00:00<00:00, 2589.47it/s]
100%|██████████| 101/101 [00:00<00:00, 2061.53it/s]
100%|██████████| 15/15 [00:00<00:00, 211.28it/s]
100%|██████████| 101/101 [00:00<00:00, 2885.41it/s]
100%|██████████| 101/101 [00:00<00:00, 3257.80it/s]
100%|██████████| 101/101 [0

Unnamed: 0,factors,alpha,regularization,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,1,0.5,0.005,0.570055,0.48013,0.552944
1,1,0.5,0.01,0.568256,0.47831,0.551325
2,1,0.5,0.02,0.567724,0.477992,0.550915
3,1,1.0,0.005,0.567751,0.478264,0.551782
4,1,1.0,0.01,0.566586,0.477153,0.550738
5,1,1.0,0.02,0.567751,0.478264,0.551782
6,1,1.5,0.005,0.571134,0.480731,0.553795
7,1,1.5,0.01,0.57082,0.480434,0.5536
8,1,1.5,0.02,0.570611,0.480236,0.55347
9,5,0.5,0.005,0.476881,0.374682,0.456882


In [202]:
def add_result(results,name, p_at_k,map_at_k,ndcg_at_k):
    """This function appends a name, an average accuravy, an average balanced accuracy and a average confusion matrix as rows to a dataframe.

    Args:
        results (dataframe): A dataframe to append the other arguments as rows
        name (string): Identifier
        acc (list of floats): computes the average accuracy
        baccs (list of floats): computes the average balanced accuracy
        confusion_m (list of arrays): computes the average Confusion Matrix
    """
    row = {
        "name":name,
        "p_at_k_mean": statistics.mean(p_at_k),  
        "map_at_k_mean":statistics.mean(map_at_k), 
        "ndcg_at_k_mean":statistics.mean(ndcg_at_k),
        }
    return results.append(row,ignore_index=True)

results = pd.DataFrame()

In [203]:
def add_result2(results,f,a,r, p_at_k,map_at_k,ndcg_at_k):
    """This function appends a name, an average accuravy, an average balanced accuracy and a average confusion matrix as rows to a dataframe.

    Args:
        results (dataframe): A dataframe to append the other arguments as rows
        name (string): Identifier
        acc (list of floats): computes the average accuracy
        baccs (list of floats): computes the average balanced accuracy
        confusion_m (list of arrays): computes the average Confusion Matrix
    """
    row = {
        "factors":f,
        "alpha":a,
        "regularization":r,
        "p_at_k_mean": statistics.mean(p_at_k),  
        "map_at_k_mean":statistics.mean(map_at_k), 
        "ndcg_at_k_mean":statistics.mean(ndcg_at_k),
        }
    return results.append(row,ignore_index=True)

results = pd.DataFrame()

In [192]:
p_k, map_k, ndcg_k = cross_validation_implicit(data=data_skills_row_norm_csr)
results = add_result(results, "ALS",p_k,map_k, ndcg_k)
results

100%|██████████| 15/15 [00:01<00:00, 10.56it/s]
100%|██████████| 101/101 [00:00<00:00, 2970.68it/s]
100%|██████████| 101/101 [00:00<00:00, 3254.62it/s]
100%|██████████| 101/101 [00:00<00:00, 2658.10it/s]
100%|██████████| 15/15 [00:00<00:00, 16.22it/s]
100%|██████████| 101/101 [00:00<00:00, 3258.08it/s]
100%|██████████| 101/101 [00:00<00:00, 3258.20it/s]
100%|██████████| 101/101 [00:00<00:00, 3156.24it/s]
100%|██████████| 15/15 [00:01<00:00, 11.14it/s]
100%|██████████| 101/101 [00:00<00:00, 3156.50it/s]
100%|██████████| 101/101 [00:00<00:00, 2729.91it/s]
100%|██████████| 101/101 [00:00<00:00, 2885.69it/s]
100%|██████████| 15/15 [00:01<00:00, 12.67it/s]
100%|██████████| 101/101 [00:00<00:00, 2349.18it/s]
100%|██████████| 101/101 [00:00<00:00, 2348.81it/s]
100%|██████████| 101/101 [00:00<00:00, 2725.96it/s]
100%|██████████| 15/15 [00:00<00:00, 15.03it/s]
100%|██████████| 101/101 [00:00<00:00, 3156.83it/s]
100%|██████████| 101/101 [00:00<00:00, 2970.97it/s]
100%|██████████| 101/101 [00:00<

Unnamed: 0,name,p_at_k_mean,map_at_k_mean,ndcg_at_k_mean
0,ALS,0.418204,0.309401,0.40115
1,ALS,0.418204,0.309401,0.40115
2,ALS,0.418204,0.309401,0.40115
3,ALS,0.418204,0.309401,0.40115


In [180]:
from sklearn.model_selection import GridSearchCV


In [None]:

def cross_validation_implicit(data):
    avg_p_at_k_list = []
    avg_map_at_k_list = []
    avg_ndcg_at_k_list = []

    #Note: Shuffle is set on false, because it leads to ambiguous results regarding the map_at_k and the ndcg_at_k. The 
    # However p_at_k is not affected by the shuffle and stays consistent.
    skf = KFold(n_splits= folds , shuffle=False) 

    for r_seed in range(0,5):
        # Lists to store the results for each random_seed
        p_at_k_list = []
        map_at_k_list = []
        ndcg_at_k_list = []
        model = implicit.als.AlternatingLeastSquares(factors = 10, random_state= r_seed)

        ###########################
        # Split the data
        ###########################
        for train_index, test_index in skf.split(data):
            X_train = data[train_index]
            X_test = data[test_index]
            
            ###########################
            # Shuffle manually
            ###########################

            index = np.arange(np.shape(X_train)[0])
            np.random.RandomState(seed = 1).shuffle(index)
            
            X_train = X_train[index, :] 
            model.fit(X_train)

            #############
            # Fifth Step: We predict on the scaled test data
            #############
            p_at_k = precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
            map_at_k  = mean_average_precision_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)
            var_ndcg_at_k = ndcg_at_k(model, train_user_items=X_train, test_user_items=X_test, K=5)

            # results to list
            p_at_k_list.append(p_at_k)
            map_at_k_list.append(map_at_k) 
            ndcg_at_k_list.append(var_ndcg_at_k) #store as df to create the avg confusion matrix later


        # store the results for each random_seed
        # After each iteration in the random_seed loop the lists (acc_list, bacc_list and confusion_m_list) are cleaned. Therefore we can do the following computations to store the average results for each random_seed:
        # Store the averages per random_seed in list. 
        avg_p_at_k_list.append(statistics.mean(p_at_k_list))
        avg_map_at_k_list.append(statistics.mean(map_at_k_list))
        avg_ndcg_at_k_list.append(statistics.mean(ndcg_at_k_list))


    return avg_p_at_k_list,avg_map_at_k_list,avg_ndcg_at_k_list


The model was fitted on the whole dataset.
Try to do a train test / cross valdiation later on.
* see: https://gist.github.com/jbochi/2e8ddcc5939e70e5368326aa034a144e
* evaluation of the implicit lib: https://github.com/benfred/implicit/blob/main/implicit/evaluation.pyx