## Implicit Library

In [10]:
from implicit.evaluation import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
import implicit
import h5py
import pandas as pd
import numpy as np
import random
from matplotlib import pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
from sklearn.pipeline import Pipeline
import time
from numpy.random import permutation
from implicit.nearest_neighbours import (
    BM25Recommender,
    CosineRecommender,
    TFIDFRecommender,
    bm25_weight,
)
import scipy
from scipy.sparse import csr_matrix

## Preprocessing

In [12]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [13]:
data = df

In [14]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [15]:
data = df

In [16]:
matrix = pd.read_csv("data/origin_binary_matrix.csv")

In [17]:
matrix

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Normalizing the rows

In [18]:
data_skills  = matrix
data_skills 

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
400,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
402,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In the next step we will compute the item-item relationships of our skills. Our final goal here is to construct a new item by item matrix containing the weights (relationships) between each of our skills where a perfect correlation equals 1 and no correlation at all equals 0.
<br>
In order to do so, we will first normalize the user vectors. The idea behind this approach is, that a user with many skills contributes less to any individual skill. For example if a user, that rules only 3 skills, rules a skill X it is more valueable than if a user, that rules 20 skills, that particular skill.


* First we caclulat the magnitude for every user

In [19]:
# magnitude = sqrt(x2 + y2 + z2 + ...)
magnitude = np.sqrt(np.square(data_skills).sum(axis=1))

In [20]:
magnitude

0       8.774964
1       8.602325
2       5.567764
3       5.567764
4       1.000000
         ...    
399     2.828427
400     4.358899
401     3.000000
402     2.645751
403    12.206556
Length: 404, dtype: float64

* Now we use a users magnitude to normalize the ratings of this corresponding user

In [21]:
data_skills_row_norm=  data_skills.divide(magnitude, axis='index')

In [22]:
data_skills_row_norm

Unnamed: 0,.NET Compact Framework,.NET Core,.NET Framework,3D-Modellierung,ABAP,ADO.NET,AIX,ARIS,ARIS ITArchitect,AS400,...,ramda.js,ranorex,samba,script.aculo.us,varnish,visual paradigm,vnc,vs code,xHTML,xtCommerce
0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,0.0,0.000000,0.116248,0.000000,0.000000,0.116248,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,0.0,0.000000,0.000000,0.179605,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.179605,0.0
3,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
4,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
400,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
401,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
402,0.0,0.377964,0.377964,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


### Create sparse Matrix representation for the implicit lib

In [23]:
data_skills_row_norm_csr = scipy.sparse.csr_matrix(data_skills_row_norm.values)


In [24]:
data_skills_row_norm_csr?

[1;31mType:[0m        csr_matrix
[1;31mString form:[0m
(0, 22)	0.11396057645963795
           (0, 30)	0.11396057645963795
           (0, 35)	0.11396057645963795
           (0, 4 <...>  (403, 713)	0.08192319205190406
           (403, 719)	0.08192319205190406
           (403, 730)	0.08192319205190406
[1;31mFile:[0m        c:\users\maximus\envs\ds_ap\lib\site-packages\scipy\sparse\_csr.py
[1;31mDocstring:[0m  
Compressed Sparse Row matrix

This can be instantiated in several ways:
    csr_matrix(D)
        with a dense matrix or rank-2 ndarray D

    csr_matrix(S)
        with another sparse matrix S (equivalent to S.tocsr())

    csr_matrix((M, N), [dtype])
        to construct an empty matrix with shape (M, N)
        dtype is optional, defaulting to dtype='d'.

    csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)])
        where ``data``, ``row_ind`` and ``col_ind`` satisfy the
        relationship ``a[row_ind[k], col_ind[k]] = data[k]``.

    csr_matrix((data, indices, ind

In [25]:
# initialize a model
model = implicit.als.AlternatingLeastSquares(factors=64) 

In [26]:
# train the model on a sparse matrix of user/item/confidence weights
model.fit(data_skills_row_norm_csr)

100%|██████████| 15/15 [00:02<00:00,  7.36it/s]


In [27]:
model.fit?

[1;31mSignature:[0m [0mmodel[0m[1;33m.[0m[0mfit[0m[1;33m([0m[0muser_items[0m[1;33m,[0m [0mshow_progress[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m [0mcallback[0m[1;33m=[0m[1;32mNone[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Factorizes the user_items matrix.

After calling this method, the members 'user_factors' and 'item_factors' will be
initialized with a latent factor model of the input data.

The user_items matrix does double duty here. It defines which items are liked by which
users (P_ui in the original paper), as well as how much confidence we have that the user
liked the item (C_ui).

The negative items are implicitly defined: This code assumes that positive items in the
user_items matrix means that the user liked the item. The negatives are left unset in this
sparse matrix: the library will assume that means Piu = 0 and Ciu = 1 for all these items.
Negative items can also be passed with a higher confidence value by passing a negative
va

In [28]:
userid= 13
dict_user_skills = {}
for value, name in zip(data_skills_row_norm.iloc[userid,:], data_skills_row_norm.columns):
    if value > 0:
        dict_user_skills[name] = value
    else:
        next

dict_user_skills

{'Android': 0.23570226039551587,
 'Apache': 0.23570226039551587,
 'GIT': 0.23570226039551587,
 'JavaScript': 0.23570226039551587,
 'Joomla': 0.23570226039551587,
 'Microsoft SQL Server': 0.23570226039551587,
 'MySQL': 0.23570226039551587,
 'Oracle': 0.23570226039551587,
 'PHP': 0.23570226039551587,
 'PhpStorm': 0.23570226039551587,
 'Pimcore': 0.23570226039551587,
 'SUSE Linux': 0.23570226039551587,
 'Subversion': 0.23570226039551587,
 'Symfony': 0.23570226039551587,
 'Ubuntu': 0.23570226039551587,
 'Windows 10': 0.23570226039551587,
 'Windows 7': 0.23570226039551587,
 'Windows NT/2000/XP': 0.23570226039551587}

### Create a df that holds the recommendations and the scores 
* Die Scores sind in der implicip library equivalent zu der confidence. Die Confidence gibt an, wie häufig ein item x konsumiert wurde. Allerdings soll die Intensität, mit der dieser Effekt steigt, sukkzessive abnehmen, um den Einfluss eines Superfans (jmd der überdurchschnittkich viel von Item x konsumiert) zu schwächen. Hier sind parallelen zu den Motiven der Row-Normalisation zu erkennen. <br>
Ich habe im ersten Lauf die Werte meiner row-normalisation als confidence werte verwendet. <br>
    * Sollte nach angepasst werden. Hierzu: https://en.wikipedia.org/wiki/Okapi_BM25

Maybe it makes sense <br>
"
Confidence can be defined as the worth or the value we give to the interaction. For User A buying(a transaction event) item X we increase the interaction weight, while User A viewing item Z has lesser weight than the ‘interaction of buying’. https://towardsdatascience.com/alternating-least-square-for-implicit-dataset-with-code-8e7999277f4b " 

In [29]:
skill_ids, scores  = model.recommend(userid, data_skills_row_norm_csr[userid], N = 20 , filter_already_liked_items=False)
recommendations_df = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_ids], "score": scores, "already_liked": np.in1d(skill_ids, data_skills_row_norm_csr[userid].indices)})
recommendations_df

Unnamed: 0,skill,score,already_liked
0,PHP,1.188693,True
1,Windows 10,1.046153,True
2,PhpStorm,0.86493,True
3,Microsoft SQL Server,0.859961,True
4,Windows 7,0.781825,True
5,Windows NT/2000/XP,0.72648,True
6,Subversion,0.677786,True
7,JavaScript,0.656517,True
8,Android,0.594309,True
9,Symfony,0.585106,True


In [30]:
skill_and_categories = df[["skill","category"]].drop_duplicates()

In [31]:
recommendations_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,already_liked,category
0,PHP,1.188693,True,Webentwicklung
1,Windows 10,1.046153,True,Betriebssystem
2,PhpStorm,0.86493,True,Umgebungen
3,Microsoft SQL Server,0.859961,True,Datenbank
4,Windows 7,0.781825,True,Betriebssystem
5,Windows NT/2000/XP,0.72648,True,Betriebssystem
6,Subversion,0.677786,True,"CI/CD, Build- und Versionskontrollsysteme"
7,JavaScript,0.656517,True,Programmiersprachen / Scriptsprachen
8,Android,0.594309,True,Betriebssystem
9,Symfony,0.585106,True,PHP


In [32]:
# Id and skills
dict_skills_id = {}
for c,skill in enumerate(data_skills_row_norm.columns):
        dict_skills_id[c] = skill

dict_skills_id

{0: '.NET Compact Framework',
 1: '.NET Core',
 2: '.NET Framework',
 3: '3D-Modellierung',
 4: 'ABAP',
 5: 'ADO.NET',
 6: 'AIX',
 7: 'ARIS',
 8: 'ARIS ITArchitect',
 9: 'AS400',
 10: 'ASP Generalist',
 11: 'ASP.NET',
 12: 'ASP.NET MVC-Framework',
 13: 'ASP.NET WebAPI',
 14: 'AWS',
 15: 'AWS Lambda',
 16: 'Abstract',
 17: 'Access',
 18: 'Accessibility / WCAG',
 19: 'Active Directory',
 20: 'ActiveX',
 21: 'Adobe CC',
 22: 'Adobe Flash',
 23: 'Adobe Illustrator',
 24: 'Adobe InDesign',
 25: 'Adobe Indesign',
 26: 'Adobe Photoshop',
 27: 'Adobe Premiere',
 28: 'Adobe XD',
 29: 'After Effects',
 30: 'Agile Methoden',
 31: 'Ajax',
 32: 'Alexa-Skills',
 33: 'Alpine',
 34: 'Analytics',
 35: 'Android',
 36: 'Android Studio',
 37: 'Anforderungsanalyse',
 38: 'Anforderungsmanagement',
 39: 'Angular (2 und höher)',
 40: 'Angular Generalist',
 41: 'Angular Material',
 42: 'Angular Theming',
 43: 'AngularJS',
 44: 'Animations (transition, @keyframes)',
 45: 'Ansible',
 46: 'Ant',
 47: 'Apache',
 4

In [33]:
keys = [k for k, v in dict_skills_id.items() if v == 'Python']
print(keys)

[458]


### Display simular items

In [34]:
# find related item
skill_id, sim = model.similar_items(0, N=10)
# display the results using pandas for nicer formatting
simularity_df  = pd.DataFrame({"skill": data_skills_row_norm.columns[skill_id], "score": sim})
simularity_df.merge(skill_and_categories, how = "left", left_on = "skill", right_on = "skill")

Unnamed: 0,skill,score,category
0,.NET Compact Framework,1.0,.NET Frameworks und Tools
1,Silverlight,0.704862,.NET Frameworks und Tools
2,ASP Generalist,0.680982,.NET Frameworks und Tools
3,ADO.NET,0.667087,.NET Frameworks und Tools
4,Windows Communication Foundation (WCF),0.65109,.NET Frameworks und Tools
5,Windows Forms,0.646928,.NET Frameworks und Tools
6,Windows Presentation Foundation (WPF),0.619443,.NET Frameworks und Tools
7,VB.NET,0.61229,Programmiersprachen / Scriptsprachen
8,DirectX,0.611853,Technik/Tools
9,DevExpress,0.604337,.NET Frameworks und Tools


# Add a Evaluation metric to the library

### APK
http://sdsawtelle.github.io/blog/output/mean-average-precision-MAP-for-recommender-systems.html

In [35]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average prescision at k between two lists of
    items.
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    This function computes the mean average prescision at k between two lists
    of lists of items.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [36]:
###
# Recommendations for user_id = 13
###
recommendations_list = list(recommendations_df.skill)
known_skills_list = list(dict_user_skills.keys())
len(known_skills_list)

18

In [37]:
apk(known_skills_list,recommendations_list,k=20)

0.8888888888888888

## Train-Test split

In [38]:
# train the model on a sparse matrix of user/item/confidence weights
train, test = train_test_split(data_skills_row_norm_csr, train_percentage =  0.8, random_state = 1)
model.fit(train)

100%|██████████| 15/15 [00:01<00:00,  7.83it/s]


In [39]:
from implicit.evaluation import precision_at_k,mean_average_precision_at_k

In [40]:
results = precision_at_k(model, train_user_items=train, test_user_items=test, K=5)
results

100%|██████████| 378/378 [00:00<00:00, 2762.70it/s]


0.25732708089097306

In [41]:
results = mean_average_precision_at_k(model, train_user_items=train, test_user_items=test, K=5)
results

100%|██████████| 378/378 [00:00<00:00, 3170.39it/s]


0.16656305114638476

In [42]:
mean_average_precision_at_k?

[1;31mDocstring:[0m
mean_average_precision_at_k(model, train_user_items, test_user_items, int K=10, show_progress=True, int num_threads=1)
Calculates MAP@K for a given trained model

   Parameters
   ----------
   model : RecommenderBase
       The fitted recommendation model to test
   train_user_items : csr_matrix
       Sparse matrix of user by item that contains elements that were used in training the model
   test_user_items : csr_matrix
       Sparse matrix of user by item that contains withheld elements to test on
   K : int
       Number of items to test on
   show_progress : bool, optional
       Whether to show a progress bar
   num_threads : int, optional
       The number of threads to use for testing. Specifying 0 means to default
       to the number of cores on the machine. Note: aside from the ALS and BPR
       models, setting this to more than 1 will likely hurt performance rather than
       help.

   Returns
   -------
   float
       the calculated MAP@k
   
[1;

## Crossvalidation

In [43]:
df = pd.read_csv("person-skills_2022-06-27.csv",sep=";") 
#df = [~df["category"].isin(["Betriebssystem", "Dienste", "Einsatzfelder / Erfahrungen / Schwerpunkte", "Rolle (intern)"])]
df.head(5)

Unnamed: 0,person,skill,category
0,12,Windows NT/2000/XP,Betriebssystem
1,12,MAC OS X,Betriebssystem
2,12,Windows 7,Betriebssystem
3,12,Windows 8,Betriebssystem
4,12,iOS,Betriebssystem


In [44]:
df = df.drop(columns=["category"])

In [55]:
data = df
data.head(5)

Unnamed: 0,person,skill
0,12,Windows NT/2000/XP
1,12,MAC OS X
2,12,Windows 7
3,12,Windows 8
4,12,iOS


In [57]:
NUM_TRIALS = 5
NUM_OUTER_SPLITS = 5

In [60]:
from sklearn.model_selection import cross_validate,KFold
from sklearn.metrics import make_scorer


In [69]:
cross_validate(estimator=implicit.als.AlternatingLeastSquares(),X= data_skills_row_norm_csr ,scoring=make_scorer(precision_at_k),cv=5)

TypeError: Cannot clone object '<implicit.cpu.als.AlternatingLeastSquares object at 0x00000247E7F8B130>' (type <class 'implicit.cpu.als.AlternatingLeastSquares'>): it does not seem to be a scikit-learn estimator as it does not implement a 'get_params' method.

The model was fitted on the whole dataset.
Try to do a train test / cross valdiation later on.
* see: https://gist.github.com/jbochi/2e8ddcc5939e70e5368326aa034a144e
* evaluation of the implicit lib: https://github.com/benfred/implicit/blob/main/implicit/evaluation.pyx