In [11]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

import pathlib
# Root directory under which this notebook looks up its input files.
DATA_FOLDER = pathlib.Path("/home/jovyan/work/Dan/data")
# Pickled DataFrame read by the data-loading cell below.
TABLE_PATH = DATA_FOLDER.joinpath('pwdb', 'pickle', 'df_columns_labels.pkl')

# Math stuff
import numpy as np

# Data loading and manipulation
import pandas as pd

# Word-embedding (vectorization) algorithm
from gensim.models import Word2Vec

# K-Nearest Neighbors classifier algorithm
from sklearn.neighbors import KNeighborsClassifier as KNC

# Intermediate steps of the pipeline must be ‘transforms’,
# that is, they must implement fit and transform methods.
from sklearn.pipeline import Pipeline

# Splitting data into train and test
from sklearn.model_selection import train_test_split

# Metrics Evaluation Methods
from sklearn.metrics import (confusion_matrix, f1_score, precision_score, accuracy_score,
                             recall_score, log_loss, mean_squared_error, mean_absolute_error)

In [2]:
# Load the pickled DataFrame stored at TABLE_PATH and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code — only load files from a
# trusted source.
df = pd.read_pickle(TABLE_PATH)
df.head()

Unnamed: 0,Concatinated Data (clean),Category,Type of measure,Target groups,One person or microenterprises,Self-employed,Solo-self-employed,SMEs,Sector specific set of companies,SMEs.1,...,Youth (18-25) in employment,Workers in care facilities,Older people in employment (aged 55+),Undeclared workers,The COVID-19 risk group,Other groups of citizens,Disabled workers,Youth (18-25) in employment.1,Female workers,Contractors of a company
0,"[hardship, case, fund, safety, net, selfemploy...",2,0,"One person or microenterprises, Self-employed,...",1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[state, support, tourism, access, financeas, t...",0,0,"SMEs, Sector specific set of companies",0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"[bank, guarantees, smes, oneperson, enterprise...",0,0,"One person or microenterprises, SMEs",1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,"[emergency, measures, relating, shorttime, wor...",6,0,"Employees in standard employment, Other groups...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[airbus, agreement, making, unworked, hours, p...",3,2,"Employees in standard employment, Larger corpo...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Independent data: the cleaned, concatenated text column
# (the preview above shows one list of tokens per row).
columns = df.loc[:, 'Concatinated Data (clean)']
# Label data: every column from position 4 onwards
# (the per-target-group 0/1 indicator columns in the preview above).
target_groups = df.loc[:, df.columns[4:]]

In [8]:
# Load the previously trained Word2Vec model from disk. Nothing is trained in
# this cell; vocabulary and vector size are whatever the saved model contains.
# The path is derived from DATA_FOLDER so the data location is set in one place.
W2V_MODEL_PATH = DATA_FOLDER / 'pwdb/word2vec/df.model'
model = Word2Vec.load(str(W2V_MODEL_PATH))

# `index2word` and `syn0` are deprecated aliases that gensim 4 removed in
# favour of `index_to_key` and `vectors`. Prefer the current names and fall
# back to the legacy ones so the cell runs on either major version.
keyed_vectors = model.wv
vocabulary = getattr(keyed_vectors, 'index_to_key', None)
if vocabulary is None:
    vocabulary = keyed_vectors.index2word
embeddings = getattr(keyed_vectors, 'vectors', None)
if embeddings is None:
    embeddings = keyed_vectors.syn0

# Token -> embedding vector lookup consumed by MeanEmbeddingVectorizer.
w2v_dict = {w: vec for w, vec in zip(vocabulary, embeddings)}


In [9]:
class MeanEmbeddingVectorizer(object):
    """Turn each tokenised document into the mean of its word vectors.

    Parameters
    ----------
    word2vec : mapping
        Lookup from a token to its embedding vector. Must support ``len``,
        iteration over keys, ``in`` and item access. The embedding dimension
        is taken from the first entry of this mapping.
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # Read the dimension from the mapping passed in, not from a
            # notebook-level global, so the class works with any embedding.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """No-op fit: there is nothing to learn. Returns ``self``.

        ``y`` is optional so the transformer can also be fitted without labels.
        """
        return self

    def transform(self, X):
        """Return one vector per document in ``X``.

        Each document is an iterable of tokens. Tokens missing from the
        lookup are ignored; a document with no known tokens is mapped to a
        zero vector of length ``self.dim``.
        """
        return np.array([
            # `or` swaps in a single zero vector when the list of known-token
            # vectors is empty, so np.mean always has something to average.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [12]:
# Chain the two stages into one estimator so that fit/predict run them in order.
knn_w2v = Pipeline(steps=[
    # Stage 1: average the word vectors of each document.
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_dict)),
    # Stage 2: k-nearest-neighbours classifier with its default settings.
    ("KNN", KNC()),
])

In [15]:
# Hold out 30% of the rows for testing; the fixed seed makes the shuffled
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    columns,
    target_groups,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [19]:
# Fit the whole pipeline (vectorizer, then KNN) on the training split.
knn_w2v.fit(X_train, y_train)


Pipeline(steps=[('word2vec vectorizer',
                 <__main__.MeanEmbeddingVectorizer object at 0x7fc104a06430>),
                ('KNN', KNeighborsClassifier())])

In [21]:
# Predict the label columns for the held-out documents and display the result.
prediction = knn_w2v.predict(X_test)
prediction

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
class EvaluationMetrics:
    """Pair ground-truth labels with predictions and score them.

    Parameters
    ----------
    actual
        True labels, passed unchanged to the sklearn metric functions.
    prediction
        Predicted labels, passed unchanged to the sklearn metric functions.
    """

    def __init__(self, actual, prediction):
        self.prediction = prediction
        self.actual = actual

    def confusion_matrix(self):
        """Return sklearn's confusion matrix for the stored labels.

        NOTE(review): sklearn's ``confusion_matrix`` does not accept
        multilabel-indicator targets such as the multi-column label frame
        used in this notebook — confirm before calling it on that data.
        """
        # Inside the method body the bare name resolves to the module-level
        # sklearn function, not to this method.
        return confusion_matrix(self.actual, self.prediction)

    def evaluation(self, column):
        """Build a two-column table of metric names and their values.

        Parameters
        ----------
        column
            Header to use for the column that holds the metric values.

        Returns
        -------
        pandas.DataFrame
            One row per metric: accuracy, macro-averaged precision, recall
            and F1, mean absolute error and mean squared error.
        """
        y_true, y_pred = self.actual, self.prediction

        # (row label, value) pairs in display order; the label spellings are
        # emitted in the output table and are kept exactly as they were.
        scores = [
            ('Accuracy', accuracy_score(y_true, y_pred)),
            ('Precission', precision_score(y_true, y_pred, average="macro")),
            ('Recall', recall_score(y_true, y_pred, average="macro")),
            ('F1 Score', f1_score(y_true, y_pred, average="macro")),
            ('Mean Absolute Error', mean_absolute_error(y_true, y_pred)),
            ('Mean Squared Error', mean_squared_error(y_true, y_pred)),
        ]
        labels, values = zip(*scores)

        return pd.DataFrame({'Evaluation Metrics': list(labels),
                             column: list(values)})

In [25]:
# Score the held-out predictions against the true labels; the cell displays
# the resulting metrics table with the values under a "Target groups" column.
evaluate = EvaluationMetrics(y_test, prediction)
evaluate.evaluation("Target groups")

Unnamed: 0,Evaluation Metrics,Target groups
0,Accuracy,0.067616
1,Precission,0.031674
2,Recall,0.009911
3,F1 Score,0.013355
4,Mean Absolute Error,0.023081
5,Mean Squared Error,0.023081
