In [2]:
# Ignore warnings
# NOTE(review): no category or module filter is given, so this hides every
# warning raised after this point, deprecation warnings from the libraries
# imported below included.
import warnings
warnings.filterwarnings('ignore')

import pathlib
# Root directory that holds the data files used by this notebook
DATA_FOLDER = pathlib.Path("/home/jovyan/work/Dan/data")
# Location of the pickled table that is loaded in the next cell
TABLE_PATH = DATA_FOLDER.joinpath('pwdb/pickle/df_columns_labels.pkl')

# Math stuff
import numpy as np

# Data visualisation
import pandas as pd

# Vectorization algorithm
from gensim.models import Word2Vec

# Support Vector Classifier Algorithm
from sklearn.svm import SVC

# Intermediate steps of the pipeline must be ‘transforms’,
# that is, they must implement fit and transform methods.
from sklearn.pipeline import Pipeline

# Splitting data into train and test
from sklearn.model_selection import train_test_split

# Metrics Evaluation Methods
from sklearn.metrics import (confusion_matrix, f1_score, precision_score, accuracy_score,
                             recall_score, log_loss, mean_squared_error, mean_absolute_error)

In [3]:
# Read the pickled DataFrame stored at TABLE_PATH.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
df = pd.read_pickle(TABLE_PATH)
# Show the first rows as the cell output
df.head()

Unnamed: 0,Concatinated Data (clean),Category,Type of measure,Target groups,One person or microenterprises,Self-employed,Solo-self-employed,SMEs,Sector specific set of companies,SMEs.1,...,Youth (18-25) in employment,Workers in care facilities,Older people in employment (aged 55+),Undeclared workers,The COVID-19 risk group,Other groups of citizens,Disabled workers,Youth (18-25) in employment.1,Female workers,Contractors of a company
0,"[hardship, case, fund, safety, net, selfemploy...",2,0,"One person or microenterprises, Self-employed,...",1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[state, support, tourism, access, financeas, t...",0,0,"SMEs, Sector specific set of companies",0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,"[bank, guarantees, smes, oneperson, enterprise...",0,0,"One person or microenterprises, SMEs",1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,"[emergency, measures, relating, shorttime, wor...",6,0,"Employees in standard employment, Other groups...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[airbus, agreement, making, unworked, hours, p...",3,2,"Employees in standard employment, Larger corpo...",0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Independent data: the text column that is later split and fed to the model
columns = df['Concatinated Data (clean)']
# Label data: the 'Category' column used as the prediction target
category = df['Category']

## word2vec model

In [5]:
# Load the trained Word2Vec model from the shared data folder instead of
# repeating the absolute path a second time.
model = Word2Vec.load(str(DATA_FOLDER / 'pwdb/word2vec/df.model'))

# gensim 4 renamed `wv.index2word` to `wv.index_to_key` and dropped `wv.syn0`
# in favour of `wv.vectors`; use the new names when available and fall back
# to the old ones so the cell runs on either major version.
word_vectors = model.wv
vocabulary = getattr(word_vectors, 'index_to_key', None)
if vocabulary is None:
    vocabulary = word_vectors.index2word
embeddings = getattr(word_vectors, 'vectors', None)
if embeddings is None:
    embeddings = word_vectors.syn0

# Plain dict: word -> embedding vector, consumed by MeanEmbeddingVectorizer
w2v_dict = dict(zip(vocabulary, embeddings))

In [6]:
class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its word embeddings.

    A document is an iterable of tokens. Tokens missing from ``word2vec`` are
    skipped; a document with no known token becomes a zero vector of length
    ``dim``.

    :word2vec: mapping from token to embedding vector (supports ``len``,
               iteration, ``in`` and ``[]``)
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # Take the dimensionality from the mapping passed in, not from a
            # notebook-level global, so the class works with any embedding dict.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` so calls can be chained.

        ``y`` is optional so the vectorizer can also be fitted without labels.
        """
        return self

    def transform(self, X):
        """Return an array with one averaged embedding per document in ``X``."""
        return np.array([self._mean_vector(words) for words in X])

    def _mean_vector(self, words):
        """Average the embeddings of the known tokens in one document."""
        known = [self.word2vec[w] for w in words if w in self.word2vec]
        if not known:
            # No token of this document has an embedding
            return np.zeros(self.dim)
        return np.mean(known, axis=0)


## SVM Model training

In [7]:
# Chain the two stages so that fit/predict run them one after the other
svm_w2v = Pipeline(steps=[
    # Stage 1: average the word vectors of every document
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_dict)),
    # Stage 2: support vector classifier with its default settings
    ("SVM", SVC()),
])

In [8]:
# Shuffle, then hold out 30% of the rows for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    columns,
    category,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [9]:
# Fit the pipeline (vectorizer step, then SVC) on the training split
svm_w2v.fit(X_train, y_train)

Pipeline(steps=[('word2vec vectorizer',
                 <__main__.MeanEmbeddingVectorizer object at 0x7f8e7479b310>),
                ('SVM', SVC())])

In [10]:
# SVMs tend to perform poorly on imbalanced data; the classes should be
# (close to) balanced to achieve good results.
prediction = svm_w2v.predict(X_test)
# Display the predicted labels as the cell output
prediction

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0,
       0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0])

## Evaluation metrics

In [15]:
def confusion_matrix(actual, prediction):
    """
    assuming we have actual test and predicted labels
    and we want to see the confusion matrix of those 2 labels
    :actual: the real test label
    :prediction: predicted label
    :return: the confusion matrix computed by scikit-learn
    """
    # This function has the same name as the scikit-learn function imported at
    # the top of the notebook and therefore shadows it. Calling
    # `confusion_matrix` here would call this function again and recurse until
    # RecursionError, so the library function is imported under an alias.
    from sklearn.metrics import confusion_matrix as sklearn_confusion_matrix

    return sklearn_confusion_matrix(actual, prediction)


def evaluation(actual, prediction, title):
    """
    Build a small table of evaluation scores that compares the predicted
    labels with the real ones.
    :actual: the real test label
    :prediction: predicted label
    :title: name of the column that holds the metric values
    :return: DataFrame with one row per metric
    """
    # Precision, recall and F1 are all macro-averaged
    macro = {"average": "macro"}

    # (row label, value) pairs, in the order they appear in the table
    scores = [
        ('Accuracy', accuracy_score(actual, prediction)),
        ('Precission', precision_score(actual, prediction, **macro)),
        ('Recall', recall_score(actual, prediction, **macro)),
        ('F1 Score', f1_score(actual, prediction, **macro)),
        ('Mean Absolute Error', mean_absolute_error(actual, prediction)),
        ('Mean Squared Error', mean_squared_error(actual, prediction)),
    ]
    metric_names = [name for name, _ in scores]
    metric_values = [value for _, value in scores]

    return pd.DataFrame({'Evaluation Metrics': metric_names,
                         title: metric_values})

In [16]:
# Score the predictions against the held-out labels; values go in a column named 'Category'
evaluation(y_test, prediction, 'Category')

TypeError: evaluation() takes 2 positional arguments but 3 were given

Unnamed: 0,Evaluation Metrics,Category
0,Accuracy,0.370107
1,Precission,0.084101
2,Recall,0.167901
3,F1 Score,0.109338
4,Mean Absolute Error,2.960854
5,Mean Squared Error,16.320285


In [18]:
# with mlflow.start_run():
#     # Use pipes to implement steps of fit and transform method
#     svm_w2v = Pipeline([
#         # Add the words we want to mean
#         ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_dict)),
#         # Use SVC algorithm
#         ("SVM", SVC())])
#
#     svm_w2v.fit(X_train, y_train)
#     prediction = svm_w2v.predict(X_test)
#     accuracy = accuracy_score(y_test, prediction)
#
#     print(accuracy)
#
#     mlflow.log_metric('accuracy', accuracy)
#     mlflow.sklearn.log_model(svm_w2v, "model")
#     modelpath = "" # ???
#     mlflow.sklearn.save_model(svm_w2v, modelpath)