In [24]:
# Ignore warnings
# NOTE: 'ignore' with no category silences every warning for the whole session,
# including deprecation warnings raised by the libraries used below.
import warnings
warnings.filterwarnings('ignore')

# Reusable classes
# Extend the module search path with the project's source folder.
# NOTE(review): presumably this is where `evaluation_metrics` (imported below) lives - confirm.
# The path is absolute and machine-specific; adjust it when running elsewhere.
import sys
sys.path.append("/home/jovyan/work/Dan/upload/src/")

# Data path
# Root folder of the data sets (absolute, machine-specific) and the pickled
# DataFrame that this notebook reads.
import pathlib
DATA_FOLDER = pathlib.Path("/home/jovyan/work/Dan/data")
TABLE_PATH = DATA_FOLDER / 'pwdb/pickle/df_columns_labels.pkl'

# Math stuff
import numpy as np
from numpy.core.records import ndarray

# Data manipulation
import pandas as pd
from pandas import Series

# Vectorize algorithm
from gensim.models import Word2Vec

# Support Vector Classifier Algorithm
from sklearn.svm import SVC

# Intermediate steps of the pipeline must be ‘transforms’,
# that is, they must implement fit and transform methods.
from sklearn.pipeline import Pipeline

# Splitting data into train and test
from sklearn.model_selection import train_test_split

# Metrics Evaluation Methods
from evaluation_metrics import model_evaluation_metrics

In [25]:
# Read data
# Load the pickled DataFrame stored at TABLE_PATH.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
df = pd.read_pickle(TABLE_PATH)
# Preview the first rows
df.head()

Unnamed: 0,Concatenated Data (clean),Category,Type of measure,Target groups,One person or microenterprises|Self-employed|Solo-self-employed,SMEs|Sector specific set of companies,One person or microenterprises|SMEs,Employees in standard employment|Other groups of workers,Employees in standard employment|Larger corporations,Employees in standard employment,...,Companies providing essential services|Employees in standard employment|Workers in essential services,Employees in standard employment|One person or microenterprises|SMEs,Employees in standard employment|Larger corporations|Other businesses,Children (minors)|Disabled|Older citizens|Parents|SMEs|Single parents|Single parents in employment|The COVID-19 risk group|Workers in care facilities|Workers in essential services,Employees in standard employment|Workers in care facilities|Workers in non-standard forms of employment,Companies providing essential services|Workers in essential services,Contractors of a company,Other businesses|Unemployed,Seasonal workers|Workers in non-standard forms of employment,Employees in standard employment|Particular professions
0,"[hardship, case, fund, safety, net, selfemploy...",2,0,One person or microenterprises|Self-employed|S...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"[state, support, tourism, access, financeas, t...",0,0,SMEs|Sector specific set of companies,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[bank, guarantees, smes, oneperson, enterprise...",0,0,One person or microenterprises|SMEs,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[emergency, measures, relating, shorttime, wor...",6,0,Employees in standard employment|Other groups ...,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[airbus, agreement, making, unworked, hours, p...",3,2,Employees in standard employment|Larger corpor...,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Independent data (model input): the cleaned, concatenated text column.
# The preview above shows each cell as a list of tokens - assumed to be real Python lists, TODO confirm.
columns = df['Concatenated Data (clean)']
# Label data (prediction target): the 'Type of measure' column, shown as integer codes in the preview.
category = df['Type of measure']

## word2vec model

In [28]:
# Load the previously trained word2vec model from disk; nothing is trained in this cell.
# NOTE(review): the original note said the model was built with a minimal word count of 10
# and 300-dimensional word vectors - that is fixed in the saved file and cannot be verified here.
W2V_MODEL_PATH = DATA_FOLDER / 'pwdb/word2vec/df.model'
model = Word2Vec.load(str(W2V_MODEL_PATH))

# Map every vocabulary word to its embedding vector.
# gensim 4 renamed `index2word` -> `index_to_key` and `syn0` -> `vectors`;
# look the attributes up defensively so the cell runs on both gensim 3.x and 4.x.
_wv = model.wv
_vocabulary = getattr(_wv, 'index_to_key', None)
if _vocabulary is None:
    _vocabulary = _wv.index2word
_vectors = getattr(_wv, 'vectors', None)
if _vectors is None:
    _vectors = _wv.syn0
w2v_dict = dict(zip(_vocabulary, _vectors))

In [29]:
class MeanEmbeddingVectorizer(object):
    """Represent each tokenised document by the mean of its word embeddings.

    Parameters
    ----------
    word2vec : mapping
        Maps a word to its embedding vector (all vectors are expected to
        have the same length). May be empty, in which case ``dim`` is 0.

    Attributes
    ----------
    word2vec : mapping
        The mapping passed to the constructor.
    dim : int
        Length of one embedding vector, taken from the first entry of
        ``word2vec``.
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # Use the mapping that was passed in, not a notebook-level global,
            # so the class works with any embedding dictionary.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        """Nothing to learn; present to satisfy the fit/transform protocol.

        ``y`` is optional so that both ``fit(X)`` and ``fit(X, y)`` work.
        Returns ``self``.
        """
        return self

    def transform(self, X):
        """Return an array with one mean embedding vector per document in ``X``.

        ``X`` is an iterable of documents, each an iterable of words. Words
        missing from ``word2vec`` are skipped; a document without any known
        word is represented by a zero vector of length ``dim``.
        """
        return np.array([
            # `or` falls back to a single zero vector when no word was found,
            # which keeps np.mean from being applied to an empty list.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])


In [30]:
# Two-stage pipeline: documents -> mean word2vec embedding -> SVC with default settings.
# Calling fit/predict on the pipeline runs both stages in order.
svm_w2v = Pipeline(
    steps=[
        # Stage 1: average the embeddings of each document's known words
        ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_dict)),
        # Stage 2: support vector classifier
        ("SVM", SVC()),
    ]
)

## Train SVM Model

In [31]:
# Split data into test and train sets
# Where "train" contains 70% of data and "test" - 30%
# random_state=42 fixes the shuffle, so the split is identical on every run.
# NOTE(review): no `stratify=` is passed, so class proportions may differ between the two parts.
X_train, X_test, y_train, y_test = train_test_split(
    columns, category, random_state=42, test_size=0.3, shuffle=True)

In [32]:
# Fit the SVM pipeline on our train data
svm_w2v.fit(X_train, y_train)

Pipeline(steps=[('word2vec vectorizer',
                 <__main__.MeanEmbeddingVectorizer object at 0x7fd05c382970>),
                ('SVM', SVC())])

In [34]:
# SVM prediction based on test data
# NOTE(review): the output recorded below contains only the label 0, i.e. the fitted model
# predicts a single class for every test document. Investigate before trusting the metrics
# (possible causes such as class imbalance or SVC settings cannot be determined from this cell).
prediction = svm_w2v.predict(X_test)
prediction

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Evaluation

In [35]:
# View evaluation metrics table
# `model_evaluation_metrics` comes from the project-local `evaluation_metrics` module.
# Judging by the output below, the third argument is used as the header of the values column.
model_evaluation_metrics(y_test, prediction, 'Type of measure')


Unnamed: 0,Evaluation Metrics,Type of measure
0,Accuracy,0.715302
1,Precission,0.119217
2,Recall,0.166667
3,F1 Score,0.139004
4,Mean Absolute Error,0.608541
5,Mean Squared Error,1.782918
