In [40]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Reusable classes
import sys
sys.path.append("/home/jovyan/work/upload/ml-experiments/sc_wrangling/")

# Data path
import pathlib
FOLDER = pathlib.Path("/home/jovyan/work/upload/")
TABLE_PATH = FOLDER / 'data/pickle/df_columns_labels.pkl'

# compress the trained model
import pickle
SAVED_MODEL_PATH = pathlib.Path('/home/jovyan/work/upload/data/trained_models')

# Manage ML lifecycle
import mlflow
from mlflow import log_params, set_tags, log_metrics, log_artifact
MLFLOW_TRACKING_URI = 'http://srv.meaningfy.ws:8989'

# Data visualisation
import pandas as pd

# Vectorize algotrithm
from gensim.models import Word2Vec

# Random Forest Algorithm
from sklearn.ensemble import RandomForestClassifier as RFC
# :n_estimators: The number of trees in the forest
N_ESTIMATORS = 17
# :criterion: The function to measure the quality of a split
CRITERION = "entropy"
# :min_samples_split:The minimum number of samples
# required to split an internal node
MIN_SAMPLES_SPLIT = 3

# Intermediate steps of the pipeline must be ‘transforms’,
# that is, they must implement fit and transform methods.
from sklearn.pipeline import Pipeline

# Splitting data into train and test
from sklearn.model_selection import train_test_split
RANDOM_STATE = 42
# Where "train" contains 70% of data and "test" - 30%
TEST_SIZE = 0.3
SHUFFLE = True


# Mean embedding function
from mean_vectorizer import MeanEmbeddingVectorizer

# Metrics Evaluation Methods
from evaluation_metrics import model_evaluation_metrics

# Transform DataFrame to dictionary
from dictionary_transformation import series_pair_to_dict


In [29]:
# Read data
df = pd.read_pickle(TABLE_PATH)
df.head()

Unnamed: 0,Concatenated Data (clean),Category,Subcategory,Type of measure,Target groups,One person or microenterprises|Self-employed|Solo-self-employed,SMEs|Sector specific set of companies,One person or microenterprises|SMEs,Employees in standard employment|Other groups of workers,Employees in standard employment|Larger corporations,...,Children (minors)|Disabled|Older citizens|Parents|SMEs|Single parents|Single parents in employment|The COVID-19 risk group|Workers in care facilities|Workers in essential services,Employees in standard employment|Workers in care facilities|Workers in non-standard forms of employment,Companies providing essential services|Workers in essential services,Contractors of a company,Other businesses|Unemployed,Seasonal workers|Workers in non-standard forms of employment,Employees in standard employment|Particular professions,Businesses,Citizens,Workers
0,"[hardship, case, fund, safety, net, selfemploy...",2,12,0,One person or microenterprises|Self-employed|S...,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
1,"[state, support, tourism, access, financeas, t...",0,0,0,SMEs|Sector specific set of companies,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,"[bank, guarantees, smes, oneperson, enterprise...",0,0,0,One person or microenterprises|SMEs,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,"[emergency, measures, relating, shorttime, wor...",6,14,0,Employees in standard employment|Other groups ...,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,1
4,"[airbus, agreement, making, unworked, hours, p...",3,7,2,Employees in standard employment|Larger corpor...,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1


In [30]:
# Independent data
columns = df['Concatenated Data (clean)']
# Label data
subcategory = df['Subcategory']

## word2vec model

In [31]:
# Insert our data and set minimal word count to 10, and size of each word to 300 vectors
model = Word2Vec.load('/home/jovyan/work/Dan/data/pwdb/word2vec/df.model')
w2v_dict = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

In [32]:
# Use pipes to implement steps of fit and transform method
rfc_w2v = Pipeline([
    # Add the words we want to mean
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_dict)),
    ("Random Forest", RFC(n_estimators=N_ESTIMATORS, criterion=CRITERION,
                          min_samples_split=MIN_SAMPLES_SPLIT))])

## Train Random Forest Model

In [33]:
# Split data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(
    columns, subcategory, random_state=RANDOM_STATE, test_size=TEST_SIZE, shuffle=SHUFFLE)

In [34]:
# Fit Random Forest model with out train data
rfc_w2v.fit(X_train, y_train)

Pipeline(steps=[('word2vec vectorizer',
                 <mean_vectorizer.MeanEmbeddingVectorizer object at 0x7f9e019bc070>),
                ('Random Forest',
                 RandomForestClassifier(criterion='entropy',
                                        min_samples_split=3,
                                        n_estimators=17))])

### Save the model into pickle file

In [35]:
with open(SAVED_MODEL_PATH / 'subcategory_random_forest_word2vec.pkl', 'wb') as pickle_model:
    pickle.dump(rfc_w2v, pickle_model)

In [36]:
# Model Evaluation
rfc_w2v.score(X_train, y_train)

1.0

In [37]:
# Random Forest prediction based on test data
prediction = rfc_w2v.predict(X_test)
prediction

array([16, 15,  0, 11, 20, 10, 29, 10, 10,  2,  2, 28, 21, 10, 10,  3, 15,
       26, 20, 29, 14,  0,  0, 16,  9, 10, 10, 21, 28, 27, 16,  2,  2, 21,
       10, 20,  2, 16, 16, 10, 16, 16, 18,  2, 14, 20, 13,  4, 10, 30, 14,
       20, 14, 30, 28, 30,  9, 26,  9, 18, 34,  0,  0, 21,  9,  0,  9, 14,
       20, 10, 20, 15,  0,  0, 27,  9, 10, 14, 24, 20,  9,  4,  0, 10, 10,
        0,  0,  2, 20,  2, 29, 10, 10, 10, 32, 30, 10, 10,  0, 10,  2, 29,
        2, 34, 14, 10, 16, 20,  9,  2,  2, 21,  9, 10, 14,  2, 16, 20, 10,
       30, 29, 14,  9, 10, 22, 21, 30, 16, 20, 20, 10,  0, 16,  9, 10, 24,
        0,  9, 27, 19, 20, 16,  4, 20, 20, 10, 16, 29, 26,  2,  0, 12, 27,
       15, 12, 16,  9, 10, 30, 10, 21, 21,  4,  2, 28,  0,  4, 14, 12,  9,
       28, 27, 30, 28,  3, 10, 16, 10, 27,  0, 26, 26, 14, 10,  0,  0, 10,
        0, 16, 10, 10, 33, 15, 29, 20,  4, 28, 10,  2, 20, 20, 28,  0, 14,
       16, 14, 20, 15, 10, 19, 14, 10, 10, 28,  0, 10, 21, 10, 10,  9, 24,
       29,  2, 30, 30, 10

## Evaluation

In [38]:
# View evaluation metrics table
evaluation = model_evaluation_metrics(y_test, prediction)
evaluation

Unnamed: 0,Metric Labels,Metric values
0,Accuracy,0.281139
1,Precision,0.15633
2,Recall,0.15625
3,F1-Score,0.141699
4,Mean Absolute Error,7.733096
5,Mean Squared Error,122.309609


In [39]:
# Transform columns data to dict to use in MLFlow
transformation = series_pair_to_dict(evaluation, 'Metric Labels', 'Metric values')
transformation

{'Accuracy': 0.28113879003558717,
 'Precision': 0.1563297150332754,
 'Recall': 0.15625009109857596,
 'F1-Score': 0.14169879822053746,
 'Mean Absolute Error': 7.733096085409253,
 'Mean Squared Error': 122.30960854092527}

In [41]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="Subcategory (Random Forest, Word2Vec)")

with mlflow.start_run():

    parameters = {"Language model": 'word2vec',
                  "Random state": RANDOM_STATE,
                  "Test size": TEST_SIZE,
                  "Shuffle": SHUFFLE,
                  "n_estimators": N_ESTIMATORS,
                  "criterion": CRITERION,
                  "min_samples_split": MIN_SAMPLES_SPLIT
                 }
    log_params(parameters)
    log_metrics(transformation)
    log_artifact('/home/jovyan/work/upload/data/trained_models/subcategory_random_forest_word2vec.pkl')
