In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Reusable classes
import sys
sys.path.append("/home/jovyan/work/upload/ml-experiments/sc_wrangling/")

# Data path
import pathlib
FOLDER = pathlib.Path("/home/jovyan/work/upload/")
TABLE_PATH = FOLDER / 'data/pickle/df_columns_labels.pkl'

# Manage ML lifecycle
import mlflow
from mlflow import log_params, set_tags, log_metrics
MLFLOW_TRACKING_URI = 'http://srv.meaningfy.ws:8989'

# Math stuff
import numpy as np

# Data visualisation
import pandas as pd

# Vectorize algotrithm
from gensim.models import Word2Vec

from sklearn.multioutput import MultiOutputClassifier
N_JOBS = -1

# Support Vector Classifier Algorithm
from sklearn.svm import LinearSVC, SVC

# Intermediate steps of the pipeline must be ‘transforms’,
# that is, they must implement fit and transform methods.
from sklearn.pipeline import Pipeline

# Splitting data into train and test
from sklearn.model_selection import train_test_split
RANDOM_STATE = 42
# Where "train" contains 70% of data and "test" - 30%
TEST_SIZE = 0.3
SHUFFLE = True

# Mean embedding function
from mean_vectorizer import MeanEmbeddingVectorizer

# Metrics Evaluation Methods
from evaluation_metrics import model_evaluation_metrics

# Transform DataFrame to dictionary
from dictionary_transformation import series_pair_to_dict

In [2]:
# Read data
df = pd.read_pickle(TABLE_PATH)

In [3]:
# Independent data
columns = df['Concatenated Data (clean)']
# Label data
target_groups_classes = df[['Businesses', 'Citizens', 'Workers']]

## word2vec model

In [4]:
# Insert our data and set minimal word count to 10, and size of each word to 300 vectors
model = Word2Vec.load('/home/jovyan/work/Dan/data/pwdb/word2vec/df.model')
w2v_dict = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}

In [5]:
# Use pipes to implement steps of fit and transform method
svc_w2v = Pipeline([
    # Add the words we want to mean
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v_dict)),
    # Use LinearSVC algorithm in Multi-output classifier
    ("Multi-label classifier", MultiOutputClassifier(
        LinearSVC(), n_jobs=N_JOBS))])

## Train LinearSVC Model

In [6]:
# Split data into test and train sets
X_train, X_test, y_train, y_test = train_test_split(
    columns, target_groups_classes, random_state=RANDOM_STATE, test_size=TEST_SIZE, shuffle=SHUFFLE)

In [7]:
# Fit SVM model with out train data
svc_w2v.fit(X_train, y_train)

Pipeline(steps=[('word2vec vectorizer',
                 <mean_vectorizer.MeanEmbeddingVectorizer object at 0x7f18d2dbae80>),
                ('Multi-label classifier',
                 MultiOutputClassifier(estimator=LinearSVC(), n_jobs=-1))])

In [8]:
svc_w2v.score(X_train, y_train)

0.5397553516819572

In [9]:
# SVM prediction based on test data
prediction = svc_w2v.predict(X_test)

In [10]:
evaluation = model_evaluation_metrics(y_test, prediction, 'Target groups')
evaluation

Unnamed: 0,Evaluation Metrics,Target groups
0,Accuracy,0.491103
1,Precission,0.681239
2,Recall,0.544825
3,F1 Score,0.582826
4,Mean Absolute Error,0.284698
5,Mean Squared Error,0.284698


In [11]:
transformation = series_pair_to_dict(evaluation, 'Evaluation Metrics', 'Target groups')
transformation

{'Accuracy': 0.49110320284697506,
 'Precission': 0.6812386156648452,
 'Recall': 0.5448249743007088,
 'F1 Score': 0.5828259172521467,
 'Mean Absolute Error': 0.2846975088967972,
 'Mean Squared Error': 0.2846975088967972}

In [16]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="Target group classes (SVM, Word2Vec)")

with mlflow.start_run():

    parameters = {"Language model": 'word2vec',
                  "Random state": RANDOM_STATE,
                  "Test size": TEST_SIZE,
                  "Shuffle": SHUFFLE,
                  "n_jobs": N_JOBS}
    log_params(parameters)

    log_metrics(transformation)

INFO: 'Target group classes (SVM, Word2Vec)' does not exist. Creating a new experiment
