In [1]:
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below -- consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Extend the import path with a project directory.
# Presumably this is where mean_vectorizer, evaluation_metrics and
# dictionary_transformation (imported at the end of this cell) live -- TODO confirm.
import sys
sys.path.append("/home/jovyan/work/upload/ml-experiments/sc_wrangling")

# Data locations: the pickled table and the Law2Vec embedding file.
# NOTE(review): absolute paths tie the notebook to one machine's layout.
import pathlib
FOLDER = pathlib.Path("/home/jovyan/work/upload/")
TABLE_PATH = FOLDER / 'data/pickle/df_columns_labels.pkl'
LAW2VEC_PATH = FOLDER / 'data/law2vec/Law2Vec.200d.txt'

# Manage the ML lifecycle (experiment tracking with MLflow)
import mlflow
from mlflow import log_params, set_tags, log_metrics
MLFLOW_TRACKING_URI = 'http://srv.meaningfy.ws:8989'

# Numerical computing
import numpy as np

# Tabular data handling
import pandas as pd

# Word-embedding (vectorization) models
from gensim.models import Word2Vec, KeyedVectors

# Support Vector Classifier algorithm
from sklearn.svm import SVC
# :gamma: is a parameter for non-linear hyperplanes. The higher the gamma
#         value, the more closely the model tries to fit the training data set.
GAMMA = 1
# :C: is the penalty parameter of the error term. It controls the trade-off
#     between a smooth decision boundary and classifying the training points correctly.
#     !!! Increasing C may lead to overfitting the training data. !!!
C = 5

# Intermediate steps of a pipeline must be 'transforms',
# that is, they must implement fit and transform methods.
from sklearn.pipeline import Pipeline

# Splitting data into train and test sets
from sklearn.model_selection import train_test_split
RANDOM_STATE = 42
# "train" receives 70% of the data and "test" the remaining 30%
TEST_SIZE = 0.3
SHUFFLE = True

# Mean embedding vectorizer (project module)
from mean_vectorizer import MeanEmbeddingVectorizer

# Evaluation metrics helper (project module)
from evaluation_metrics import model_evaluation_metrics

# Helper that turns a pair of DataFrame columns into a dictionary (project module)
from dictionary_transformation import series_pair_to_dict

In [2]:
# Load the pickled DataFrame from TABLE_PATH.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(TABLE_PATH)
# Independent data ('Concatenated Data (clean)') and label data ('Category').
columns, category = df['Concatenated Data (clean)'], df['Category']

## Law2Vec model

In [3]:
# Load the Law2Vec embeddings, stored in the word2vec C *text* format.
model = KeyedVectors.load_word2vec_format(LAW2VEC_PATH, binary=False)
# Map every vocabulary word to its embedding vector. `model` already is a
# KeyedVectors instance, so the deprecated `.wv` hop is dropped, and `vectors`
# replaces the deprecated `syn0` alias (both were removed in Gensim 4).
# The word list is named `index_to_key` in Gensim >= 4 and `index2word` in
# Gensim 3.x; use whichever this installation provides.
l2v_words = model.index_to_key if hasattr(model, "index_to_key") else model.index2word
l2v_dict = dict(zip(l2v_words, model.vectors))


In [4]:
# Chain the two stages so that a single fit/predict call runs both of them.
svm_l2v = Pipeline(
    steps=[
        # Vectorizer built from the Law2Vec word -> vector dictionary
        # (presumably averages the vectors of each text's words -- TODO confirm).
        ("law2vec vectorizer", MeanEmbeddingVectorizer(l2v_dict)),
        # Support Vector Classifier configured with the constants set above.
        ("SVM", SVC(C=C, gamma=GAMMA)),
    ]
)

## Train SVM Model

In [5]:
# Partition features and labels into train and test subsets;
# with TEST_SIZE = 0.3, "train" keeps 70% of the data and "test" gets 30%.
X_train, X_test, y_train, y_test = train_test_split(
    columns,
    category,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=SHUFFLE,
)


In [6]:
# Fit the SVM pipeline on our training data
svm_l2v.fit(X_train, y_train)

Pipeline(steps=[('law2vec vectorizer',
                 <mean_vectorizer.MeanEmbeddingVectorizer object at 0x7f9a70362370>),
                ('SVM', SVC(C=5, gamma=1))])

In [7]:
# Score the fitted pipeline on the *training* data it was fitted on
# (this is not a held-out estimate; the test-set metrics are computed further down).
svm_l2v.score(X_train, y_train)

0.9740061162079511

In [8]:
# Predict a category for every sample of the test set
prediction = svm_l2v.predict(X_test)
# Display the predicted labels
prediction

array([0, 8, 0, 4, 3, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 4, 7, 3, 0, 0, 6, 0,
       0, 0, 2, 7, 0, 4, 2, 3, 7, 6, 6, 0, 0, 3, 4, 7, 4, 0, 5, 7, 5, 5,
       6, 4, 3, 3, 7, 3, 5, 5, 6, 5, 2, 6, 0, 6, 2, 0, 4, 4, 0, 4, 0, 0,
       7, 6, 3, 0, 4, 0, 0, 0, 3, 0, 2, 6, 2, 3, 7, 2, 0, 0, 0, 0, 0, 0,
       6, 0, 4, 0, 0, 6, 3, 4, 0, 0, 0, 0, 4, 0, 6, 4, 6, 0, 0, 3, 0, 4,
       4, 0, 0, 0, 6, 2, 4, 3, 0, 0, 0, 6, 0, 2, 5, 5, 6, 0, 6, 3, 0, 0,
       0, 0, 0, 4, 0, 0, 4, 0, 5, 3, 5, 3, 4, 7, 7, 0, 2, 3, 0, 2, 3, 2,
       2, 4, 0, 0, 5, 0, 0, 3, 6, 6, 2, 0, 3, 0, 2, 0, 4, 5, 3, 2, 3, 0,
       0, 0, 6, 0, 0, 5, 6, 0, 0, 0, 0, 9, 4, 0, 0, 5, 2, 7, 5, 5, 2, 0,
       8, 3, 3, 2, 0, 6, 7, 6, 5, 2, 0, 5, 6, 0, 0, 3, 0, 0, 0, 7, 2, 0,
       0, 7, 3, 3, 3, 0, 4, 4, 6, 6, 2, 0, 5, 0, 6, 9, 0, 5, 3, 0, 0, 2,
       5, 3, 0, 0, 2, 0, 2, 0, 0, 6, 3, 6, 5, 5, 3, 8, 4, 4, 2, 4, 4, 3,
       3, 0, 7, 0, 0, 6, 7, 2, 3, 5, 2, 2, 0, 2, 0, 0, 4])

## Evaluation

In [10]:
# Compare the true test labels with the predictions using the project helper;
# the result is displayed as a table with 'Metric Labels' and 'Metric values' columns.
# NOTE(review): Mean Absolute/Squared Error treat the category codes as numbers --
# confirm these two metrics are meaningful for this label set.
evaluation = model_evaluation_metrics(y_test, prediction)
evaluation

Unnamed: 0,Metric Labels,Metric values
0,Accuracy,0.683274
1,Precision,0.705795
2,Recall,0.572693
3,F1-Score,0.598628
4,Mean Absolute Error,1.220641
5,Mean Squared Error,5.960854


In [11]:
# Convert the 'Metric Labels' / 'Metric values' columns into a
# {label: value} dictionary so the metrics can be logged to MLflow
transformation = series_pair_to_dict(evaluation, 'Metric Labels', 'Metric values')
transformation

{'Accuracy': 0.6832740213523132,
 'Precision': 0.7057946216203097,
 'Recall': 0.5726933026721079,
 'F1-Score': 0.5986282879345141,
 'Mean Absolute Error': 1.2206405693950177,
 'Mean Squared Error': 5.960854092526691}

In [12]:
# Point the MLflow client at the tracking server and select the experiment.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="Category (Law2Vec)")

# Record this experiment as one MLflow run.
with mlflow.start_run():
    # Settings that produced the model: embedding source, split options
    # and SVC hyper-parameters.
    parameters = {
        "Language model": 'law2vec',
        "Random state": RANDOM_STATE,
        "Test size": TEST_SIZE,
        "Shuffle": SHUFFLE,
        "Gamma": GAMMA,
        "C": C,
    }
    log_params(parameters)

    # Evaluation results computed on the test set.
    log_metrics(transformation)