In [6]:
import functools
import json
import os
import pickle
import tempfile

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler, PowerTransformer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import TruncatedSVD

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

import mlflow

In [7]:
# Metric name handed to GridSearchCV as its `scoring` argument.
SCORING = "f1_micro"

# Seed value kept in one place for any randomised step.
RANDOM_STATE = 42

# Directory holding the prepared CSV splits, followed by the file names
# of the feature (X) and label (y) tables for each split.
DATA_FOLDER = "data/"
X_TRAIN_FILE_NAME, Y_TRAIN_FILE_NAME = "X_train_augmented.csv", "y_train_augmented.csv"
X_VALID_FILE_NAME, Y_VALID_FILE_NAME = "X_valid.csv", "y_valid.csv"

In [8]:
# Feature tables: the first CSV column is used as the index.
X_train, X_valid = (
    pd.read_csv(os.path.join(DATA_FOLDER, file_name), index_col=0)
    for file_name in (X_TRAIN_FILE_NAME, X_VALID_FILE_NAME)
)

# Label tables: same layout; only the first remaining column is kept, as a Series.
y_train, y_valid = (
    pd.read_csv(os.path.join(DATA_FOLDER, file_name), index_col=0).iloc[:, 0]
    for file_name in (Y_TRAIN_FILE_NAME, Y_VALID_FILE_NAME)
)

In [9]:
# Sanity check: the number of feature rows should equal the number of labels.
X_train.shape, y_train.shape

((10262, 2), (10262,))

In [10]:
# Stack the training rows on top of the validation rows (features and labels
# in the same order) so that a single positional split can separate them again.
X_full, y = (
    pd.concat(list(parts))
    for parts in ((X_train, X_valid), (y_train, y_valid))
)

X_full.shape, y.shape

((11262, 2), (11262,))

In [11]:
# Candidate pipelines. Each one is tuned with the grid found at the same
# position in `param_grids` (the two lists are zipped together by the search loop).
pipelines = [
    Pipeline([
        ("vectorizer", CountVectorizer()),
        ("model", MultinomialNB())
    ]),
    Pipeline([
        ("vectorizer", CountVectorizer()),
        # The features are deliberately not standardised: the count matrix is
        # nearly binary, and the tf-idf matrix is also mostly zeros with
        # otherwise rather small values.
        # random_state pins the solver's pseudo-random data shuffling so that
        # repeated runs of the search give the same results.
        ("model", LinearSVC(max_iter=1500, random_state=RANDOM_STATE))
    ])
]

param_grids = [

    # combinations for the first model
    [
        # set 1
        {
            "vectorizer": [CountVectorizer()],
            "vectorizer__min_df": [1, 5, 15],
            "vectorizer__max_df": [1.0, 0.1, 0.05],
            # Keep or drop the "@anonymized_account" placeholder.
            # Stop words are compared with tokens AFTER tokenisation, and both
            # token patterns below strip the leading "@", so the stop word has
            # to be spelled without it; "@anonymized_account" would never match.
            "vectorizer__stop_words": [None, ["anonymized_account"]],
            # Either only tokens of 2+ characters (first pattern) or
            # single-character tokens as well (second pattern).
            "vectorizer__token_pattern": ['(?u)\\b\\w\\w+\\b', '(?u)\\b\\w+\\b']
        },

        # set 2
        {
            "vectorizer": [CountVectorizer()],
            # count CHARACTER frequencies instead of word frequencies
            "vectorizer__analyzer": ['char']
        },
        # set 3
        {
            "vectorizer": [CountVectorizer()],
            # single characters and character bigrams, built within word boundaries
            "vectorizer__analyzer": ['char_wb'],
            "vectorizer__ngram_range": [(1, 2)]
        }
    ],

    # combinations for the second model
    {
        "vectorizer": [CountVectorizer(), TfidfVectorizer()],
        "vectorizer__min_df": [1, 5, 15],
        "vectorizer__max_df": [1.0, 0.1, 0.05],
        "model__C": [0.01, 0.1, 1, 10],
        "model__class_weight": [None, "balanced", {0: 1, 1: 5, 2: 20}]
    }
]

In [12]:
# Because of the high complexity of the SVM classifier, the grid search is not
# cross-validated; it is scored on one fixed train/validation split instead.
# `train_valid_split` provides that split as a fold iterator for GridSearchCV.

n = X_full.shape[0]

# Row positions inside X_full: the leading rows come from X_train and the
# trailing X_valid.shape[0] rows come from X_valid.
IND_TRAIN, INT_VAL = np.split(np.arange(n, dtype=int), [n - X_valid.shape[0]])


def train_valid_split():
    """Yield the single (train positions, validation positions) fold."""
    yield (IND_TRAIN, INT_VAL)

In [13]:
mlflow.set_experiment("classic_ML_on_augmented_data")

# One MLflow run per (text column, pipeline) pair. Each run grid-searches the
# pipeline on the fixed train/validation split, re-fits the winning
# configuration on the training rows only, and logs parameters, metrics and
# artifacts.
for text_variant in ["text", "text_lemmatized"]:

    X = X_full[text_variant]
    X_tr = X_train[text_variant]
    X_val = X_valid[text_variant]

    for pipeline, param_grid in zip(pipelines, param_grids):
        with mlflow.start_run():
            optimizer = GridSearchCV(pipeline, param_grid,
                                     scoring=SCORING,
                                     cv=train_valid_split(),
                                     n_jobs=-1,
                                     refit=True)
            optimizer.fit(X, y)

            # Build the best configuration on independent copies: cloning the
            # pipeline keeps the shared object in `pipelines` untouched, and
            # cloning the parameters keeps the estimator instances stored in
            # `param_grids` from being re-configured and fitted in place.
            best_model = clone(pipeline).set_params(
                **clone(optimizer.best_params_, safe=False)
            )
            best_model.fit(X_tr, y_train)

            # Additional metrics are computed only for the best parameter set.
            y_val_pred = best_model.predict(X_val)
            y_fitted = best_model.predict(X_tr)

            f1_macro_score = f1_score(y_valid, y_val_pred, average="macro")
            acc_score = accuracy_score(y_valid, y_val_pred)

            f1_micro_score_train = f1_score(y_train, y_fitted, average="micro")
            f1_macro_score_train = f1_score(y_train, y_fitted, average="macro")
            acc_score_train = accuracy_score(y_train, y_fitted)

            mlflow.log_param("pipeline", str(best_model.steps))
            mlflow.log_param("text_variant", text_variant)
            mlflow.log_param("best_params", str(optimizer.best_params_))

            # best_score_ is the SCORING metric measured on the validation fold.
            mlflow.log_metric("f1_micro", optimizer.best_score_)
            mlflow.log_metric("f1_macro", f1_macro_score)
            mlflow.log_metric("accuracy", acc_score)
            mlflow.log_metric("f1_micro_train", f1_micro_score_train)
            mlflow.log_metric("f1_macro_train", f1_macro_score_train)
            mlflow.log_metric("accuracy_train", acc_score_train)

            # Artifacts are staged in a temporary directory, so nothing is left
            # behind (or overwritten) in the working directory, even when
            # logging fails part-way through.
            with tempfile.TemporaryDirectory() as artifact_dir:
                # NOTE: with refit=True, best_estimator_ was re-fitted by
                # GridSearchCV on everything passed to fit(), i.e. train AND
                # validation rows; it is not the train-only `best_model` that
                # the metrics above were computed from.
                model_path = os.path.join(artifact_dir, "best_model.pkl")
                with open(model_path, "wb") as f:
                    pickle.dump(optimizer.best_estimator_, f)
                mlflow.log_artifact(model_path)

                params_path = os.path.join(artifact_dir, "best_params.json")
                with open(params_path, "w") as f:
                    json.dump({k: str(v) for k, v in optimizer.best_params_.items()}, f)
                mlflow.log_artifact(params_path)

                history_path = os.path.join(artifact_dir, "optimization_history.json")
                with open(history_path, "w") as f:
                    json.dump({k: str(v) for k, v in optimizer.cv_results_.items()}, f)
                mlflow.log_artifact(history_path)

INFO: 'classic_ML_on_augmented_data' does not exist. Creating a new experiment
