In [1]:
import json
import os
import pickle
import tempfile

import numpy as np
import pandas as pd

import mlflow
import torch
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from transformers import RobertaModel, AutoModel, PreTrainedTokenizerFast

In [2]:
# --- Experiment configuration ------------------------------------------------
# Metric name handed to the hyper-parameter search as its `scoring` argument.
SCORING = "f1_micro"
RANDOM_STATE = 42

# --- Input files -------------------------------------------------------------
DATA_FOLDER = "data/"

X_TRAIN_FILE_NAME, Y_TRAIN_FILE_NAME = "X_train_augmented.csv", "y_train_augmented.csv"
X_VALID_FILE_NAME, Y_VALID_FILE_NAME = "X_valid.csv", "y_valid.csv"


def _read_indexed_csv(file_name):
    """Read `file_name` from DATA_FOLDER, using its first column as the index."""
    return pd.read_csv(os.path.join(DATA_FOLDER, file_name), index_col=0)


# Feature frames are kept whole; a later cell reads their `text` column.
X_train = _read_indexed_csv(X_TRAIN_FILE_NAME)
X_valid = _read_indexed_csv(X_VALID_FILE_NAME)

# Label files are reduced to their first remaining column, giving a Series.
y_train = _read_indexed_csv(Y_TRAIN_FILE_NAME).iloc[:, 0]
y_valid = _read_indexed_csv(Y_VALID_FILE_NAME).iloc[:, 0]

In [3]:
# Local directory holding the exported checkpoint together with its tokenizer.json.
model_dir = "roberta_base_transformers"

# The fast tokenizer is built directly from the serialized tokenizer file.
tokenizer_path = os.path.join(model_dir, "tokenizer.json")
TOKENIZER = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)

# Loaded through AutoModel; the annotation records that a RobertaModel is expected.
MODEL: RobertaModel = AutoModel.from_pretrained(model_dir)
    
def embedd_text(x):
    """Embed a single text with the module-level TOKENIZER and MODEL.

    Parameters
    ----------
    x : str
        Raw text to encode.

    Returns
    -------
    numpy.ndarray
        The vector found at token position 1 of the first model output
        (presumably the last hidden state - confirm against the model docs)
        for the single-sequence batch.
    """
    # Texts longer than the position-embedding table make the forward pass
    # fail, so the token sequence is truncated. RoBERTa reserves two position
    # slots for its padding offset, hence the "- 2".
    max_length = MODEL.config.max_position_embeddings - 2
    x_tokenized = TOKENIZER.encode(x, truncation=True, max_length=max_length)

    # Pure inference: no autograd graph is needed, which saves memory and time
    # across the thousands of calls made when embedding a whole dataset.
    with torch.no_grad():
        hidden_states = MODEL(torch.tensor([x_tokenized]))[0]

    # NOTE(review): index 1 selects the second token of the sequence, not the
    # leading special token (index 0) and not a mean over tokens. Kept as-is so
    # existing embeddings stay comparable - confirm this is the intended pooling.
    x_encoded = hidden_states[0][1].numpy()

    return x_encoded

Some weights of the model checkpoint at roberta_base_transformers were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta_base_transformers and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be ab

In [4]:
# Turn each split's `text` column into a matrix with one embedding per row.
# NOTE: X_train / X_valid are rebound from DataFrames to arrays here, so this
# cell can only be re-run after the data-loading cell has been run again.
X_train = np.array(list(map(embedd_text, X_train["text"].values)))
X_valid = np.array(list(map(embedd_text, X_valid["text"].values)))

# Displayed as the cell output: (rows, embedding size) for each split.
X_train.shape, X_valid.shape

((10262, 768), (1000, 768))

In [5]:
# Candidate models. Each one standardises the embedding features and then fits
# a classifier. Both classifiers are stochastic, so they are seeded with
# RANDOM_STATE to make the search and the logged metrics reproducible.
pipelines = [
    Pipeline([
        ("scaler", StandardScaler()),
        ("model", LinearSVC(random_state=RANDOM_STATE)),
    ]),
    Pipeline([
        ("scaler", StandardScaler()),
        ("model", MLPClassifier(random_state=RANDOM_STATE)),
    ]),
]

# One grid per pipeline, in the same order as `pipelines` (they are zipped
# together by the search loop).
param_grids = [

    # combinations for the first classifier
    {
        "model__C": [0.01, 0.1, 1, 10],
        # The explicit weight dicts assume the labels are encoded as 0, 1, 2
        # - TODO confirm against the label files.
        "model__class_weight": [None, "balanced", {0:1, 1:5, 2:20}, {0:1, 1:10, 2:30}]
    },

    # combinations for the second classifier
    {
        "model__hidden_layer_sizes": [16, 32, 64, 128, (32, 8)]
    }
]

In [6]:
# Stack training and validation rows so the grid search can be driven by one
# fixed split instead of k-fold cross-validation.
X = np.concatenate([X_train, X_valid])
y = pd.concat([y_train, y_valid])

n = X.shape[0]

# Positional indices into X / y: the leading rows are the training part, the
# trailing X_valid.shape[0] rows are the validation part.
IND_TRAIN = np.arange(0, n-X_valid.shape[0], dtype=int)
INT_VAL = np.arange(n-X_valid.shape[0], n, dtype=int)


def train_valid_split():
    """Yield the single predefined (train indices, validation indices) pair.

    This is a generator, so it is exhausted after one pass; call it again to
    obtain a fresh iterable for every new search.
    """
    yield IND_TRAIN, INT_VAL


def _classification_metrics(y_true, y_pred):
    """Return micro-F1, macro-F1 and accuracy for one set of predictions."""
    return {
        "f1_micro": f1_score(y_true, y_pred, average="micro"),
        "f1_macro": f1_score(y_true, y_pred, average="macro"),
        "accuracy": accuracy_score(y_true, y_pred),
    }


def _log_search_artifacts(optimizer):
    """Attach the refit model, best parameters and search history to the active run.

    Files are staged in a temporary directory so nothing is left behind in the
    working directory when serialisation or logging fails, and parallel runs
    cannot overwrite each other's files. Artifact names are unchanged.
    """
    with tempfile.TemporaryDirectory() as staging_dir:
        model_path = os.path.join(staging_dir, "best_model.pkl")
        with open(model_path, "wb") as f:
            pickle.dump(optimizer.best_estimator_, f)
        mlflow.log_artifact(model_path)

        params_path = os.path.join(staging_dir, "best_params.json")
        with open(params_path, "w") as f:
            json.dump({k:str(v) for k,v in optimizer.best_params_.items()}, f)
        mlflow.log_artifact(params_path)

        history_path = os.path.join(staging_dir, "optimization_history.json")
        with open(history_path, "w") as f:
            json.dump({k:str(v) for k,v in optimizer.cv_results_.items()}, f)
        mlflow.log_artifact(history_path)


# NOTE(review): the experiment name contains typos ("encodind", "aata"). It is
# left unchanged so that runs are not split across two differently named
# experiments; rename deliberately if desired.
mlflow.set_experiment("text_encodind_with_bert_based_models_on_augmented_aata")

for pipeline, param_grid in zip(pipelines, param_grids):
    with mlflow.start_run():
        optimizer = GridSearchCV(pipeline, param_grid,
                                 scoring=SCORING,
                                 cv=train_valid_split(),
                                 n_jobs=-1,
                                 refit=True)
        optimizer.fit(X, y)

        # refit=True makes optimizer.best_estimator_ a model trained on ALL of
        # X (train + validation); that is the model pickled as an artifact.
        # For honest metrics the best configuration is fitted again on the
        # training rows only.
        best_model = pipeline.set_params(**optimizer.best_params_)
        best_model.fit(X_train, y_train)

        # All reported metrics come from this one fitted model, so the
        # validation numbers of a run are mutually consistent and "f1_micro"
        # really is micro-F1 regardless of what SCORING is set to.
        valid_metrics = _classification_metrics(y_valid, best_model.predict(X_valid))
        train_metrics = _classification_metrics(y_train, best_model.predict(X_train))

        mlflow.log_param("pipeline", str(pipeline.steps))
        mlflow.log_param("best_params", str(optimizer.best_params_))

        # Score the search itself optimised (metric chosen by SCORING).
        mlflow.log_metric("best_search_score", optimizer.best_score_)
        for metric_name, metric_value in valid_metrics.items():
            mlflow.log_metric(metric_name, metric_value)
        for metric_name, metric_value in train_metrics.items():
            mlflow.log_metric(f"{metric_name}_train", metric_value)

        _log_search_artifacts(optimizer)

INFO: 'text_encodind_with_bert_based_models_on_augmented_aata' does not exist. Creating a new experiment


