In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd

import sys
sys.path.append("../")

from main import plot_cm
from sklearn.metrics import classification_report, confusion_matrix

import mlflow

# MLflow configuration, kept as named constants so it is easy to spot and tune.
# The tracking URI is a path relative to the notebook's working directory.
TRACKING_URI = "../mlruns/"
EXPERIMENT_NAME = "BOW"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
# Make this cell safe to re-run: if a previous execution of the notebook
# stopped before the final `mlflow.end_run()` cell, that run is still active
# and a plain `mlflow.start_run()` would raise "Run ... is already active".
# Close any such leftover run before opening a fresh one.
if mlflow.active_run() is not None:
    mlflow.end_run()

mlflow.start_run()

In [None]:
# Tab-separated splits, listed in the order they are unpacked below.
split_paths = (
    "../data/silver/train.tsv",
    "../data/silver/valid.tsv",
    "../data/silver/test.tsv",
)
train, valid, test = (pd.read_csv(path, sep="\t") for path in split_paths)

# Quick visual sanity check of the training frame.
train.head()

In [None]:
# Bag-of-words featurizer: English stop words removed, 1- to 3-grams,
# and terms must occur in at least 2 documents (min_df=2).
vectorizer_kwargs = {"stop_words": "english", "ngram_range": (1, 3), "min_df": 2}
vectorizer = CountVectorizer(**vectorizer_kwargs)
mlflow.log_params(vectorizer.get_params())

# The vocabulary is fitted on the training split only; the validation and
# test splits are projected onto that same vocabulary.
features_train = vectorizer.fit_transform(train.text)
features_valid, features_test = (
    vectorizer.transform(split.text) for split in (valid, test)
)

target_train, target_valid, target_test = (
    split.label for split in (train, valid, test)
)

In [None]:
# L2-penalised logistic regression; hyperparameters are gathered in one dict
# so they are easy to read and adjust.
logreg_kwargs = dict(penalty="l2", C=0.3, max_iter=3000, n_jobs=-1)
model = LogisticRegression(**logreg_kwargs)
model.fit(features_train, target_train)

# Log the estimator's full parameter set as returned by get_params().
mlflow.log_params(model.get_params())

In [None]:
from pandas import json_normalize

def eval_set(features, target, set_type, estimator=None):
    """Score one data split and return its confusion matrix and flat metrics.

    Parameters
    ----------
    features
        Feature matrix passed to ``estimator.predict``.
    target
        True labels for ``features``.
    set_type : str
        Name of the split (e.g. ``"train"``); used as a ``"<set_type>."``
        prefix on every metric key so different splits do not collide.
    estimator : optional
        Fitted object exposing ``predict``. When omitted, the notebook-level
        ``model`` is used, which keeps existing three-argument calls working
        while letting callers evaluate any other fitted estimator explicitly.

    Returns
    -------
    cm
        Confusion matrix from ``confusion_matrix(..., normalize='true')``,
        i.e. normalised over the true labels.
    report : dict
        The ``classification_report`` dictionary flattened to one level, with
        nested keys joined by ``"."`` and prefixed with ``set_type``.
    """
    if estimator is None:
        # Backward-compatible default: fall back to the notebook's global model.
        estimator = model

    pred = estimator.predict(features)
    cm = confusion_matrix(target, pred, normalize='true')

    # classification_report returns a nested dict; json_normalize flattens it
    # into a single-row frame whose column names are the dotted key paths.
    flat = json_normalize(classification_report(target, pred, output_dict=True))
    flat.columns = [f"{set_type}.{c}" for c in flat.columns]

    return cm, flat.iloc[0].to_dict()

# Evaluate the fitted model on each split.
cm_train, report_train = eval_set(features_train, target_train, set_type="train")
cm_valid, report_valid = eval_set(features_valid, target_valid, set_type="valid")
cm_test, report_test = eval_set(features_test, target_test, set_type="test")

# Metric keys already carry their split prefix, so each report is logged as is.
for split_report in (report_train, report_valid, report_test):
    mlflow.log_metrics(split_report)

# One confusion-matrix panel per split; plot_cm returns the path that is
# then attached to the run as an artifact.
cm_panels = [
    [cm_train, "Train"],
    [cm_valid, "Valid"],
    [cm_test, "Test"],
]
plot_path = plot_cm(cm_panels, epoch=0, root="../reports/figures")

mlflow.log_artifact(plot_path)

In [None]:
# Close the MLflow run opened by `mlflow.start_run()` earlier in this notebook,
# so params, metrics and the figure logged above belong to a finished run.
mlflow.end_run()