In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np


import os
import os.path as osp
import pickle as pkl

import sys
sys.path.append("../")

from main import plot_cm
from sklearn.metrics import classification_report, confusion_matrix

import mlflow

# Point MLflow at the "../mlruns/" tracking location (a relative path —
# presumably resolved against the notebook's working directory; confirm).
mlflow.set_tracking_uri("../mlruns/")
# Make "AVE" the active experiment for the run started below.
mlflow.set_experiment("AVE")

In [None]:
# Re-running this cell in the same kernel (e.g. after a later cell failed
# before reaching mlflow.end_run()) would otherwise raise, because
# mlflow.start_run() refuses to start while another run is still active.
# Close any such leftover run first and mark it as aborted.
if mlflow.active_run() is not None:
    mlflow.end_run(status="KILLED")
mlflow.start_run()

In [None]:
def load_set(set_type="train", encoder_type="bert"):
    """Load mean-pooled node features and labels for one dataset split.

    Every sub-directory of ``../data/gold/{set_type}/raw`` whose name is
    numeric is treated as one sample. For each sample, the array stored in
    ``features_{encoder_type}.npy`` is averaged over axis 0 and the object
    pickled in ``label.pkl`` is used as its label.

    Args:
        set_type: Name of the split directory to read (the notebook uses
            "train", "valid" and "test").
        encoder_type: Selects which feature file is loaded per sample
            ("bert" or "w2v").

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays with one entry per
        sample directory, ordered by ascending directory name.

    Raises:
        FileNotFoundError: If the split directory or one of the per-sample
            files does not exist.
    """
    raw_dir = f"../data/gold/{set_type}/raw"

    # os.listdir() returns entries in arbitrary order; sort so the sample
    # order is reproducible across machines. The (length, name) key yields
    # numeric order for names without leading zeros and never has to parse
    # the name as an int.
    sample_names = sorted(
        (name for name in os.listdir(raw_dir) if name.isnumeric()),
        key=lambda name: (len(name), name),
    )

    features = []
    labels = []

    for name in sample_names:
        sample_dir = osp.join(raw_dir, name)

        # Collapse the per-node feature array to a single vector per sample.
        node_features = np.load(osp.join(sample_dir, f"features_{encoder_type}.npy"))
        features.append(node_features.mean(axis=0))

        # NOTE: unpickling can execute arbitrary code; only load label files
        # from a trusted source. The context manager closes the file handle
        # deterministically instead of leaking it.
        with open(osp.join(sample_dir, "label.pkl"), "rb") as label_file:
            labels.append(pkl.load(label_file))

    return np.array(features), np.array(labels)

# Which pre-computed node features to use: "bert" or "w2v".
encoder_type = "w2v"

# Load each split with the same encoder so the feature spaces match.
features_train, target_train = load_set(set_type="train", encoder_type=encoder_type)
features_valid, target_valid = load_set(set_type="valid", encoder_type=encoder_type)
features_test, target_test = load_set(set_type="test", encoder_type=encoder_type)

# Record the encoder choice on the active MLflow run.
mlflow.log_param(key="encoder_type", value=encoder_type)

In [None]:
# L2-penalised logistic regression fitted on the training split only;
# fit() returns the estimator itself, so the calls can be chained.
model = LogisticRegression(penalty="l2", C=1.0, max_iter=3000).fit(
    features_train, target_train
)

# Log the estimator's full parameter set to the active MLflow run.
mlflow.log_params(params=model.get_params())

In [None]:
from pandas import json_normalize

def eval_set(features, target, set_type, estimator=None):
    """Evaluate a fitted classifier on one dataset split.

    Args:
        features: Feature matrix handed to the classifier's ``predict``.
        target: Ground-truth labels for ``features``.
        set_type: Prefix added to every metric name, e.g. "train" turns
            "accuracy" into "train.accuracy".
        estimator: Fitted classifier to evaluate. When omitted, the
            notebook-level ``model`` is used, which keeps existing
            three-argument calls working.

    Returns:
        A ``(cm, report)`` tuple: the confusion matrix normalised over the
        true labels (``normalize='true'``), and a flat dict mapping
        ``"{set_type}.<metric path>"`` to the corresponding value of the
        classification report.
    """
    # Resolve the global lazily so an explicit estimator never needs it.
    clf = model if estimator is None else estimator

    pred = clf.predict(features)
    cm = confusion_matrix(target, pred, normalize='true')

    # The report is a nested dict; json_normalize flattens it into a
    # one-row frame whose column names join the nested keys with ".".
    report = classification_report(target, pred, output_dict=True)
    report = json_normalize(report)
    report.columns = [f"{set_type}.{c}" for c in report.columns]

    # Single row -> plain {metric_name: value} dict.
    report = report.iloc[0].to_dict()
    return cm, report

# Score the fitted model on every split; metric names carry the split prefix.
cm_train, report_train = eval_set(features=features_train, target=target_train, set_type="train")
cm_valid, report_valid = eval_set(features=features_valid, target=target_valid, set_type="valid")
cm_test, report_test = eval_set(features=features_test, target=target_test, set_type="test")

# Send the flattened per-split metrics to the active MLflow run.
mlflow.log_metrics(metrics=report_train)
mlflow.log_metrics(metrics=report_valid)
mlflow.log_metrics(metrics=report_test)

# Plot the three confusion matrices with the project helper and attach the
# value it returns (presumably the saved figure's path) as a run artifact.
plot_path = plot_cm(
    [[cm_train, "Train"], [cm_valid, "Valid"], [cm_test, "Test"]],
    epoch=0,
    root="..",
)

mlflow.log_artifact(local_path=plot_path)

In [None]:
# Close the MLflow run that was opened near the top of this notebook.
mlflow.end_run()