# Install the neptune-notebooks widget
[Read the docs](https://docs.neptune.ai/integrations-and-supported-tools/ide-and-notebooks/jupyter-lab-and-jupyter-notebook)

# Initialize a neptune project
[Read the docs](https://docs.neptune.ai/you-should-know/core-concepts#project)

In [None]:
import neptune.new as neptune

In [None]:
WORKSPACE_NAME = "showcase"
PROJECT_NAME = "project-text-classification"

In [None]:
project = neptune.init_project(name=f"{WORKSPACE_NAME}/{PROJECT_NAME}")

# Log project level metadata

## Version and track datasets
[Read the docs](https://docs.neptune.ai/how-to-guides/data-versioning)

In [None]:
DATASET_PATH = "../../data"

In [None]:
project["data/files"].track_files(f"{DATASET_PATH}")

In [None]:
import csv

import pandas as pd

In [None]:
df_raw = pd.read_csv(f"{DATASET_PATH}/raw/legal_text_classification.csv")
df_raw.dropna(subset=["case_text"], inplace=True)
df_raw

In [None]:
df_raw.isna().sum()

In [None]:
sum(df_raw.case_text.duplicated())

In [None]:
df_raw.drop_duplicates(subset="case_text", inplace=True)
sum(df_raw.case_text.duplicated())

## Log dataset sample
[Read the docs](https://docs.neptune.ai/you-should-know/what-can-you-log-and-display#files)

In [None]:
from io import StringIO

from neptune.new.types import File

In [None]:
csv_buffer = StringIO()
df_raw.sample(100).to_csv(csv_buffer, index=False)
project["data/sample"].upload(File.from_stream(csv_buffer, extension="csv"))

## Log metadata plots
[Read the docs](https://docs.neptune.ai/you-should-know/what-can-you-log-and-display#images)

In [None]:
fig = df_raw.case_outcome.value_counts().plot(kind="bar")
fig

In [None]:
project["data/distribution"].upload(fig.figure)

# Initialize a new neptune run for baseline model
[Read the docs](https://docs.neptune.ai/you-should-know/core-concepts#run)

In [None]:
run = neptune.init_run(
    project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
    name="text classification using fasttext",
    description="training on raw data",
    tags=["fasttext", "raw"],
)

In [None]:
# Build the frame that is exported for fastText: a `label` column holding
# "__label__<case_outcome>" followed by the `case_text` column.
# `assign` returns a new DataFrame, so the label column is never written onto a
# slice of `df_raw` (which is what triggers pandas' SettingWithCopyWarning).
df_fasttext_raw = df_raw.assign(label="__label__" + df_raw["case_outcome"])[["label", "case_text"]]
df_fasttext_raw

In [None]:
df_fasttext_raw.to_csv(
    f"{DATASET_PATH}/fasttext/raw.txt",
    sep=" ",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    quotechar="",
    escapechar=" ",
)

In [None]:
! head -5 $DATASET_PATH"/fasttext/raw.txt"

## Track run-specific files
[Read the docs](https://docs.neptune.ai/how-to-guides/data-versioning/compare-datasets#step-2-add-tracking-of-the-dataset-version)

In [None]:
csv_buffer = StringIO()

df_fasttext_raw.sample(100).to_csv(csv_buffer, index=False)
run["data/sample"].upload(File.from_stream(csv_buffer, extension="csv"))

In [None]:
def train_test_valid_split(X: pd.DataFrame, y: list, random_state=None) -> tuple:
    """Split `X` and `y` into train, test and validation sets, stratified on `y`.

    The first split holds out 40 % of the samples; that hold-out is then split
    in half, giving a 60 / 20 / 20 train / test / validation partition.
    The size of every resulting split is printed.

    Args:
        X: Samples to split. Must expose `.shape` (e.g. a DataFrame or Series).
        y: Labels aligned with `X`; used as the stratification target.
        random_state: Optional seed forwarded to both `train_test_split` calls
            so the partition can be reproduced. The default (`None`) keeps the
            original, unseeded behaviour.

    Returns:
        The tuple `(X_train, y_train, X_test, y_test, X_valid, y_valid)`.
    """

    from sklearn.model_selection import train_test_split

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, stratify=y, test_size=0.4, random_state=random_state
    )
    X_test, X_valid, y_test, y_valid = train_test_split(
        X_holdout, y_holdout, stratify=y_holdout, test_size=0.5, random_state=random_state
    )

    # Report the size of each split (features first, then labels)
    for features, labels in ((X_train, y_train), (X_test, y_test), (X_valid, y_valid)):
        print(features.shape)
        print(len(labels))

    return X_train, y_train, X_test, y_test, X_valid, y_valid

In [None]:
X = df_fasttext_raw["case_text"]
y = df_fasttext_raw["label"]

In [None]:
X_train, y_train, X_test, y_test, X_valid, y_valid = train_test_valid_split(X, y)

In [None]:
# Put labels and texts side by side. A column-wise concat avoids building a
# 2 x N frame (one column per sample) only to transpose it afterwards.
df_train = pd.concat([y_train, X_train], axis=1)
df_train

In [None]:
# Same layout as the training frame: label column first, text column second.
# A column-wise concat avoids the wide intermediate frame and the transpose.
df_test = pd.concat([y_test, X_test], axis=1)
df_valid = pd.concat([y_valid, X_valid], axis=1)

In [None]:
# Export the three splits with identical options: space-separated, no header,
# no index, quoting disabled and a space as the escape character.
fasttext_export_options = {
    "sep": " ",
    "header": False,
    "index": False,
    "quoting": csv.QUOTE_NONE,
    "quotechar": "",
    "escapechar": " ",
}
split_targets = [
    (df_train, f"{DATASET_PATH}/raw/train.txt"),
    (df_test, f"{DATASET_PATH}/raw/test.txt"),
    (df_valid, f"{DATASET_PATH}/raw/valid.txt"),
]
for split_frame, target_path in split_targets:
    split_frame.to_csv(target_path, **fasttext_export_options)

In [None]:
run["data/files"].track_files(f"{DATASET_PATH}/raw")

## Log metadata to run
[Read the docs](https://docs.neptune.ai/you-should-know/logging-metadata)

In [None]:
metadata = {
    "train_size": len(df_train),
    "test_size": len(df_test),
    "valid_size": len(df_valid),
}
metadata

In [None]:
run["data/metadata"] = metadata

## Register a model 
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry)

In [None]:
model = neptune.init_model(
    name="fasttext",
    key="FTXT",
    project=f"{WORKSPACE_NAME}/{project.get_structure()['sys']['name'].fetch()}",
)

### Create a new model version
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

In [None]:
model_version = neptune.init_model_version(
    project=f"{WORKSPACE_NAME}/{project.get_structure()['sys']['name'].fetch()}",
    model=model.get_structure()["sys"]["id"].fetch(),
)

### Associate model version to run and vice-versa

In [None]:
run_dict = {
    "id": run.get_structure()["sys"]["id"].fetch(),
    "name": run.get_structure()["sys"]["name"].fetch(),
    "url": run.get_run_url(),
}
run_dict

In [None]:
model_version["run"] = run_dict

In [None]:
model_version_dict = {
    "id": model_version.get_structure()["sys"]["id"].fetch(),
    "url": model_version.get_url(),
}
model_version_dict

In [None]:
run["model"] = model_version_dict

In [None]:
import fasttext

In [None]:
clf = fasttext.train_supervised(input=f"{DATASET_PATH}/raw/train.txt")

In [None]:
clf.save_model("../../models/fasttext_baseline.bin")

### Upload model binary to model registry
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

In [None]:
model_version["serialized_model"].upload("../../models/fasttext_baseline.bin")

### Log model properties to model_version

In [None]:
properties = {k: v for k, v in vars(clf).items() if k not in ["_words", "f"]}
properties

In [None]:
model_version["properties"] = properties

## Log parameters, metrics and debugging information to run

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, f1_score

In [None]:
_, precision, recall = clf.test(f"{DATASET_PATH}/raw/test.txt")
print(precision, recall)

In [None]:
run["test/metrics/precision"] = precision
run["test/metrics/recall"] = recall

In [None]:
preds = [clf.predict(text)[0][0] for text in X_test.values]
set(preds)

In [None]:
print(classification_report(y_test, preds, zero_division=0))
run["test/metrics/classification_report"] = classification_report(y_test, preds, output_dict=True, zero_division=0)

In [None]:
df_clf_rpt = pd.DataFrame(classification_report(y_test, preds, output_dict=True, zero_division=0)).T
run["test/metrics/classification_report/report"].upload(File.as_html(df_clf_rpt))

In [None]:
# Compute the weighted F1 once, print it (a bare expression that is not the
# last statement of a cell is never displayed) and log the same value.
test_f1 = f1_score(y_test, preds, average="weighted")
print(test_f1)
run["test/metrics/f1_score"] = test_f1

In [None]:
fig = ConfusionMatrixDisplay.from_predictions(y_test, preds, xticks_rotation="vertical", colorbar=False)
run["test/debug/plots/confusion_matrix"].upload(fig.figure_)

In [None]:
df_test["prediction"] = preds
df_test

In [None]:
import plotly.graph_objects as go

In [None]:
# `value_counts()` orders each series by its own frequencies, so the prediction
# counts must be re-aligned to the actual-label order before plotting;
# otherwise the "Prediction" bars are paired with the wrong x labels.
# Labels that were never predicted get a count of 0.
actual_counts = df_test.label.value_counts()
predicted_counts = df_test.prediction.value_counts().reindex(actual_counts.index, fill_value=0)

labels = [s.replace("__label__", "") for s in actual_counts.index]
fig = go.Figure(
    data=[
        go.Bar(name="Actual", x=labels, y=actual_counts),
        go.Bar(name="Prediction", x=labels, y=predicted_counts),
    ]
)
fig.update_layout(title="Actual vs Prediction", barmode="group")
fig.show()

In [None]:
run["test/debug/plots/prediction_distribution"].upload(fig)

In [None]:
df_debug = df_test[df_test.label != df_test.prediction]

csv_buffer = StringIO()

df_debug.to_csv(csv_buffer, index=False)
run["test/debug/misclassifications"].upload(File.from_stream(csv_buffer, extension="csv"))

## Stop current model version, model, run, and project

In [None]:
model_version.stop()
model.stop()
run.stop()
project.stop()

# Initialize a new neptune run for processed data
[Read the docs](https://docs.neptune.ai/you-should-know/core-concepts#run)

In [None]:
run = neptune.init_run(
    project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
    name="text classification using fasttext",
    description="training on processed data",
    tags=["fasttext", "processed"],
)

In [None]:
def clean_text(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Return a copy of `df` whose string column `col` has been normalised.

    The following transformations are applied to every value, in this order:
    * Convert the string to lower-case
    * Replace punctuation (anything that is neither a word character nor
      whitespace) with a space
    * Drop whitespace-separated tokens that contain a digit
    * Remove single-character words
    * Remove NLTK's English stopwords
    * Collapse runs of spaces into a single space

    No stemming and no HTML handling is performed. Leading/trailing spaces left
    behind by removed words are not stripped.

    Note:
        The NLTK `stopwords` corpus must be available locally
        (e.g. via `nltk.download("stopwords")`).

    Args:
        df: Dataframe containing the string column `col` to be cleaned
        col: Name of the string column to be cleaned

    Returns:
        A copy of the dataframe `df` with the column `col` cleaned
    """

    import re

    from nltk.corpus import stopwords
    from tqdm.notebook import tqdm

    # Registers `progress_apply` on pandas objects
    tqdm.pandas()

    # Compile every pattern once instead of on each row
    punctuation = re.compile(r"[^\w\s]")
    single_char_word = re.compile(r"\b\w\b")
    multiple_spaces = re.compile(" +")
    stop = set(stopwords.words("english"))
    # Escape each word so it is always matched literally inside the alternation
    stopword = re.compile(r"\b(?:{})\b".format("|".join(re.escape(word) for word in stop)))

    def _clean(text: str) -> str:
        """Apply the full cleaning pipeline to a single string."""
        text = punctuation.sub(" ", text.lower())
        text = " ".join(token for token in text.split() if not any(c.isdigit() for c in token))
        text = single_char_word.sub("", text)
        text = stopword.sub("", text)
        return multiple_spaces.sub(" ", text)

    _df = df.copy()
    _df[col] = df[col].progress_apply(_clean)

    return _df

In [None]:
df_processed = clean_text(df_fasttext_raw, "case_text")
df_processed

In [None]:
sum(df_processed.case_text.duplicated())

In [None]:
df_processed.drop_duplicates(subset="case_text", inplace=True)
sum(df_processed.case_text.duplicated())

In [None]:
df_processed.to_csv(
    f"{DATASET_PATH}/fasttext/processed.txt",
    sep=" ",
    header=False,
    index=False,
    quoting=csv.QUOTE_NONE,
    quotechar="",
    escapechar=" ",
)

## Track run-specific files
[Read the docs](https://docs.neptune.ai/how-to-guides/data-versioning/compare-datasets#step-2-add-tracking-of-the-dataset-version)

In [None]:
csv_buffer = StringIO()

df_processed.sample(100).to_csv(csv_buffer, index=False)
run["data/sample"].upload(File.from_stream(csv_buffer, extension="csv"))

In [None]:
X = df_processed["case_text"]
y = df_processed["label"]

In [None]:
X_train, y_train, X_test, y_test, X_valid, y_valid = train_test_valid_split(X, y)

In [None]:
# Put labels and texts side by side. A column-wise concat avoids building a
# 2 x N frame (one column per sample) only to transpose it afterwards.
df_train = pd.concat([y_train, X_train], axis=1)
df_train

In [None]:
# Same layout as the training frame: label column first, text column second.
# A column-wise concat avoids the wide intermediate frame and the transpose.
df_test = pd.concat([y_test, X_test], axis=1)
df_valid = pd.concat([y_valid, X_valid], axis=1)

In [None]:
import os

# `to_csv` does not create missing directories, so make sure the target
# directory exists before writing the splits into it.
processed_dir = f"{DATASET_PATH}/processed"
os.makedirs(processed_dir, exist_ok=True)

# All three splits are exported with identical options: space-separated,
# no header, no index, quoting disabled and a space as the escape character.
fasttext_export_options = {
    "sep": " ",
    "header": False,
    "index": False,
    "quoting": csv.QUOTE_NONE,
    "quotechar": "",
    "escapechar": " ",
}
for split_name, split_frame in (("train", df_train), ("test", df_test), ("valid", df_valid)):
    split_frame.to_csv(f"{processed_dir}/{split_name}.txt", **fasttext_export_options)

In [None]:
run["data/files"].track_files(f"{DATASET_PATH}/processed")

## Log metadata to run
[Read the docs](https://docs.neptune.ai/you-should-know/logging-metadata)

In [None]:
metadata = {
    "train_size": len(df_train),
    "test_size": len(df_test),
    "valid_size": len(df_valid),
}
metadata

In [None]:
run["data/metadata"] = metadata

## Create a new model version
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

### Fetch existing models in the project

In [None]:
project = neptune.init_project(name=f"{WORKSPACE_NAME}/{PROJECT_NAME}")

In [None]:
project.fetch_models_table().to_pandas()

In [None]:
# Pick the most recently modified model. After sorting, the row must be taken
# by position: `[...][0]` is a label lookup and would return the row whose
# index label is 0, regardless of where the sort placed it.
models_df = project.fetch_models_table().to_pandas()
model_id = models_df.sort_values("sys/modification_time", ascending=False)["sys/id"].iloc[0]
project.stop()
model_id

In [None]:
# Address the project by its configured name, as the `init_run` calls do.
# The `project` handle was stopped in the previous cell, so it should not be
# queried for its name here.
model_version = neptune.init_model_version(
    model=model_id,
    project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
)

### Associate model version to run and vice-versa

In [None]:
run_dict = {
    "id": run.get_structure()["sys"]["id"].fetch(),
    "name": run.get_structure()["sys"]["name"].fetch(),
    "url": run.get_run_url(),
}
run_dict

In [None]:
model_version["run"] = run_dict

In [None]:
model_version_dict = {
    "id": model_version.get_structure()["sys"]["id"].fetch(),
    "url": model_version.get_url(),
}
model_version_dict

In [None]:
run["model"] = model_version_dict

In [None]:
clf = fasttext.train_supervised(input=f"{DATASET_PATH}/processed/train.txt")

### Upload serialized model to model registry
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

In [None]:
clf.save_model("../../models/fasttext_processed.bin")

In [None]:
model_version["serialized_model"].upload("../../models/fasttext_processed.bin")

### Log model properties to model_version

In [None]:
properties = {k: v for k, v in vars(clf).items() if k not in ["_words", "f"]}
properties

In [None]:
model_version["properties"] = properties

## Log parameters, metrics and debugging information to run

In [None]:
_, precision, recall = clf.test(f"{DATASET_PATH}/processed/test.txt")
print(precision, recall)

In [None]:
run["test/metrics/precision"] = precision
run["test/metrics/recall"] = recall

In [None]:
preds = [clf.predict(text)[0][0] for text in X_test.values]
set(preds)

In [None]:
print(classification_report(y_test, preds, zero_division=0))
run["test/metrics/classification_report"] = classification_report(y_test, preds, output_dict=True, zero_division=0)

In [None]:
df_clf_rpt = pd.DataFrame(classification_report(y_test, preds, output_dict=True, zero_division=0)).T
run["test/metrics/classification_report/report"].upload(File.as_html(df_clf_rpt))

In [None]:
print(f1_score(y_test, preds, average="weighted"))
run["test/metrics/f1_score"] = f1_score(y_test, preds, average="weighted")

In [None]:
fig = ConfusionMatrixDisplay.from_predictions(y_test, preds, xticks_rotation="vertical", colorbar=False)
run["test/debug/plots/confusion_matrix"].upload(fig.figure_)

In [None]:
df_test["prediction"] = preds
df_test

In [None]:
# `value_counts()` orders each series by its own frequencies, so the prediction
# counts must be re-aligned to the actual-label order before plotting;
# otherwise the "Prediction" bars are paired with the wrong x labels.
# Labels that were never predicted get a count of 0.
actual_counts = df_test.label.value_counts()
predicted_counts = df_test.prediction.value_counts().reindex(actual_counts.index, fill_value=0)

labels = [s.replace("__label__", "") for s in actual_counts.index]
fig = go.Figure(
    data=[
        go.Bar(name="Actual", x=labels, y=actual_counts),
        go.Bar(name="Prediction", x=labels, y=predicted_counts),
    ]
)
fig.update_layout(title="Actual vs Prediction", barmode="group")
fig.show()

In [None]:
run["test/debug/plots/prediction_distribution"].upload(fig)

In [None]:
df_debug = df_test[df_test.label != df_test.prediction]

csv_buffer = StringIO()

df_debug.to_csv(csv_buffer, index=False)
run["test/debug/misclassifications"].upload(File.from_stream(csv_buffer, extension="csv"))

## Stop the current model version and run

In [None]:
model_version.stop()
run.stop()

# Initialize a new Neptune run for the autotuned model
[Read the docs](https://docs.neptune.ai/you-should-know/core-concepts#run)

In [None]:
run = neptune.init_run(
    project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
    name="text classification using fasttext",
    description="Autotuned model",
    tags=["fasttext", "processed", "finetuned"],
)

## Track run-specific files
[Read the docs](https://docs.neptune.ai/how-to-guides/data-versioning/compare-datasets#step-2-add-tracking-of-the-dataset-version)

In [None]:
csv_buffer = StringIO()

df_processed.sample(100).to_csv(csv_buffer, index=False)
run["data/sample"].upload(File.from_stream(csv_buffer, extension="csv"))

In [None]:
run["data/files"].track_files(f"{DATASET_PATH}/processed")

## Log metadata to run
[Read the docs](https://docs.neptune.ai/you-should-know/logging-metadata)

In [None]:
metadata = {
    "train_size": len(df_train),
    "test_size": len(df_test),
    "valid_size": len(df_valid),
}
metadata

In [None]:
run["data/metadata"] = metadata

## Create a new model version
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

### Fetch existing models in the project

In [None]:
project = neptune.init_project(name=f"{WORKSPACE_NAME}/{PROJECT_NAME}")

In [None]:
project.fetch_models_table().to_pandas()

In [None]:
# Pick the most recently modified model. After sorting, the row must be taken
# by position: `[...][0]` is a label lookup and would return the row whose
# index label is 0, regardless of where the sort placed it.
models_df = project.fetch_models_table().to_pandas()
model_id = models_df.sort_values("sys/modification_time", ascending=False)["sys/id"].iloc[0]
project.stop()
model_id

In [None]:
# Address the project by its configured name, as the `init_run` calls do.
# The `project` handle was stopped in the previous cell, so it should not be
# queried for its name here.
model_version = neptune.init_model_version(
    model=model_id,
    project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
)

### Associate model version to run and vice-versa

In [None]:
run_dict = {
    "id": run.get_structure()["sys"]["id"].fetch(),
    "name": run.get_structure()["sys"]["name"].fetch(),
    "url": run.get_run_url(),
}
run_dict

In [None]:
model_version["run"] = run_dict

In [None]:
model_version_dict = {
    "id": model_version.get_structure()["sys"]["id"].fetch(),
    "url": model_version.get_url(),
}
model_version_dict

In [None]:
run["model"] = model_version_dict

In [None]:
import os

In [None]:
# Leave two cores free for the rest of the system, but never request fewer
# than one thread: `os.cpu_count()` may return None, and on machines with two
# cores or fewer the plain subtraction would yield 0 or a negative number.
n_threads = max(1, (os.cpu_count() or 1) - 2)

clf = fasttext.train_supervised(
    input=f"{DATASET_PATH}/processed/train.txt",
    autotuneValidationFile=f"{DATASET_PATH}/processed/valid.txt",
    verbose=3,
    thread=n_threads,
    autotuneDuration=120,
)

### Upload serialized model to model registry
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

In [None]:
clf.save_model("../../models/fasttext_finetuned.bin")

In [None]:
model_version["serialized_model"].upload("../../models/fasttext_finetuned.bin")

### Log model properties to model_version

In [None]:
properties = {k: v for k, v in vars(clf).items() if k not in ["_words", "f"]}
properties

In [None]:
model_version["properties"] = properties

## Log parameters, metrics and debugging information to run

In [None]:
_, precision, recall = clf.test(f"{DATASET_PATH}/processed/test.txt")
print(precision, recall)

In [None]:
run["test/metrics/precision"] = precision
run["test/metrics/recall"] = recall

In [None]:
preds = [clf.predict(text)[0][0] for text in X_test.values]
set(preds)

In [None]:
print(classification_report(y_test, preds, zero_division=0))
run["test/metrics/classification_report"] = classification_report(y_test, preds, output_dict=True, zero_division=0)

In [None]:
df_clf_rpt = pd.DataFrame(classification_report(y_test, preds, output_dict=True, zero_division=0)).T
run["test/metrics/classification_report/report"].upload(File.as_html(df_clf_rpt))

In [None]:
print(f1_score(y_test, preds, average="weighted"))
run["test/metrics/f1_score"] = f1_score(y_test, preds, average="weighted")

In [None]:
fig = ConfusionMatrixDisplay.from_predictions(y_test, preds, xticks_rotation="vertical", colorbar=False)
run["test/debug/plots/confusion_matrix"].upload(fig.figure_)

In [None]:
df_test["prediction"] = preds
df_test

In [None]:
# `value_counts()` orders each series by its own frequencies, so the prediction
# counts must be re-aligned to the actual-label order before plotting;
# otherwise the "Prediction" bars are paired with the wrong x labels.
# Labels that were never predicted get a count of 0.
actual_counts = df_test.label.value_counts()
predicted_counts = df_test.prediction.value_counts().reindex(actual_counts.index, fill_value=0)

labels = [s.replace("__label__", "") for s in actual_counts.index]
fig = go.Figure(
    data=[
        go.Bar(name="Actual", x=labels, y=actual_counts),
        go.Bar(name="Prediction", x=labels, y=predicted_counts),
    ]
)
fig.update_layout(title="Actual vs Prediction", barmode="group")
fig.show()

In [None]:
run["test/debug/plots/prediction_distribution"].upload(fig)

In [None]:
df_debug = df_test[df_test.label != df_test.prediction]

csv_buffer = StringIO()

df_debug.to_csv(csv_buffer, index=False)
run["test/debug/misclassifications"].upload(File.from_stream(csv_buffer, extension="csv"))

## Stop the current model version and run

In [None]:
model_version.stop()
run.stop()

# Initialize study-level run for optuna finetuned model

In [None]:
import uuid

# Keep the sweep id as a plain string: it is later assigned to Neptune fields,
# and a string is a safer field value than a `uuid.UUID` object.
sweep_id = str(uuid.uuid1())
print("sweep-id: ", sweep_id)

In [None]:
run = neptune.init_run(
    project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
    name="text classification using fasttext",
    description="tuning fasttext classification model using Optuna",
    tags=["fasttext", "processed", "finetuned", "optuna", "study-level"],
)

In [None]:
run["data/files"].track_files(f"{DATASET_PATH}/processed")

## Log metadata to run
[Read the docs](https://docs.neptune.ai/you-should-know/logging-metadata)

In [None]:
metadata = {
    "train_size": len(df_train),
    "test_size": len(df_test),
    "valid_size": len(df_valid),
}
metadata

In [None]:
run["data/metadata"] = metadata

## Log sweep and trial parameters
[Read the docs](https://docs.neptune.ai/integrations-and-supported-tools/hyperparameter-optimization/optuna)

In [None]:
import optuna

In [None]:
def objective_with_logging(trial):
    """Optuna objective: train one fastText model and score it on the validation split.

    Every call samples a hyperparameter set from `trial`, creates its own
    trial-level Neptune run, trains a classifier on the processed training
    file, and logs the sweep id, the sampled parameters, the model properties
    and the validation metrics to that run. The run is stopped even when
    training or evaluation raises.

    Relies on the module-level names `WORKSPACE_NAME`, `PROJECT_NAME`,
    `DATASET_PATH`, `sweep_id`, `X_valid` and `y_valid`.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Weighted F1 score of the trained model on `X_valid` / `y_valid`.
    """

    params = {
        "lr": trial.suggest_float("lr", 0.1, 1, step=0.1),
        "dim": trial.suggest_int("dim", 10, 1000, log=True),
        "ws": trial.suggest_int("ws", 1, 10),
        "epoch": trial.suggest_int("epoch", 1, 100),
        "minCount": trial.suggest_int("minCount", 1, 10),
        "wordNgrams": trial.suggest_int("wordNgrams", 1, 3),
        "loss": trial.suggest_categorical("loss", ["hs", "softmax", "ova"]),
        "bucket": trial.suggest_int("bucket", 1000000, 6000000, log=True),
        "lrUpdateRate": trial.suggest_int("lrUpdateRate", 1, 100, log=True),
        "t": trial.suggest_float("t", 0.00001, 0.1, log=True),
    }

    # create a trial-level Run
    run_trial_level = neptune.init_run(
        project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
        name="text classification using fasttext",
        description="Tuning fasttext classification model using Optuna",
        tags=["fasttext", "processed", "trial-level"],
    )

    # try/finally guarantees the trial-level Run is stopped even if training
    # or evaluation fails, so a failed trial does not leave a run open
    try:
        # log sweep id (as a string) to trial-level Run
        run_trial_level["sweep_id"] = str(sweep_id)

        # log the sampled parameters of this trial
        run_trial_level["parameters"] = params

        # train a model with this parameter configuration
        clf = fasttext.train_supervised(
            input=f"{DATASET_PATH}/processed/train.txt",
            verbose=0,
            **params,
        )

        properties = {k: v for k, v in vars(clf).items() if k not in ["_words", "f"]}
        run_trial_level["model/properties"] = properties

        # evaluate on the validation split and log the metrics
        _, precision, recall = clf.test(f"{DATASET_PATH}/processed/valid.txt")
        run_trial_level["validation/metrics/precision"] = precision
        run_trial_level["validation/metrics/recall"] = recall

        preds = [clf.predict(text)[0][0] for text in X_valid.values]

        run_trial_level["validation/metrics/classification_report"] = classification_report(
            y_valid, preds, output_dict=True, zero_division=0
        )

        score = f1_score(y_valid, preds, average="weighted")
        run_trial_level["validation/metrics/f1_score"] = score
    finally:
        # stop trial-level Run
        run_trial_level.stop()

    return score

In [None]:
import neptune.new.integrations.optuna as optuna_utils

neptune_callback = optuna_utils.NeptuneCallback(run)

In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(
    objective_with_logging,
    n_trials=10,
    callbacks=[neptune_callback],
)

In [None]:
# Log the sweep id as a string so the field value does not depend on how
# `sweep_id` was created (`str()` is a no-op if it already is a string).
run["study/sweep_id"] = str(sweep_id)

## Create a new model version
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

### Fetch existing models in the project

In [None]:
project = neptune.init_project(name=f"{WORKSPACE_NAME}/{PROJECT_NAME}")

In [None]:
project.fetch_models_table().to_pandas()

In [None]:
# Pick the most recently modified model. After sorting, the row must be taken
# by position: `[...][0]` is a label lookup and would return the row whose
# index label is 0, regardless of where the sort placed it.
models_df = project.fetch_models_table().to_pandas()
model_id = models_df.sort_values("sys/modification_time", ascending=False)["sys/id"].iloc[0]
project.stop()
model_id

In [None]:
# Address the project by its configured name, as the `init_run` calls do.
# The `project` handle was stopped in the previous cell, so it should not be
# queried for its name here.
model_version = neptune.init_model_version(
    model=model_id,
    project=f"{WORKSPACE_NAME}/{PROJECT_NAME}",
)

### Associate model version to run and vice-versa

In [None]:
run_dict = {
    "id": run.get_structure()["sys"]["id"].fetch(),
    "name": run.get_structure()["sys"]["name"].fetch(),
    "url": run.get_run_url(),
}
run_dict

In [None]:
model_version["run"] = run_dict

In [None]:
model_version_dict = {
    "id": model_version.get_structure()["sys"]["id"].fetch(),
    "url": model_version.get_url(),
}
model_version_dict

In [None]:
run["model"] = model_version_dict

In [None]:
clf = fasttext.train_supervised(
    input=f"{DATASET_PATH}/processed/train.txt",
    verbose=5,
    **study.best_params,
)

### Upload serialized model to model registry
[Read the docs](https://docs.neptune.ai/how-to-guides/model-registry/creating-model-versions)

In [None]:
clf.save_model("../../models/fasttext_optimized.bin")

In [None]:
model_version["serialized_model"].upload("../../models/fasttext_optimized.bin")

### Log model properties to model_version

In [None]:
properties = {k: v for k, v in vars(clf).items() if k not in ["_words", "f"]}
properties

In [None]:
model_version["properties"] = properties

## Log parameters, metrics and debugging information to run

In [None]:
_, precision, recall = clf.test(f"{DATASET_PATH}/processed/test.txt")
print(precision, recall)

In [None]:
run["test/metrics/precision"] = precision
run["test/metrics/recall"] = recall

In [None]:
preds = [clf.predict(text)[0][0] for text in X_test.values]
set(preds)

In [None]:
print(classification_report(y_test, preds, zero_division=0))
run["test/metrics/classification_report"] = classification_report(y_test, preds, output_dict=True, zero_division=0)

In [None]:
df_clf_rpt = pd.DataFrame(classification_report(y_test, preds, output_dict=True, zero_division=0)).T
run["test/metrics/classification_report/report"].upload(File.as_html(df_clf_rpt))

In [None]:
# Compute the weighted F1 once, print it (a bare expression that is not the
# last statement of a cell is never displayed) and log the same value.
test_f1 = f1_score(y_test, preds, average="weighted")
print(test_f1)
run["test/metrics/f1_score"] = test_f1

In [None]:
fig = ConfusionMatrixDisplay.from_predictions(y_test, preds, xticks_rotation="vertical", colorbar=False)
run["test/debug/plots/confusion_matrix"].upload(fig.figure_)

In [None]:
df_test["prediction"] = preds
df_test

In [None]:
# `value_counts()` orders each series by its own frequencies, so the prediction
# counts must be re-aligned to the actual-label order before plotting;
# otherwise the "Prediction" bars are paired with the wrong x labels.
# Labels that were never predicted get a count of 0.
actual_counts = df_test.label.value_counts()
predicted_counts = df_test.prediction.value_counts().reindex(actual_counts.index, fill_value=0)

labels = [s.replace("__label__", "") for s in actual_counts.index]
fig = go.Figure(
    data=[
        go.Bar(name="Actual", x=labels, y=actual_counts),
        go.Bar(name="Prediction", x=labels, y=predicted_counts),
    ]
)
fig.update_layout(title="Actual vs Prediction", barmode="group")
fig.show()

In [None]:
run["test/debug/plots/prediction_distribution"].upload(fig)

In [None]:
df_debug = df_test[df_test.label != df_test.prediction]

csv_buffer = StringIO()

df_debug.to_csv(csv_buffer, index=False)
run["test/debug/misclassifications"].upload(File.from_stream(csv_buffer, extension="csv"))

## Stop the current model version and run

In [None]:
model_version.stop()
run.stop()

# Explore the [project](https://app.neptune.ai/showcase/project-text-classification) in the Neptune app
