In [None]:
import neptune.new as neptune
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.metrics import precision_score, accuracy_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
def log_raw_data(
    run: neptune.Run,
    base_namespace: str,
    df: pd.DataFrame,
    data_path: str = "../data/covid_and_healthy_spectra.csv",
):
    """Log metadata about the raw dataset under ``base_namespace``.

    Logged fields: the tracked data file (``version``), row and column counts,
    the number of "Healthy" and "SARS-CoV-2" rows in ``df.diagnostic``, an HTML
    histogram of that column, and a CSV sample of the first 30 rows.

    Args:
        run: Neptune run that receives the metadata.
        base_namespace: Namespace prefix for every logged field.
        df: Raw data; must have a ``diagnostic`` column.
        data_path: File to register with ``track_files``. Defaults to the path
            that was previously hard-coded here, so existing calls are
            unaffected.
    """
    run[f"{base_namespace}/version"].track_files(data_path)

    run[f"{base_namespace}/n_rows"] = df.shape[0]
    run[f"{base_namespace}/n_cols"] = df.shape[1]

    # Count once; a class that is absent from the data is reported as 0 instead
    # of raising KeyError. int() turns the numpy count into a built-in int.
    class_counts = df.diagnostic.value_counts()
    for label in ("Healthy", "SARS-CoV-2"):
        run[f"{base_namespace}/target/n_{label}"] = int(class_counts.get(label, 0))

    run[f"{base_namespace}/target/class_balance"] = neptune.types.File.as_html(
        px.histogram(df.diagnostic)
    )

    # The sample is written to the current working directory and then uploaded.
    df.head(n=30).to_csv("data_sample.csv")
    run[f"{base_namespace}/sample"].upload("data_sample.csv")


def log_dataset(
    run: neptune.Run,
    base_namespace: str,
    data: pd.DataFrame,
    target: pd.Series,
):
    """Log the dimensions and class balance of one data split.

    Args:
        run: Neptune run that receives the metadata.
        base_namespace: Namespace prefix for every logged field.
        data: Feature matrix of the split; only ``data.shape`` is read.
        target: Labels of the split. Label 0 is counted as "Healthy" and
            label 1 as "SARS-CoV-2".
    """
    run[f"{base_namespace}/n_rows"] = data.shape[0]
    run[f"{base_namespace}/n_cols"] = data.shape[1]

    # Count once; a split that happens to contain a single class reports 0 for
    # the missing label instead of raising KeyError. int() turns the numpy
    # count into a built-in int.
    class_counts = target.value_counts()
    run[f"{base_namespace}/target/n_Healthy"] = int(class_counts.get(0, 0))
    run[f"{base_namespace}/target/n_SARS-CoV-2"] = int(class_counts.get(1, 0))

    run[f"{base_namespace}/target/class_balance"] = neptune.types.File.as_html(
        px.histogram(target, text_auto=True)
    )


def log_training_report(run: neptune.Run, base_namespace: str, y_data: zip):
    """Log precision and accuracy for the train, valid and test splits.

    Args:
        run: Neptune run that receives the metrics.
        base_namespace: Namespace prefix; metrics are stored under
            ``{base_namespace}/{split}/``.
        y_data: Iterable of pairs in train, valid, test order. Element 0 of
            each pair is passed to the scorers as the first argument and
            element 1 as the second.
    """
    split_names = ["train", "valid", "test"]
    for split, pair in zip(split_names, y_data):
        first, second = pair[0], pair[1]
        split_namespace = f"{base_namespace}/{split}"
        run[f"{split_namespace}/precision"] = precision_score(first, second)
        run[f"{split_namespace}/accuracy"] = accuracy_score(first, second)
        # Recall logging is left disabled, as before:
        # run[f"{split_namespace}/recall"] = recall_score(first, second)


def log_pca(run: neptune.Run, base_namespace: str, pca: PCA):
    """Log a fitted PCA's variance ratios, singular values and a variance chart.

    Args:
        run: Neptune run that receives the metadata.
        base_namespace: Namespace prefix for every logged field.
        pca: PCA object exposing ``explained_variance_ratio_`` and
            ``singular_values_``.
    """
    variance_ratio = pca.explained_variance_ratio_
    run[f"{base_namespace}/explained_variance_ratio"].log(list(variance_ratio))
    run[f"{base_namespace}/singular_values"].log(list(pca.singular_values_))

    # Running total of explained variance, plotted against the component count.
    cumulative_variance = np.cumsum(variance_ratio)
    component_counts = range(1, cumulative_variance.shape[0] + 1)
    variance_chart = px.area(
        x=component_counts,
        y=cumulative_variance,
        labels={"x": "# Components", "y": "Explained Variance"},
    )

    chart_field = f"{base_namespace}/explained_variance_chart"
    run[chart_field] = neptune.types.File.as_html(variance_chart)

In [None]:
import os
from getpass import getpass

# Use the NEPTUNE_API_TOKEN environment variable when it is set so the notebook
# can run unattended (Restart & Run All, nbconvert); prompt only as a fallback.
# getpass does not echo the input, so the token stays out of the cell output.
api_token = os.environ.get("NEPTUNE_API_TOKEN") or getpass("Enter Neptune API token: ")

In [None]:
import os

# Use the NEPTUNE_PROJECT environment variable when it is set so the notebook
# can run unattended; prompt only as a fallback. The prompt uses getpass, so
# the typed project name is not echoed into the cell output.
project = os.environ.get("NEPTUNE_PROJECT") or getpass("Enter project name: ")

In [None]:
# Relative path to the spectra CSV that this notebook loads with pd.read_csv.
DATA_PATH = "../data/covid_and_healthy_spectra.csv"

#### Create Neptune run

In [None]:
# (neptune) start a run in the chosen project, tag it, and attach the
# environment file as a source file
run = neptune.init_run(
    project=project,
    api_token=api_token,
    tags=["svm", "notebook-run"],
    source_files=["../environment.yml"],
)

#### Log configuration

In [None]:
# Experiment settings. The cells visible in this notebook read test_size,
# val_size, seed, scaler, pca, n_components and log_model; column_select and
# nth_column are logged but not referenced by any cell shown here.
config = dict(
    test_size=0.30,  # fraction of all rows held out from training
    val_size=0.50,  # fraction of the held-out rows used for validation
    scaler=True,
    pca=True,
    n_components=5,
    seed=2022,
    column_select=False,
    nth_column=10,
    log_model=True,
)

# (neptune) log config
run["config"] = config

#### Log data management info

In [None]:
df = pd.read_csv(DATA_PATH)

# (neptune) log data version, data dimensions, target occurrences
log_raw_data(run=run, base_namespace="data/raw", df=df)

# Binary-encode the target: 1 for "SARS-CoV-2", 0 for every other value.
df.diagnostic = df.diagnostic.map(lambda label: 1 if label == "SARS-CoV-2" else 0)

# Separate the target from the features.
y = df.diagnostic
df = df.drop(columns="diagnostic")

# NOTE(review): only the first 10 feature columns are kept, regardless of
# config["column_select"] / config["nth_column"] — confirm this is intended.
X = df.iloc[:, :10]

# First split: training rows vs. a held-out set.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=config["test_size"], random_state=config["seed"]
)
# Second split: divide the held-out set into test and validation rows.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=config["val_size"], random_state=config["seed"]
)

#### Modeling pipeline, feature management, metrics reporting

In [None]:
# Optional min-max scaling: fitted on the training rows only, then applied to
# every split.
if config["scaler"]:
    scaler = MinMaxScaler()
    scaler.fit(X_train)
    X_train, X_test, X_val = (
        scaler.transform(split) for split in (X_train, X_test, X_val)
    )

# Optional PCA: fitted on the training rows only, then applied to every split.
if config["pca"]:
    pca = PCA(n_components=config["n_components"])
    pca.fit(X_train)
    X_train, X_test, X_val = (
        pca.transform(split) for split in (X_train, X_test, X_val)
    )

    # (neptune) log PCA results
    log_pca(run=run, base_namespace="data/pca", pca=pca)

# (neptune) log metadata for train, valid, test
for split_namespace, split_features, split_target in (
    ("data/train", X_train, y_train),
    ("data/valid", X_val, y_val),
    ("data/test", X_test, y_test),
):
    log_dataset(
        run=run,
        base_namespace=split_namespace,
        data=split_features,
        target=split_target,
    )


In [None]:
# Fit a support-vector classifier with its default hyperparameters.
clf = svm.SVC()
clf.fit(X_train, y_train)

y_train_pred = clf.predict(X_train)
y_val_pred = clf.predict(X_val)
y_test_pred = clf.predict(X_test)

# Pair true and predicted labels per split, in the train/valid/test order that
# log_training_report expects. Stored as a list rather than a bare zip: a zip
# iterator is exhausted after one pass, so reusing y_data in a later cell
# would silently yield nothing.
y_data = list(
    zip((y_train, y_val, y_test), (y_train_pred, y_val_pred, y_test_pred))
)

# (neptune) log metrics
log_training_report(run=run, base_namespace="modeling", y_data=y_data)

# (neptune) upload the fitted classifier as a pickle when enabled in config
if config["log_model"]:
    run["modeling/pickled_model"] = neptune.types.File.as_pickle(clf)

In [None]:
# (neptune) stop the run created earlier in this notebook
run.stop()