In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
# Pipeline settings, grouped by the stage each one belongs to.
config = dict(
    # Preprocessing
    min_combined_length=10,
    to_lower=True,
    to_upper=False,
    remove_punctuation=True,
    remove_chinese=True,
    stem=False,
    only_4digit=True,
    only_exist=True,
    # Embeddings
    embeddings_engine="fasttext",
    # Final Model
    model="knn",  # knn, gradient_boosting
)


In [None]:
from workshop.data import load_and_preprocess_data 

# Build the preprocessed dataset with the project helper, driven by `config`.
df_prepped = load_and_preprocess_data(config=config)
# Show the result for inspection.
df_prepped

# Embeddings (subsetting for quicker development is not currently applied)

In [None]:
from workshop.embedding import get_embeddings

# Compute embeddings from the preprocessed data.
# NOTE(review): `config` (including "embeddings_engine") is not passed here —
# confirm how get_embeddings chooses its engine.
df_embeddings = get_embeddings(df_prepped)

In [None]:
# Show the embeddings for inspection.
df_embeddings

# Modelling

## Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

target_colname = "isco88"

# Features are the embeddings; work on a copy so df_embeddings itself is left untouched.
features = df_embeddings.copy()
# Labels are the target column only. Select the column first and copy just that,
# rather than duplicating the whole frame to keep a single column.
labels = df_prepped[target_colname].copy()

# Split features, labels and the raw text in one call so all three receive the
# same row partition.
# NOTE(review): this relies on df_embeddings and df_prepped having their rows in
# the same order — confirm against get_embeddings.
(
    X_train, X_test,
    y_train, y_true,  # y_true holds the held-out labels used for evaluation
    text_train, text_test,
) = train_test_split(
    features,
    labels,
    df_prepped["combined_text"],
    test_size=0.2,
    random_state=0,  # fixed seed so the split is reproducible
)

## Model Fitting

In [None]:
from workshop.modelling import train_model, predict

# Fit the model on the training split.
# NOTE(review): config["model"] is not passed here — confirm how train_model
# selects the model type.
model = train_model(X_train, y_train)


In [None]:
# Predict labels for the held-out test features with the fitted model.
y_pred = predict(model, X_test)

## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# y_pred was already computed in the preceding cell from the same model and
# X_test, so it is reused here instead of running prediction a second time.
metrics = {
    "accuracy": accuracy_score(y_true, y_pred),
    # "weighted": per-class F1 averaged with each class weighted by its support.
    "f1": f1_score(y_true, y_pred, average="weighted"),
}
metrics

In [None]:
import pandas as pd
from workshop.modelling import correct_at_digit

# One row per test example (text, true code, predicted code), followed by
# correctness flags: an exact-match comparison, then correct_at_digit at 3, 2 and 1.
df_eval = pd.DataFrame({
    "combined_text": text_test,
    "isco88_true": y_true,
    "isco88_pred": y_pred,
}).assign(
    correct_4=lambda d: d["isco88_true"] == d["isco88_pred"],
    correct_3=lambda d: correct_at_digit(d["isco88_pred"], d["isco88_true"], 3),
    correct_2=lambda d: correct_at_digit(d["isco88_pred"], d["isco88_true"], 2),
    correct_1=lambda d: correct_at_digit(d["isco88_pred"], d["isco88_true"], 1),
)
df_eval

In [None]:
# Mean of each correct_* column in df_eval, keyed by digit level (4 down to 1).
correct_at_digits = {
    f"{level}_digits": df_eval[f"correct_{level}"].mean()
    for level in (4, 3, 2, 1)
}
correct_at_digits


In [None]:
# Keep only the rows whose prediction is not an exact match (correct_4 is False).
miscodings = df_eval.loc[~df_eval["correct_4"]]
miscodings