In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules
# (e.g. the local `workshop` package used below) are reloaded before each cell
# executes; edits to them take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
# Pipeline options for this notebook. The dict is handed to the workshop
# helpers through their `config=` keyword argument.
config = dict(
    # --- Preprocessing ---
    min_combined_length=0,
    to_lower=True,
    to_upper=False,
    remove_punctuation=True,
    remove_chinese=True,
    stem=False,
    only_4digit=True,
    only_exist=True,
    # --- Data ---
    add_isco88=True,  # gates the ISCO88 augmentation of the training data
    # --- Embeddings ---
    embeddings_engine="fasttext",  # fasttext, bag_of_words
    # --- Final Model ---
    model="knn",  # knn, gradient_boosting, xgboost
)


In [None]:
from workshop.data import load_and_preprocess_data 

# Build the preprocessed data that the rest of the notebook works from; the
# options defined in `config` are handed to the loader.
df_prepped = load_and_preprocess_data(config=config)
# Bare last expression: rendered as the cell output.
df_prepped

# Subsetting (for quicker development)

In [None]:
from workshop.embedding import get_embeddings

# Embed the "combined_text" column of the preprocessed data.
# NOTE(review): the engine is presumably selected by
# config["embeddings_engine"] — confirm in workshop.embedding.
df_embeddings = get_embeddings(df_prepped, "combined_text", config=config)

In [None]:
# Inspect the result of the embedding step (rendered as the cell output).
df_embeddings

# Modelling

## Train-Test Split

In [None]:
from workshop.preprocessing_Asialymph import train_split

target_colname = "isco88"
text_colname = "combined_text"

# Feature matrix: an independent copy of the embeddings (no target column).
features = df_embeddings.copy()
# Labels: only the target column of the preprocessed data.
labels = df_prepped[target_colname].copy()

# Attach the target and the raw text to the embeddings so that each row keeps
# its label and text through the split.
# `assign` returns a NEW frame, so `df_embeddings` is left untouched. Binding
# `combined` directly to `df_embeddings` and writing columns into it would
# also add the target/text columns to `df_embeddings` itself (and, on a
# re-run of this cell, to `features`). Values are aligned on the index,
# exactly as with plain column assignment.
combined = df_embeddings.assign(
    **{
        target_colname: df_prepped[target_colname],
        text_colname: df_prepped[text_colname],
    }
)

combined_train, combined_test = train_split(combined)

# Separate each split into model inputs, labels, and the text that is kept
# for inspecting predictions later on.
X_train = combined_train.drop(columns=[target_colname, text_colname])
X_test = combined_test.drop(columns=[target_colname, text_colname])
y_train = combined_train[target_colname]
y_true = combined_test[target_colname]
text_train = combined_train[text_colname]
text_test = combined_test[text_colname]


## Add Additional Data

In [None]:
# pandas is imported here because this is the first cell that calls `pd`;
# without it a fresh-kernel "Run All" fails with NameError at `pd.concat`.
import pandas as pd

from workshop.data import load_isco88_structure
from workshop.embedding import get_embeddings

# Augment training data with ISCO88 structure
if config["add_isco88"]:
    # Load the ISCO88 index and embed its "occupations" text with the same
    # embedding configuration as the main data.
    isco_index = load_isco88_structure(config=config)
    isco_index_embeddings = get_embeddings(isco_index, "occupations", config=config)

    # Append the ISCO88 rows to the training features and labels only; the
    # test split is not touched.
    # NOTE: this rebinds X_train / y_train, so running the cell a second time
    # without re-running the split appends the ISCO88 rows again.
    X_train = pd.concat([X_train, isco_index_embeddings])
    y_train = pd.concat([y_train, isco_index["isco88"]])

In [None]:
# Sanity check of the feature-matrix shapes going into the model. The train
# shape includes any rows appended by the ISCO88 augmentation cell.
print("Train", X_train.shape)
print("Test", X_test.shape)

## Model Fitting

In [None]:
from workshop.modelling import train_model, predict

# Fit on the training features/labels with the options from `config`.
# `train_output` is indexed with "model" below and is later passed as a whole
# to `predict`.
train_output = train_model(X_train, y_train, config=config)
# Show the entry stored under "model" as the cell output.
train_output["model"]


## Evaluation

In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predictions for the held-out test features.
y_pred = predict(train_output, X_test, config=config)

# Headline scores on the test split; F1 is computed with average="weighted".
metrics = dict(
    accuracy=accuracy_score(y_true, y_pred),
    f1=f1_score(y_true, y_pred, average="weighted"),
)
metrics

In [None]:
import pandas as pd
from workshop.modelling import correct_at_digit

# Per-row evaluation table for the test split: the text, the true and
# predicted codes, and one correct_<n> column per digit level. correct_4 is
# exact equality of the two codes; correct_3/2/1 come from `correct_at_digit`
# called with the predicted codes, the true codes and the digit count.
df_eval = pd.DataFrame(
    {
        "combined_text": text_test,
        "isco88_true": y_true,
        "isco88_pred": y_pred,
    }
).assign(
    correct_4=lambda frame: frame["isco88_true"] == frame["isco88_pred"],
    correct_3=lambda frame: correct_at_digit(frame["isco88_pred"], frame["isco88_true"], 3),
    correct_2=lambda frame: correct_at_digit(frame["isco88_pred"], frame["isco88_true"], 2),
    correct_1=lambda frame: correct_at_digit(frame["isco88_pred"], frame["isco88_true"], 1),
)
df_eval

In [None]:
# Mean of each correct_<n> column of df_eval, keyed "<n>_digits" and ordered
# from 4 digits down to 1 (for a boolean column the mean is the share of
# rows that are True).
correct_at_digits = {
    f"{n_digits}_digits": df_eval[f"correct_{n_digits}"].mean()
    for n_digits in (4, 3, 2, 1)
}
correct_at_digits


In [None]:
# Keep only the rows where correct_4 is False, i.e. where the predicted code
# differs from the true code, for manual inspection.
miscodings = df_eval[~df_eval["correct_4"]]
miscodings

In [None]:
from workshop.data import load_and_preprocess_validation_data

# Score the already-fitted model on the separate validation data.
# NOTE(review): the validation loader is called without `config`, unlike
# load_and_preprocess_data(config=config) — confirm that both data sets get
# the same preprocessing.
validation_data = load_and_preprocess_validation_data()
validation_embeddings = get_embeddings(validation_data, "combined_text", config=config)

y_pred_val = predict(train_output, validation_embeddings, config=config)

# Same two scores as for the test split. This rebinds `metrics`, so the
# test-split values computed earlier are replaced.
metrics = dict(
    accuracy=accuracy_score(validation_data["isco88"], y_pred_val),
    f1=f1_score(validation_data["isco88"], y_pred_val, average="weighted"),
)
metrics