# Imports/setting up directories
Note: I'm using the dark grid style for seaborn (my preferred plotting library)

In [1]:
%load_ext jupyter_black
# Utilities/Misc
import os
from pathlib import Path
import pickle as pkl
import warnings

# Data handling
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Dimension reduction
from sklearn.decomposition import PCA

# ML Modeling/Optimization
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from skopt import BayesSearchCV

# Scoring
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    balanced_accuracy_score,
    roc_auc_score,
)

# NLP-specific tools
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Custom modules
from src.preprocessing import EmbeddingAwareTokenizer, do_nothing

In [2]:
# Project directory layout, rooted at the notebook's working directory
CWD = Path.cwd()
DATA_DIR = CWD.joinpath("data")
CORPUS_DIR = DATA_DIR.joinpath("corpus_files")
OBJ_DIR = DATA_DIR.joinpath("objects")
MODEL_SEARCH = DATA_DIR.joinpath("model_search_results")

# Search results are written here later on, so make sure the folder exists
MODEL_SEARCH.mkdir(exist_ok=True, parents=True)

# Load/Prepare Data

In [3]:
# Read the pickled document corpus and summarise its columns
doc_corpus_path = CORPUS_DIR.joinpath("document_corpus.pkl")
doc_df: pd.DataFrame = pd.read_pickle(doc_corpus_path)
doc_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3201 entries, 0 to 3200
Data columns (total 56 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   UID                        3201 non-null   int16         
 1   clean_text                 3201 non-null   object        
 2   num_sections               3201 non-null   int64         
 3   credit_text                3201 non-null   object        
 4   num_credit_sections        3201 non-null   int64         
 5   clean_word_tokens          3201 non-null   object        
 6   credit_word_tokens         3201 non-null   object        
 7   link                       3201 non-null   object        
 8   submission_author          3189 non-null   object        
 9   submission_id              3201 non-null   object        
 10  submission_title           3201 non-null   object        
 11  subreddit                  3201 non-null   category      
 12  submis

In [4]:
# Number of documents per flair (the label the models below are trained on)
flair_counts = doc_df["submission_flair"].value_counts()
flair_counts

Subclass      1187
Monster        562
Class          478
Race           229
Spell          194
Item           166
Compendium     152
Feat           122
Mechanic       104
Background       7
Name: submission_flair, dtype: int64

In [6]:
if "split" in doc_df.columns:
    # A split was already stored with the corpus: reuse it as-is
    in_train = doc_df["split"].isin(["train", "validation"])
    in_test = doc_df["split"] == "test"
    train_df = doc_df[in_train].copy()
    test_df = doc_df[in_test].copy()
else:
    # First run: leave out the two flairs that are not modelled, then split
    is_modelled = ~doc_df["submission_flair"].isin(["Background", "Compendium"])
    model_data = doc_df[is_modelled]
    train_df, test_df = train_test_split(
        model_data, train_size=0.75, random_state=29359
    )

    # Record the assignment on the corpus (matched by UID) and persist it so
    # later runs take the branch above
    doc_df["split"] = None
    for split_name, split_df in (("train", train_df), ("test", test_df)):
        doc_df.loc[doc_df["UID"].isin(split_df["UID"]), "split"] = split_name
    doc_df.to_pickle(doc_corpus_path)

# Classification targets
train_y = train_df["submission_flair"].values
test_y = test_df["submission_flair"].values

# Full Text

I'd like to compare the model trained on my credit/introduction-cleaned text to one trained on the raw text, to see how much the cleaning helped (or hindered) classification overall.

I already know *a priori* that the raw text has many features unique to the GMBinder/Homebrewery format that are not found in "real" texts (i.e. professionally published texts, where credits and indexes only appear at the start of an overall book, not next to specific elements).

The cleaning I performed is a good thing for that reason alone, but I'm curious if it will help generalizability even within my scraped dataset.

### Vectorize Raw Texts

In [7]:
# Get stop words, tokenized the same way the documents will be
stop_words = stopwords.words("english")
stop_words = [token for sw in stop_words for token in word_tokenize(sw)]

# Count terms in the training texts so rare words can be identified
full_count_vectorizer = CountVectorizer(
    tokenizer=word_tokenize,
    lowercase=True,
    token_pattern=None,
    stop_words=stop_words,
)
full_counts = full_count_vectorizer.fit_transform(train_df["full_text"]).toarray()
full_vocabulary = full_count_vectorizer.vocabulary_

# Total every vocabulary column in one vectorized pass instead of slicing and
# summing one column per word
full_term_totals = full_counts.sum(axis=0)

# Words seen 5 times or fewer across the training texts count as rare
full_infrequent_words = [
    word for word, index in full_vocabulary.items() if full_term_totals[index] <= 5
]

# Expand stop words to include rare terms (re-tokenized like the originals)
stop_words.extend(full_infrequent_words)
stop_words = [token for sw in stop_words for token in word_tokenize(sw)]

# Get vectors: fit TF-IDF on the training texts only, then apply it to the test texts
full_tfidf_vectorizer = TfidfVectorizer(
    tokenizer=word_tokenize,
    lowercase=True,
    token_pattern=None,
    stop_words=stop_words,
)
full_train_X = full_tfidf_vectorizer.fit_transform(train_df["full_text"])
full_test_X = full_tfidf_vectorizer.transform(test_df["full_text"])

# Get dense array (PCA below is given dense input)
full_test_X = full_test_X.toarray()
full_train_X = full_train_X.toarray()

# Fit/get PCs
# With the default svd_solver="auto", PCA can select the randomized solver, so
# the seed is fixed (same value as the train/test split) to keep the projection
# identical between kernel runs.
# NOTE(review): a model loaded from disk was fitted on the projection of an
# earlier run; consider persisting the fitted PCA next to the model.
full_pca = PCA(n_components=1781, random_state=29359)
full_train_X = full_pca.fit_transform(full_train_X)
full_test_X = full_pca.transform(full_test_X)
print(f"Calculated {full_pca.n_components_} components")

Calculated 1781 components


### Perform Bayes Search

In [8]:
# Cache locations for the fitted search, the best model and the CV results table
full_search_path = MODEL_SEARCH / "full_bayes.pkl"
full_model_path = OBJ_DIR / "full_ovr_svr_model.pkl"
full_search_results_path = MODEL_SEARCH / "full_bayes_search_ovr_svc.csv"

# Number of cross-validation folds; also used below to estimate total fit time
n_cv_folds = 5

if full_search_path.is_file():
    # A previous run already performed the search: load it instead of refitting.
    # NOTE: unpickling can execute arbitrary code; only load files written by
    # this notebook.
    with open(full_search_path, "rb") as bs:
        full_search: BayesSearchCV = pkl.load(bs)
    full_results = pd.read_csv(full_search_results_path)

    if full_model_path.is_file():
        with open(full_model_path, "rb") as bm:
            full_model: OneVsRestClassifier = pkl.load(bm)
    elif isinstance(full_search.best_estimator_, OneVsRestClassifier):
        # Fall back to the estimator refitted by the search itself
        full_model = full_search.best_estimator_
    else:
        warnings.warn(
            "Unable to retrieve best estimator, check top params.", UserWarning
        )
else:
    # Seeds are fixed (same value as the train/test split) so a fresh search is
    # reproducible: SVC's probability estimates and the Bayesian optimizer are
    # both randomized.
    full_svc = SVC(class_weight="balanced", probability=True, random_state=29359)
    full_ovr = OneVsRestClassifier(full_svc)
    full_search = BayesSearchCV(
        full_ovr,
        {
            "estimator__C": (1e-6, 1e6, "log-uniform"),
            "estimator__gamma": (1e-6, 1e1, "log-uniform"),
            "estimator__degree": (1, 8),  # integer valued parameter
            "estimator__kernel": ["linear", "poly", "rbf"],  # categorical parameter
        },
        n_iter=64,
        cv=n_cv_folds,
        n_jobs=-1,
        scoring="f1_macro",
        random_state=29359,
    )

    full_search.fit(full_train_X, train_y)
    full_model = full_search.best_estimator_

    # Persist everything so later runs take the cached branch above
    with open(full_search_path, "wb") as bs:
        pkl.dump(full_search, bs)
    with open(full_model_path, "wb") as bm:
        pkl.dump(full_model, bm)
    full_results = pd.DataFrame(full_search.cv_results_)
    full_results.to_csv(full_search_results_path)

# mean_fit_time is a per-fold average, so scale by the fold count for the total
fit_time = full_results["mean_fit_time"].sum() * n_cv_folds
n_models = len(full_results) * n_cv_folds
print(f"{fit_time/60:.2f} total minutes to fit all {n_models} models")

full_search

382.62 total minutes to fit all 320 models


## Score Full Text Models

In [9]:
# Predictions of the full-text model on the training split
full_train_preds = full_model.predict(full_train_X)

# Plain and class-balanced accuracy
full_train_acc, full_train_balanced_acc = (
    scorer(train_y, full_train_preds)
    for scorer in (accuracy_score, balanced_accuracy_score)
)
print(
    f"W/ Credit: Accuracy = {full_train_acc*100:.2f}%; "
    f"class-balanced accuracy = {full_train_balanced_acc*100:.2f}%"
)

# F1 under both averaging schemes
full_train_f1_micro, full_train_f1_macro = (
    f1_score(train_y, full_train_preds, average=averaging)
    for averaging in ("micro", "macro")
)
print(
    f"W/ Credit: Micro-averaged F1 = {full_train_f1_micro:.2f}; "
    f"macro-averaged F1 = {full_train_f1_macro:.2f}"
)

W/ Credit: Accuracy = 96.76%; class-balanced accuracy = 96.79%
W/ Credit: Micro-averaged F1 = 0.97; macro-averaged F1 = 0.96


In [10]:
# Binarize the training labels with the model's own label binarizer and get the
# per-class probabilities from the full-text model
train_Y = full_model.label_binarizer_.transform(train_y)
full_train_probs = full_model.predict_proba(full_train_X)

# The binarizer output is densified once and shared by both scores
train_Y_dense = train_Y.toarray()

train_micro_roc_auc_ovr = roc_auc_score(
    train_Y_dense,
    full_train_probs,
    multi_class="ovr",
    average="micro",
)
train_macro_roc_auc_ovr = roc_auc_score(
    train_Y_dense,
    full_train_probs,
    multi_class="ovr",
    average="macro",
)
# These scores belong to the full-text model, so the label is "W/ Credit",
# matching the other full-text score cells
print(
    f"W/ Credit: Micro-averaged OVR AUC-ROC = {train_micro_roc_auc_ovr:.2f}; macro-averaged OVR AUC-ROC = {train_macro_roc_auc_ovr:.2f}"
)

W/O Credit: Micro-averaged OVR AUC-ROC = 1.00; macro-averaged OVR AUC-ROC = 1.00


In [11]:
# Predictions of the full-text model on the held-out test split
full_test_preds = full_model.predict(full_test_X)

# Plain and class-balanced accuracy
full_test_acc, full_test_balanced_acc = (
    scorer(test_y, full_test_preds)
    for scorer in (accuracy_score, balanced_accuracy_score)
)
print(
    f"W/ Credit: Accuracy = {full_test_acc*100:.2f}%; "
    f"class-balanced accuracy = {full_test_balanced_acc*100:.2f}%"
)

# F1 under both averaging schemes
full_test_f1_micro, full_test_f1_macro = (
    f1_score(test_y, full_test_preds, average=averaging)
    for averaging in ("micro", "macro")
)
print(
    f"W/ Credit: Micro-averaged F1 = {full_test_f1_micro:.2f}; "
    f"macro-averaged F1 = {full_test_f1_macro:.2f}"
)

W/ Credit: Accuracy = 92.12%; class-balanced accuracy = 87.86%
W/ Credit: Micro-averaged F1 = 0.92; macro-averaged F1 = 0.88


In [12]:
# Binarize the test labels with the model's own label binarizer and get the
# per-class probabilities from the full-text model
test_Y = full_model.label_binarizer_.transform(test_y)
full_test_probs = full_model.predict_proba(full_test_X)

# The binarizer output is densified once and shared by both scores
test_Y_dense = test_Y.toarray()

test_micro_roc_auc_ovr = roc_auc_score(
    test_Y_dense,
    full_test_probs,
    multi_class="ovr",
    average="micro",
)
test_macro_roc_auc_ovr = roc_auc_score(
    test_Y_dense,
    full_test_probs,
    multi_class="ovr",
    average="macro",
)
# These scores belong to the full-text model, so the label is "W/ Credit",
# matching the other full-text score cells
print(
    f"W/ Credit: Micro-averaged OVR AUC-ROC = {test_micro_roc_auc_ovr:.2f}; macro-averaged OVR AUC-ROC = {test_macro_roc_auc_ovr:.2f}"
)

W/O Credit: Micro-averaged OVR AUC-ROC = 0.99; macro-averaged OVR AUC-ROC = 0.99


## Credit-Cleaned Text

### Pre-Processing of Credit-Cleaned Text
* Vectorize Pre-Tokenized Inputs
* Reduce Dimensions for Efficiency

In [13]:
# Define stop words
with open(OBJ_DIR / "tokenizer.pkl", "rb") as p:
    ea_tokenizer: EmbeddingAwareTokenizer = pkl.load(p)

stop_words = stopwords.words("english")
stop_words = [ea_tokenizer.tokenize(sw) for sw in stop_words]
stop_words = [token for sw_tokens in stop_words for token in sw_tokens]
stop_words = [token for token in stop_words if "<" not in token]

# Get vectors
main_text_tfidf_vectorizer = TfidfVectorizer(
    tokenizer=do_nothing,
    lowercase=False,
    token_pattern=None,
    stop_words=stop_words,
)

main_text_train_X = main_text_tfidf_vectorizer.fit_transform(
    train_df["clean_word_tokens"]
)
main_text_test_X = main_text_tfidf_vectorizer.transform(test_df["clean_word_tokens"])

# Convert to dense
main_text_test_X = main_text_test_X.toarray()
main_text_train_X = main_text_train_X.toarray()

# Get PCs
main_text_pca = PCA(n_components=1791)
main_text_train_X = main_text_pca.fit_transform(main_text_train_X)
main_text_test_X = main_text_pca.transform(main_text_test_X)
print(f"Calculated {main_text_pca.n_components_} components")

Calculated 1791 components


### Perform Bayes Search

In [14]:
# Cache locations for the fitted search, the best model and the CV results table
main_text_search_path = MODEL_SEARCH / "main_text_bayes.pkl"
main_text_model_path = OBJ_DIR / "main_text_ovr_svr_model.pkl"
main_text_search_results_path = MODEL_SEARCH / "main_text_bayes_search_ovr_svc.csv"

# Number of cross-validation folds; also used below to estimate total fit time
n_cv_folds = 5

if main_text_search_path.is_file():
    # A previous run already performed the search: load it instead of refitting.
    # NOTE: unpickling can execute arbitrary code; only load files written by
    # this notebook.
    with open(main_text_search_path, "rb") as bs:
        main_text_search: BayesSearchCV = pkl.load(bs)
    main_text_results = pd.read_csv(main_text_search_results_path)

    if main_text_model_path.is_file():
        with open(main_text_model_path, "rb") as bm:
            main_text_model: OneVsRestClassifier = pkl.load(bm)
    elif isinstance(main_text_search.best_estimator_, OneVsRestClassifier):
        # Fall back to the estimator refitted by the search itself
        main_text_model = main_text_search.best_estimator_
    else:
        warnings.warn(
            "Unable to retrieve best estimator, check top params.", UserWarning
        )

else:
    # Seeds are fixed (same value as the train/test split) so a fresh search is
    # reproducible: SVC's probability estimates and the Bayesian optimizer are
    # both randomized.
    main_text_svc = SVC(class_weight="balanced", probability=True, random_state=29359)
    main_text_ovr = OneVsRestClassifier(main_text_svc)
    main_text_search = BayesSearchCV(
        main_text_ovr,
        {
            "estimator__C": (1e-6, 1e6, "log-uniform"),
            "estimator__gamma": (1e-6, 1e1, "log-uniform"),
            "estimator__degree": (1, 8),  # integer valued parameter
            "estimator__kernel": ["linear", "poly", "rbf"],  # categorical parameter
        },
        n_iter=64,
        cv=n_cv_folds,
        n_jobs=-1,
        scoring="f1_macro",
        random_state=29359,
    )

    main_text_search.fit(main_text_train_X, train_y)
    main_text_model = main_text_search.best_estimator_

    # Persist everything so later runs take the cached branch above
    with open(main_text_search_path, "wb") as bs:
        pkl.dump(main_text_search, bs)
    with open(main_text_model_path, "wb") as bm:
        pkl.dump(main_text_model, bm)
    main_text_results = pd.DataFrame(main_text_search.cv_results_)
    main_text_results.to_csv(main_text_search_results_path)


# mean_fit_time is a per-fold average, so scale by the fold count for the total
fit_time = main_text_results["mean_fit_time"].sum() * n_cv_folds
n_models = len(main_text_results) * n_cv_folds
print(f"{fit_time/60:.2f} total minutes to fit all {n_models} models")

main_text_search

431.44 total minutes to fit all 320 models


## Score Cleaned Text Models

### Training Performance

In [15]:
# Predictions of the cleaned-text model on the training split
main_text_train_preds = main_text_model.predict(main_text_train_X)

# Plain and class-balanced accuracy
main_text_train_acc, main_text_train_balanced_acc = (
    scorer(train_y, main_text_train_preds)
    for scorer in (accuracy_score, balanced_accuracy_score)
)
print(
    f"W/O Credit: Accuracy = {main_text_train_acc*100:.2f}%; "
    f"class-balanced accuracy = {main_text_train_balanced_acc*100:.2f}%"
)

# F1 under both averaging schemes
main_text_train_f1_micro, main_text_train_f1_macro = (
    f1_score(train_y, main_text_train_preds, average=averaging)
    for averaging in ("micro", "macro")
)
print(
    f"W/O Credit: Micro-averaged F1 = {main_text_train_f1_micro:.2f}; "
    f"macro-averaged F1 = {main_text_train_f1_macro:.2f}"
)

W/O Credit: Accuracy = 98.11%; class-balanced accuracy = 98.25%
W/O Credit: Micro-averaged F1 = 0.98; macro-averaged F1 = 0.98


In [16]:
# Binarize the training labels with the model's own label binarizer and get the
# per-class probabilities from the cleaned-text model
train_Y = main_text_model.label_binarizer_.transform(train_y)
main_text_train_probs = main_text_model.predict_proba(main_text_train_X)

# One-vs-rest ROC-AUC under both averaging schemes
train_micro_roc_auc_ovr, train_macro_roc_auc_ovr = (
    roc_auc_score(
        train_Y.toarray(),
        main_text_train_probs,
        multi_class="ovr",
        average=averaging,
    )
    for averaging in ("micro", "macro")
)
print(
    f"W/O Credit: Micro-averaged OVR AUC-ROC = {train_micro_roc_auc_ovr:.2f}; "
    f"macro-averaged OVR AUC-ROC = {train_macro_roc_auc_ovr:.2f}"
)

W/O Credit: Micro-averaged OVR AUC-ROC = 1.00; macro-averaged OVR AUC-ROC = 1.00


### Testing Performance

In [17]:
# Predictions of the cleaned-text model on the held-out test split
main_text_test_preds = main_text_model.predict(main_text_test_X)

# Accuracy (plain and class-balanced)
main_text_test_acc, main_text_test_balanced_acc = (
    scorer(test_y, main_text_test_preds)
    for scorer in (accuracy_score, balanced_accuracy_score)
)
print(
    f"W/O Credit: Accuracy = {main_text_test_acc*100:.2f}%; "
    f"class-balanced accuracy = {main_text_test_balanced_acc*100:.2f}%"
)

# F1 under both averaging schemes
main_text_test_f1_micro, main_text_test_f1_macro = (
    f1_score(test_y, main_text_test_preds, average=averaging)
    for averaging in ("micro", "macro")
)
print(
    f"W/O Credit: Micro-averaged F1 = {main_text_test_f1_micro:.2f}; "
    f"macro-averaged F1 = {main_text_test_f1_macro:.2f}"
)

W/O Credit: Accuracy = 91.85%; class-balanced accuracy = 87.72%
W/O Credit: Micro-averaged F1 = 0.92; macro-averaged F1 = 0.88


In [18]:
# Binarize the test labels with the model's own label binarizer and get the
# per-class probabilities from the cleaned-text model
test_Y = main_text_model.label_binarizer_.transform(test_y)
main_text_test_probs = main_text_model.predict_proba(main_text_test_X)

# One-vs-rest ROC-AUC under both averaging schemes
test_micro_roc_auc_ovr, test_macro_roc_auc_ovr = (
    roc_auc_score(
        test_Y.toarray(),
        main_text_test_probs,
        multi_class="ovr",
        average=averaging,
    )
    for averaging in ("micro", "macro")
)
print(
    f"W/O Credit: Micro-averaged OVR AUC-ROC = {test_micro_roc_auc_ovr:.2f}; "
    f"macro-averaged OVR AUC-ROC = {test_macro_roc_auc_ovr:.2f}"
)

W/O Credit: Micro-averaged OVR AUC-ROC = 0.99; macro-averaged OVR AUC-ROC = 0.99
