## Comparing Models

##### We will compare the three models we've trained using:

###### Aggregate metrics
###### Performance visualizations
###### Dataset visualization

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from pathlib import Path
import joblib
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.calibration import calibration_curve
from sklearn.metrics import brier_score_loss

import sys
sys.path.append("..")
np.random.seed(42)
import warnings
warnings.filterwarnings("ignore")

from ml_editor.data_processing import format_raw_df, get_split_by_author, get_vectorized_series, get_feature_vector_and_label

from ml_editor.model_evaluation import get_feature_importance, get_calibration_plot
from ml_editor.model_v2 import POS_NAMES

data_path=Path("../data/writers_with_features.csv")
df=pd.read_csv(data_path)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\msi/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
clf_1=joblib.load(Path("../models/model_1.pkl"))
vectorizer_1=joblib.load(Path("../models/vectorizer_1.pkl"))
clf_2=joblib.load(Path("../models/model_2.pkl"))
vectorizer_2=joblib.load(Path("../models/vectorizer_2.pkl"))

clf_3=joblib.load(Path("../models/model_3.pkl"))

In [3]:
train_df, test_df=get_split_by_author(df, test_size=0.2, random_state=42)

In [4]:
train_df["vectors"]=get_vectorized_series(train_df["full_text"].copy(), vectorizer_1)
test_df["vectors"]=get_vectorized_series(test_df["full_text"].copy(), vectorizer_1)

features_1=[
    "action_verb_full",
    "question_mark_full",
    "text_len",
    "language_question"
]

features_2=[
    "num_questions",
    "num_periods",
    "num_commas",
    "num_exclam",
    "num_quotes",
    "num_colon",
    "num_stops",
    "num_semicolon",
    "num_words",
    "num_chars",
    "num_diff_words",
    "avg_word_len",
    "polarity"
]

features_2.extend(POS_NAMES.keys())

X_test_1, y_test=get_feature_vector_and_label(test_df,features_1)
X_test_2, y_test=get_feature_vector_and_label(test_df, features_2)
X_test_3=test_df[features_2].astype(float)

In [5]:
clf1_predicted_proba=clf_1.predict_proba(X_test_1)
clf2_predicted_proba=clf_2.predict_proba(X_test_2)
clf3_predicted_proba=clf_3.predict_proba(X_test_3)

clf1_predicted=clf_1.predict(X_test_1)
clf2_predicted=clf_2.predict(X_test_2)
clf3_predicted=clf_3.predict(X_test_3)

ValueError: X has 7587 features, but RandomForestClassifier is expecting 7586 features as input.

## Comparing accuracy

In [None]:
def get_metrics(y_test, y_predicted):
    # true positive / (true_positive + false positive)
    precision=precision_score(y_test, y_predicted, pos_label=True, average='binary')
    # true positives / (true positives + false negatives)
    recall=recall_score(y_test, y_predicted, pos_label=True, average='binary')

    # harmonic mean of precision and recall
    f1=f1_score(y_test,  y_predicted, pos_label=True, average='binary')

    # true positives + true negatives / total
    accuracy=accuracy_score(y_test, y_predicted)
    return f1, precision, recall, accuracy


for i, y_predicted in enumerate([clf1_predicted, clf2_predicted, clf3_predicted]):
    accuracy, precision, recall, f1=get_metrics(y_test, y_predicted)

    print(f"Model Validation Accuracy: {accuracy:.3f}, precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")

## comparing feature importance

In [7]:
def display_importance(clf, feature_names, k=10):
    print(f"Top {k} importances:\n")
    print('\n'.join([f"{tup[0]}: {tup[1]:.2g}" for tup in get_feature_importance(clf, feature_names)[:k]]))

    print(f"\nBottom {k} importances:\n")
    print('\n'.join([f"{tup[0]}: {tup[1]:.2g}" for tup in get_feature_importance(clf, feature_names)[-k:]]))


In [None]:
print("Model 1")
w_indices=vectorizer_1.get_feature_names_out()
w_indices.extend(features_1)
all_features_1=np.array(w_indices)
display_importance(clf_1, all_features_1)

In [None]:
print("Model 2")
w_indices=vectorizer_2.get_feature_names_out()
w_indices.extend(features_2)
all_features_2=np.array(w_indices)
display_importance(clf_2, all_features_2)

In [None]:
print("Model 3")
display_importance(clf_3, np.array(features_2))

## Comaping callibration

In [None]:
def get_multiple_calibration_plot(predicted_proba_arrays, true_y, figsize=(10, 8)):
    """
    :param figsize: size of the output figure
    :param predicted_proba_y: the predicted probabilities of our model for each example
    :param true_y: the true value of the label
    :return: calibration plot
    """
    plt.figure(figsize=figsize)
    ax1=plt.subplot2grid((3,1), (0, 0), rowspan=2)
    ax2=plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    for i, predicted_proba_y in enumerate(predicted_proba_arrays):
        fraction_of_positives, mean_predicted_value=calibration_curve(
            true_y, predicted_proba_y, n_bins=10
        )

        ax1.plot(
            mean_predicted_value,
            fraction_of_positives,
            "s-",
            label="Model %s" % (i + 1)
        )
        ax2.hist(
            predicted_proba_y,
            range=(0, 1),
            bins=10,
            histtype="step",
            label="Model %s" % (i + 1),
            lw=2
        )

    ax1.set_ylabel("Fraction of positives")
    ax1.set_xlim([0, 1])
    ax1.set_ylim([0, 1])
    ax1.legend(loc="lower right")
    ax1.set_title("Calibration plot")


    ax2.set_title("Probability distribution")
    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")
    ax2.legend(loc="upper right", ncol=2)

    plt.tight_layout()


In [None]:
predictions = [clf1_predicted_proba[:,1], clf2_predicted_proba[:,1], clf3_predicted_proba[:,1]]

get_multiple_calibration_plot(predictions, y_test)

In [None]:
get_calibration_plot(clf1_predicted_proba[:, 1], y_test, figsize=(10, 8))

In [None]:
get_calibration_plot(clf2_predicted_proba[:, 1], y_test, figsize=(10, 8))

In [None]:
get_calibration_plot(clf3_predicted_proba[:, 1], y_test, figsize=(10, 8))