## Inspect Feature Importance

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from pathlib import Path
import joblib

import sys
sys.path.append("..")
import warnings
warnings.filterwarnings("ignore")

from ml_editor.data_processing import (
    format_raw_df,
    get_split_by_author,
    add_text_features_to_df,
    get_vectorized_series,
    get_feature_vector_and_label
)

from ml_editor.model_evaluation import get_feature_importance

# Read the raw CSV and hand a copy of it to the project's formatter,
# so `df` is bound once, directly to the formatted frame.
data_path = Path("../data/writers.csv")
df = format_raw_df(pd.read_csv(data_path).copy())


In [2]:
# Keep only the rows selected by the `is_question` mask, working on an
# independent copy so the feature step does not touch the unfiltered frame.
question_mask = df["is_question"]
questions_df = df.loc[question_mask].copy()
df = add_text_features_to_df(questions_df)

# Fixed random_state keeps the train/test partition the same across runs.
train_df, test_df = get_split_by_author(
    df,
    test_size=0.2,
    random_state=42,
)


In [3]:
# Locations of the persisted model and its vectorizer.
model_path = Path("../models/model_1.pkl")
vectorizer_path = Path("../models/vectorizer_1.pkl")

# NOTE: joblib.load unpickles its input; only load artifacts from a trusted source.
clf = joblib.load(model_path)
vectorizer = joblib.load(vectorizer_path)

In [4]:
# Add a "vectors" column to each split, computed from a copy of its
# "full_text" column with the loaded vectorizer (train first, then test).
for split_df in (train_df, test_df):
    split_df["vectors"] = get_vectorized_series(split_df["full_text"].copy(), vectorizer)

# Names of the extra feature columns passed alongside the text vectors.
features = [
    "action_verb_full",
    "question_mark_full",
    "text_len",
    "language_question",
]

# Each call is unpacked into an (X, y) pair for the corresponding split.
X_train, y_train = get_feature_vector_and_label(train_df, features)
X_test, y_test = get_feature_vector_and_label(test_df, features)

In [7]:
# Vectorizer vocabulary names first, then the extra feature names.
# NOTE(review): this order presumably has to match the column order produced
# by get_feature_vector_and_label — confirm against that helper.
w_indices = [*vectorizer.get_feature_names_out(), *features]
all_feature_names = np.array(w_indices)

In [8]:
# Number of entries to show from each end of the ranking.
k = 10

# Compute the ranking once and slice it twice; the original called
# get_feature_importance separately for the top and the bottom listing,
# repeating identical work on identical inputs.
ranked_importances = get_feature_importance(clf, all_feature_names)

# Each entry is indexed as (name, score): [0] is printed as the label and
# [1] is formatted to two significant digits.
print(f"Top {k} importances:\n")
print('\n'.join(f"{entry[0]}: {entry[1]:.2g}" for entry in ranked_importances[:k]))

print(f"\nBottom {k} importances:\n")
print('\n'.join(f"{entry[0]}: {entry[1]:.2g}" for entry in ranked_importances[-k:]))


Top 10 importances:

text_len: 0.0086
are: 0.0056
what: 0.0051
writing: 0.0049
ve: 0.0047
story: 0.0043
can: 0.0042
do: 0.0041
don: 0.0039
with: 0.0038

Bottom 10 importances:

knit: 0
balancing: 0
smiling: 0
smoke: 0
smoking: 0
snake: 0
labor: 0
labels: 0
societies: 0
comforting: 0
