## Training a simple model

In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
import joblib
import sys
# Add the parent directory to the import search path; presumably this is what
# lets the `ml_editor` package imported in this cell resolve -- confirm layout.
sys.path.append("..")
# Seed NumPy's global random number generator for the session.
np.random.seed(42)
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, so deprecation and pandas copy warnings will not be shown.
warnings.filterwarnings('ignore')

from ml_editor.data_processing import (
    format_raw_df,
    add_text_features_to_df,
    get_feature_vector_and_label,
    get_split_by_author,
    get_vectorized_input_and_label,
    get_vectorized_series,
    train_vectorizer
)

from ml_editor.model_v1 import get_model_probabilities_for_input_texts


# Raw string: the Windows path contains "\P", "\d" and "\w", which are invalid
# escape sequences in a normal literal (SyntaxWarning today, an error in a
# future Python). The resulting path value is unchanged.
# NOTE(review): this is an absolute, machine-specific location; consider a
# path relative to the repository once the layout is confirmed.
data_path = Path(r"D:\Project 1\data\writers.csv")

# Keep each stage under its own name so re-running the cell is idempotent.
raw_df = pd.read_csv(data_path)
formatted_df = format_raw_df(raw_df.copy())

# Keep only the rows flagged by the boolean "is_question" column.
df = formatted_df.loc[formatted_df["is_question"]].copy()


In [2]:
# Add the text features on a copy, then split into train/test frames.
df = add_text_features_to_df(df.copy())
train_df, test_df = get_split_by_author(df, test_size=0.2, random_state=42)

# The vectorizer is built from the training frame only and then applied to the
# "full_text" column of both splits.
vectorizer = train_vectorizer(train_df)
for split_df in (train_df, test_df):
    split_df["vectors"] = get_vectorized_series(split_df["full_text"].copy(), vectorizer)

In [3]:
# Names of the feature columns passed to get_feature_vector_and_label.
features = ["action_verb_full", "question_mark_full", "text_len", "language_question"]

# Build (X, y) for the training split first, then for the test split.
(X_train, y_train), (X_test, y_test) = (
    get_feature_vector_and_label(split, features) for split in (train_df, test_df)
)

In [4]:
# Pin random_state so that re-running this cell on its own rebuilds the same
# forest; without it the estimator draws from NumPy's global RNG, whose state
# depends on everything executed since np.random.seed was called.
clf = RandomForestClassifier(
    n_estimators=100,
    class_weight="balanced",
    oob_score=True,  # keep out-of-bag estimates for the training-set check later
    random_state=42,
)
clf.fit(X_train, y_train)

# Hard predictions and per-class probabilities for the held-out split.
y_predicted = clf.predict(X_test)
y_predicted_proba = clf.predict_proba(X_test)

In [5]:
# Show how many training labels fall into each class.
y_train.value_counts()

Score
False    3327
True     2889
Name: count, dtype: int64

## Metrics

In [6]:
def get_metrics(y_test, y_predicted):
    """Score a set of predictions against the true labels.

    Precision, recall and F1 are requested with ``average="weighted"`` and
    ``pos_label=None``; accuracy is the plain fraction of matching labels.

    Args:
        y_test: true labels.
        y_predicted: predicted labels, aligned with ``y_test``.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    # Same averaging options for all three per-class metrics.
    weighted_kwargs = {"pos_label": None, "average": "weighted"}

    # (true positive + true negative) / total
    accuracy = accuracy_score(y_test, y_predicted)
    # true positive / (true positive + false positive)
    precision = precision_score(y_test, y_predicted, **weighted_kwargs)
    # true positive / (true positive + false negative)
    recall = recall_score(y_test, y_predicted, **weighted_kwargs)
    # harmonic mean of precision and recall
    f1 = f1_score(y_test, y_predicted, **weighted_kwargs)

    return accuracy, precision, recall, f1

In [7]:
# oob_decision_function_ has one row of out-of-bag class probabilities per
# training sample, with columns ordered like clf.classes_. argmax returns
# column positions, so map them back through clf.classes_ to get predictions
# in the same label space as y_train instead of raw 0/1 indices.
oob_class_index = np.argmax(clf.oob_decision_function_, axis=1)
y_train_pred = clf.classes_[oob_class_index]

accuracy, precision, recall, f1=get_metrics(y_train, y_train_pred)
print(f"Training accuracy: {accuracy:.3f}, precision: {precision:.3f}, recall: {recall:.3f}")

Training accuracy: 0.606, precision: 0.604, recall: 0.606


In [8]:
# Score the test-split predictions stored in y_predicted.
validation_scores = get_metrics(y_test, y_predicted)
accuracy, precision, recall, f1 = validation_scores
print(f"Validation accuracy: {accuracy:.3f}, precision: {precision:.3f}, recall: {recall:.3f}, f1: {f1:.3f}")

Validation accuracy: 0.592, precision: 0.591, recall: 0.592, f1: 0.591


In [9]:
model_path = Path("../models/model_1.pkl")
vectorizer_path = Path("../models/vectorizer_1.pkl")

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing (no-op when it is already there).
model_path.parent.mkdir(parents=True, exist_ok=True)
vectorizer_path.parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier and the fitted vectorizer side by side.
joblib.dump(clf, model_path)
joblib.dump(vectorizer, vectorizer_path)

['..\\models\\vectorizer_1.pkl']

## Inference function

In [12]:
# Run the packaged inference helper on a single example question.
# NOTE(review): the saved output of this cell is
# "ValueError: setting an array element with a sequence." and no traceback is
# recorded; presumably it is raised inside
# get_model_probabilities_for_input_texts -- confirm before relying on this cell.
# NOTE(review): the helper is not given `clf` or `vectorizer`, so it presumably
# loads its own artifacts -- verify they match the ones saved by this notebook.
test_q=["bad question"]
probs=get_model_probabilities_for_input_texts(test_q)

# probs[0] is the result for the only input text; index 1 is taken as the
# positive ("high score") class.
print(f"Probability of questions receiving a high score according to our model : {(probs[0][1])}")

ValueError: setting an array element with a sequence.