In [None]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

In [5]:
# Load the preprocessed dataset; the modelling cell below reads its
# "cleaned_text" and "target" columns.
data_preprocessed = pd.read_csv(filepath_or_buffer="test-data/preprocessed")

In [6]:


# Text feature column and label column from the preprocessed frame.
X_text = data_preprocessed["cleaned_text"]
y_target = data_preprocessed["target"]

# Cross-validation configuration.
N_SPLITS = 5
M = 0.001  # min_df for CountVectorizer (a float is treated as a proportion of documents)
f1_scores = []
kf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

for i, (train_index, val_index) in enumerate(kf.split(X=X_text, y=y_target)):
    print(f"--- Processing Fold {i+1}/{N_SPLITS} ---")

    # Slice the text and the labels for this fold.
    X_train_text = X_text.iloc[train_index]
    X_val_text = X_text.iloc[val_index]
    y_train = y_target.iloc[train_index]
    y_val = y_target.iloc[val_index]

    # The vocabulary is fitted on the training split only and then applied
    # unchanged to the validation split.
    vectorizer = CountVectorizer(binary=True, min_df=M)
    X_train_features = vectorizer.fit_transform(X_train_text)
    X_val_features = vectorizer.transform(X_val_text)

    # L2-regularised logistic regression on the binary bag-of-words features.
    logreg_l2 = LogisticRegression(penalty="l2", solver="liblinear", random_state=42)
    model_logreg_l2 = logreg_l2.fit(X_train_features, y_train)

    # Score this fold on its held-out validation split.
    y_pred = model_logreg_l2.predict(X_val_features)
    fold_f1 = f1_score(y_val, y_pred)
    f1_scores.append(fold_f1)

# Summary of the per-fold scores.
print(
    f"Individual F1-Scores: {f1_scores}",
    f"Mean F1-Score across {N_SPLITS} folds: {np.mean(f1_scores):.4f}",
    f"Standard Deviation of F1-Score: {np.std(f1_scores):.4f}",
    sep="\n",
)

--- Processing Fold 1/5 ---
--- Processing Fold 2/5 ---
--- Processing Fold 3/5 ---
--- Processing Fold 4/5 ---
--- Processing Fold 5/5 ---
Individual F1-Scores: [0.875, 0.8666666666666667, 0.8148148148148148, 0.9333333333333333, 0.782608695652174]
Mean F1-Score across 5 folds: 0.8545
Standard Deviation of F1-Score: 0.0520


In [7]:
# Persist the final model and its vectorizer.
#
# The cross-validation loop only estimates generalisation performance. The
# `model_logreg_l2` / `vectorizer` objects it leaves behind belong to the last
# fold and were fitted on a subset of the rows. The artefacts written here are
# therefore refitted on the full dataset, using the same hyperparameters as
# the cross-validated estimator.
model_filename = 'models/trained_logistic_regression_model.pkl'
vectorizer_filename = 'models/trained_count_vectorizer.pkl'

final_vectorizer = CountVectorizer(binary=True, min_df=M)
X_all_features = final_vectorizer.fit_transform(X_text)

final_model = LogisticRegression(penalty="l2", solver="liblinear", random_state=42)
final_model.fit(X_all_features, y_target)

# Create the output directory if needed so the cell also works on a fresh checkout.
for artifact_path in (model_filename, vectorizer_filename):
    Path(artifact_path).parent.mkdir(parents=True, exist_ok=True)

with open(model_filename, 'wb') as file:
    pickle.dump(final_model, file)

with open(vectorizer_filename, 'wb') as file:
    pickle.dump(final_vectorizer, file)
