In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import joblib

In [2]:
# Load dataset
df = pd.read_csv("Ignorance.csv")

# Drop missing values if any
df.dropna(subset=["text", "label"], inplace=True)

In [3]:
# Extract features and labels
X = df["text"]
y = df["label"]

In [4]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

In [5]:
# Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [6]:
# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf_model, X_tfidf, y, cv=cv, scoring='f1')

In [7]:
# Results
print("Cross-Validation F1 Scores:", scores)
print("Average F1 Score:", np.mean(scores))

Cross-Validation F1 Scores: [1.         0.90909091 0.90909091 1.         1.        ]
Average F1 Score: 0.9636363636363636


In [8]:
# Fit on full dataset and save
rf_model.fit(X_tfidf, y)
joblib.dump(rf_model, "rf_ignorance_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer_ignorance.pkl")

['tfidf_vectorizer_ignorance.pkl']