In [20]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline

In [6]:
# Read the labelled examples from the CSV file into a DataFrame.
csv_path = "Hypocricy.csv"
df = pd.read_csv(csv_path)

In [7]:
# Discard every row that is missing its text or its label; both columns
# are read by the cells that follow.
required_columns = ["text", "label"]
df = df.dropna(subset=required_columns)

In [8]:
# Separate the model input (the "text" column) from the target (the "label" column).
X, y = df["text"], df["label"]

In [9]:
# Turn the text into TF-IDF features built from unigrams and bigrams,
# keeping at most 5000 terms.
# NOTE: the vocabulary and IDF weights are learned from every row of X, so
# X_tfidf carries information from all documents; it is appropriate for
# training a final model but not for scoring on held-out subsets of itself.
tfidf_params = {"ngram_range": (1, 2), "max_features": 5000}
vectorizer = TfidfVectorizer(**tfidf_params)
X_tfidf = vectorizer.fit_transform(X)

In [15]:
# Random forest classifier with 100 trees; the fixed random_state keeps
# repeated runs reproducible.
forest_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**forest_params)

In [18]:
# Perform stratified 5-fold cross-validation.
#
# The vectorizer and the classifier are evaluated together as one pipeline
# on the raw text, so TF-IDF (vocabulary selection and IDF weights) is
# re-fitted on the training part of every fold only. Scoring a matrix that
# was vectorized on the full dataset would let each fold's held-out
# documents shape the features and bias the F1 estimate upward.
#
# cross_val_score clones the pipeline for every fold, so the `vectorizer`
# and `rf_model` objects themselves are left unmodified for later cells.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_pipeline = make_pipeline(vectorizer, rf_model)
scores = cross_val_score(cv_pipeline, X, y, cv=cv, scoring='f1')

In [21]:
# Report the F1 score of each fold followed by their mean.
print("Cross-Validation F1 Scores:", scores)
print("Average F1 Score:", scores.mean())

Cross-Validation F1 Scores: [1. 1. 1. 1. 1.]
Average F1 Score: 1.0


In [22]:
# Train the final model on the TF-IDF features of all rows, then persist
# both the classifier and the fitted vectorizer with joblib. The two files
# are written separately, so new text has to be transformed with the saved
# vectorizer before it is passed to the saved model.
model_path = "rf_hypocrisy_model_cv.pkl"
vectorizer_path = "tfidf_vectorizer_cv.pkl"

rf_model.fit(X_tfidf, y)
joblib.dump(rf_model, model_path)
joblib.dump(vectorizer, vectorizer_path)

['tfidf_vectorizer_cv.pkl']