In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import joblib

In [2]:
# Load dataset
df = pd.read_csv("Attacking.csv")

In [3]:
# Drop missing entries
df.dropna(subset=["text", "label"], inplace=True)

In [4]:
# Features and labels
X = df["text"]
y = df["label"]

In [5]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_tfidf = vectorizer.fit_transform(X)

# Define model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [6]:
# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(rf_model, X_tfidf, y, cv=cv, scoring='f1')

In [7]:
# Output results
print("Cross-Validation F1 Scores:", scores)
print("Average F1 Score:", np.mean(scores))

Cross-Validation F1 Scores: [0.90909091 1.         0.90909091 0.90909091 1.        ]
Average F1 Score: 0.9454545454545455


In [8]:
# Train on full data and save model
rf_model.fit(X_tfidf, y)
joblib.dump(rf_model, "rf_attacking_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer_attacking.pkl")

['tfidf_vectorizer_attacking.pkl']