In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [2]:
# Load the preprocessed dataset from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/fakenewscorpus/preprocessed_data.csv")

# Fail fast if the file does not have the expected shape: both columns used
# below must exist and must not contain missing values. Reporting the problem
# here, next to the load, is easier to diagnose than an error raised from a
# later cell that consumes X or y.
required_columns = ["content", "label"]
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        "preprocessed_data.csv is missing required column(s): {}".format(missing_columns)
    )

null_counts = data[required_columns].isna().sum()
if null_counts.any():
    raise ValueError(
        "preprocessed_data.csv contains missing values: {}".format(null_counts.to_dict())
    )

# Splitting into X (features) and y (target)
X = data['content']  # text column fed to the vectorizer
y = data['label']    # target column used for training and evaluation

In [3]:
# First split: hold out 10% of all rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# Second split: take 10% of the remaining rows as the validation set
# (roughly 9% of the full dataset); what is left is the training set.
# The same fixed random_state is used for both splits.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [4]:
# Turn the raw text into token-count matrices.
# The vocabulary is learned from the training split only; the validation and
# test splits are encoded with that already-fitted vocabulary.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_val_vec, X_test_vec = (
    vectorizer.transform(texts) for texts in (X_val, X_test)
)

# Persist the fitted vectorizer to disk next to the notebook.
joblib.dump(vectorizer, "nb_count_vec.pkl")

['nb_count_vec.pkl']

In [5]:
# Fit a multinomial Naive Bayes model on the training count matrix.
# fit() returns the estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(X_train_vec, y_train)

# Persist the trained model to disk next to the notebook.
joblib.dump(classifier, "nb_count_classifier.pkl")

['nb_count_classifier.pkl']

In [6]:
# --- Validation split ---
# score() is evaluated on the held-out validation matrix and printed.
val_score = classifier.score(X_val_vec, y_val)
print("Validation Score:", val_score)

# --- Test split ---
# Predict once, then derive both the accuracy and the per-class report
# from the same predictions.
y_pred = classifier.predict(X_test_vec)
test_accuracy = accuracy_score(y_test, y_pred)
report_test = classification_report(y_test, y_pred)

# Show the test metrics in the notebook output.
print("Test Accuracy:", test_accuracy)
print("\nTest Classification Report:")
print(report_test)

# Write the same test metrics to a text file (the validation score is
# printed above but is not part of the saved report).
with open("nb_count_report.txt", "w") as file:
    file.writelines(
        (
            "Test Accuracy: {}\n\n".format(test_accuracy),
            "Test Classification Report:\n",
            report_test,
        )
    )

Validation Score: 0.8400766100071821
Test Accuracy: 0.840555647093119

Test Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.79      0.82     36472
         1.0       0.83      0.88      0.86     42427

    accuracy                           0.84     78899
   macro avg       0.84      0.84      0.84     78899
weighted avg       0.84      0.84      0.84     78899

