In [3]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# TRUE_DATASET
true_data = pd.read_csv(r"C:\Users\nh013\Desktop\fake and real news dataset\True.csv")

# FAKE_DATASET
fake_data = pd.read_csv(r"C:\Users\nh013\Desktop\fake and real news dataset\Fake.csv")

# LABEL 1 TRUE AND LABEL 0 FAKE
true_data['label'] = 1
fake_data['label'] = 0

# COMBINE DATASET
combined_data = pd.concat([true_data, fake_data], ignore_index=True)

# EXCHANGE THE COMBINE DATASET
combined_data = combined_data.sample(frac=1, random_state=42).reset_index(drop=True)

# CONVERT TO LOWERCASE, REMOVE SPECIAL CHARACTERS, TOKENIZATION
combined_data['text'] = combined_data['text'].str.lower()
combined_data['text'] = combined_data['text'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))
combined_data['text'] = combined_data['text'].apply(lambda x: word_tokenize(x))

# REMOVE STOPWORDS, LEMMATIZATION
stop_words = set(stopwords.words('english'))
combined_data['text'] = combined_data['text'].apply(lambda x: [word for word in x if word not in stop_words])

lemmatizer = WordNetLemmatizer()
combined_data['text'] = combined_data['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# CONVERT TOKENIZED TEXT TO STRING
combined_data['text'] = combined_data['text'].apply(lambda x: ' '.join(x))

# TF-IDF VECTORIZATION
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(combined_data['text'])
y = combined_data['label']

# SPLIT DATA
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TRAIN NAIVE BAYES CLASSIFIER MODEL
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# PREDICT ON TEST DATA
y_pred = nb_classifier.predict(X_test)

# EVALUATE THE MODEL
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Accuracy: 0.939977728285078
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.93      0.94      4669
           1       0.93      0.95      0.94      4311

    accuracy                           0.94      8980
   macro avg       0.94      0.94      0.94      8980
weighted avg       0.94      0.94      0.94      8980

