****Import Libraries****

In [12]:
import os
import pickle
import re
from pathlib import Path

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

****Download Stopwords****

In [2]:
# Fetch the NLTK stopword corpus only when it is not already installed, so
# re-running the notebook does not need network access every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Stored as a set so the per-token membership test in clean_text stays cheap.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\najam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


****Load Dataset****

In [3]:
# Directory that holds Fake.csv and True.csv. It defaults to the original
# local checkout; set the FAKENEWS_DATA_DIR environment variable to run the
# notebook against a copy of the data stored somewhere else.
DATA_DIR = Path(os.environ.get(
    'FAKENEWS_DATA_DIR',
    'C:\\Users\\najam\\PycharmProjects\\FakeNews_Detection\\fakenews_dataset',
))

fake_df = pd.read_csv(DATA_DIR / 'Fake.csv')
true_df = pd.read_csv(DATA_DIR / 'True.csv')

# Add labels to news: 0 marks rows from Fake.csv, 1 marks rows from True.csv
fake_df['label'] = 0
true_df['label'] = 1

# Merge both dataframes, then shuffle the rows with a fixed seed so the
# ordering is reproducible, and rebuild a clean 0..n-1 index.
dataset = (
    pd.concat([fake_df, true_df], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

****Cleaning the data****

In [4]:
def clean_text(text):
    """Normalise one document for vectorisation.

    Non-string input (for example a missing value) yields an empty string.
    Otherwise the text is lower-cased, every character that is not an ASCII
    letter or whitespace is replaced by a space, whitespace runs are
    collapsed, and tokens found in the module-level ``stop_words``
    collection are dropped.

    Returns:
        The remaining tokens joined by single spaces (possibly '').
    """
    if not isinstance(text, str):
        return ''

    lowered = text.lower()
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', lowered)    # Remove punctuation/numbers
    collapsed = re.sub(r'\s+', ' ', letters_only).strip()

    kept_tokens = []
    for token in collapsed.split():
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Normalise every article body, then discard rows whose text ended up empty
dataset['text'] = dataset['text'].apply(clean_text)
non_empty_rows = dataset['text'].str.strip().ne('')
dataset = dataset.loc[non_empty_rows].reset_index(drop=True)

****Split dataset into train, test and validation****

In [5]:
# Hold out 20% of the rows as the test set, stratified on the label column.
train_valid, test = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
    stratify=dataset['label'],
)

# From the remaining rows, set 10% aside for validation (also stratified).
train, valid = train_test_split(
    train_valid,
    test_size=0.1,
    random_state=42,
    stratify=train_valid['label'],
)

****Vectorize dataset****

In [6]:
# Targets for each split
y_train, y_valid, y_test = train['label'], valid['label'], test['label']

# TF-IDF features, capped at 5000 terms via max_features
vectorizer = TfidfVectorizer(max_features=5000)

# Fit on the training texts only; validation and test texts are merely
# transformed with the already-fitted vectorizer.
X_train = vectorizer.fit_transform(train['text'])
X_valid, X_test = (vectorizer.transform(part['text']) for part in (valid, test))

****Train the Model****

In [8]:
# Logistic regression with the library's default settings, fitted on the
# TF-IDF training matrix.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

# Validation performance: overall accuracy first, then the per-class report
y_pred = model.predict(X_valid)
validation_accuracy = accuracy_score(y_valid, y_pred)
validation_report = classification_report(y_valid, y_pred)
print("Validation Accuracy: ", validation_accuracy)
print(validation_report)

Validation Accuracy:  0.9906832298136646
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1828
           1       0.99      0.99      0.99      1714

    accuracy                           0.99      3542
   macro avg       0.99      0.99      0.99      3542
weighted avg       0.99      0.99      0.99      3542



****Evaluation on Test Set****

In [9]:
# Score the fitted model on the held-out test split: overall accuracy first,
# then the per-class precision/recall/F1 report.
y_test_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_report = classification_report(y_test, y_test_pred)
print("Test Accuracy: ", test_accuracy)
print(test_report)

Test Accuracy:  0.9881409532414728
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4570
           1       0.98      0.99      0.99      4284

    accuracy                           0.99      8854
   macro avg       0.99      0.99      0.99      8854
weighted avg       0.99      0.99      0.99      8854



****Save Model****


In [13]:
# Output folder for the pickled artifacts. Create it up front: open(..., 'wb')
# raises FileNotFoundError when the parent directory does not exist.
MODEL_DIR = Path('model')
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Save the classifier and the fitted vectorizer side by side; both are needed
# to turn raw text into a prediction later.
for artifact_name, artifact in (('model.pkl', model), ('vectorizer.pkl', vectorizer)):
    with open(MODEL_DIR / artifact_name, 'wb') as f:
        pickle.dump(artifact, f)