In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


file_path = '/content/drive/MyDrive/notebooks/fakenews.csv'
df = pd.read_csv(file_path)
df = df.dropna(subset=['text'])

print("\n\nOriginal Text:\n\n", df['text'].iloc[0])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




Original Text: House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Commons-BY license) 
With apologies to Keith Olbermann, there is no doubt who the Worst Person in The World is this week–FBI Director James Comey. But according to a House Democratic aide, it looks like we also know who the second-worst person is as well. It turns out that when Comey sent his now-infamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server, the ranking Democrats on the relevant committees didn’t hear about it from Comey. They found out via a tweet from one of the Republican committee chairmen. 
As we now know, Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence, Judiciary, and Oversight committees that his agency was 

### **Removing punctuation**


In [4]:
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df['text'] = df['text'].apply(remove_punctuation)


print("\n\nText after removing punctuation:\n\n", df['text'].iloc[0])



Text after removing punctuation: House Dem Aide We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30 2016 Subscribe Jason Chaffetz on the stump in American Fork Utah  image courtesy Michael Jolley available under a Creative CommonsBY license 
With apologies to Keith Olbermann there is no doubt who the Worst Person in The World is this week–FBI Director James Comey But according to a House Democratic aide it looks like we also know who the secondworst person is as well It turns out that when Comey sent his nowinfamous letter announcing that the FBI was looking into emails that may be related to Hillary Clinton’s email server the ranking Democrats on the relevant committees didn’t hear about it from Comey They found out via a tweet from one of the Republican committee chairmen 
As we now know Comey notified the Republican chairmen and Democratic ranking members of the House Intelligence Judiciary and Oversight committees that his agency was r

### **Removing stop words**


In [5]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]
    return ' '.join(tokens)

df['text'] = df['text'].apply(remove_stopwords)


print("\n\nText after removing stop-words:\n\n", df['text'].iloc[0])




Text after removing stop-words:

 House Dem Aide ’ Even See Comey ’ Letter Jason Chaffetz Tweeted Darrell Lucus October 30 2016 Subscribe Jason Chaffetz stump American Fork Utah image courtesy Michael Jolley available Creative CommonsBY license apologies Keith Olbermann doubt Worst Person World week–FBI Director James Comey according House Democratic aide looks like also know secondworst person well turns Comey sent nowinfamous letter announcing FBI looking emails may related Hillary Clinton ’ email server ranking Democrats relevant committees ’ hear Comey found via tweet one Republican committee chairmen know Comey notified Republican chairmen Democratic ranking members House Intelligence Judiciary Oversight committees agency reviewing emails recently discovered order see contained classified information long letter went Oversight Committee Chairman Jason Chaffetz set political world ablaze tweet FBI Dir informed FBI learned existence emails appear pertinent investigation Case reope

### **Removing ’**

In [6]:
def remove_special_characters(text):
    return text.replace('’', '')


df['text'] = df['text'].apply(remove_special_characters)

print("\n\nText after removing special characters:\n\n", df['text'].iloc[0])



Text after removing special characters:

 House Dem Aide  Even See Comey  Letter Jason Chaffetz Tweeted Darrell Lucus October 30 2016 Subscribe Jason Chaffetz stump American Fork Utah image courtesy Michael Jolley available Creative CommonsBY license apologies Keith Olbermann doubt Worst Person World week–FBI Director James Comey according House Democratic aide looks like also know secondworst person well turns Comey sent nowinfamous letter announcing FBI looking emails may related Hillary Clinton  email server ranking Democrats relevant committees  hear Comey found via tweet one Republican committee chairmen know Comey notified Republican chairmen Democratic ranking members House Intelligence Judiciary Oversight committees agency reviewing emails recently discovered order see contained classified information long letter went Oversight Committee Chairman Jason Chaffetz set political world ablaze tweet FBI Dir informed FBI learned existence emails appear pertinent investigation Case r

### **Lemmatization**

In [7]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df['text'] = df['text'].apply(lemmatize_text)


print("\n\nText after lemmatization:\n\n", df['text'].iloc[0])



Text after lemmatization:

 House Dem Aide Even See Comey Letter Jason Chaffetz Tweeted Darrell Lucus October 30 2016 Subscribe Jason Chaffetz stump American Fork Utah image courtesy Michael Jolley available Creative CommonsBY license apology Keith Olbermann doubt Worst Person World week–FBI Director James Comey according House Democratic aide look like also know secondworst person well turn Comey sent nowinfamous letter announcing FBI looking email may related Hillary Clinton email server ranking Democrats relevant committee hear Comey found via tweet one Republican committee chairman know Comey notified Republican chairman Democratic ranking member House Intelligence Judiciary Oversight committee agency reviewing email recently discovered order see contained classified information long letter went Oversight Committee Chairman Jason Chaffetz set political world ablaze tweet FBI Dir informed FBI learned existence email appear pertinent investigation Case reopened — Jason Chaffetz jas

### **Creating model**

In [8]:
x_text = df['text']
y = df['label'].values

x_train, x_temp, y_train, y_temp = train_test_split(x_text, y, test_size=0.3, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)


vectorizer = CountVectorizer(stop_words='english', max_features=5000)
x_train_bow = vectorizer.fit_transform(x_train)
x_val_bow = vectorizer.transform(x_val)
x_test_bow = vectorizer.transform(x_test)

model = MultinomialNB()
model.fit(x_train_bow, y_train)

### **Evaluate on val set**

In [9]:
y_val_pred = model.predict(x_val_bow)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Validation Accuracy: {val_accuracy:.4f}')
print('Validation Classification Report:')
print(classification_report(y_val, y_val_pred))

Validation Accuracy: 0.8886
Validation Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.89      1564
           1       0.91      0.86      0.88      1550

    accuracy                           0.89      3114
   macro avg       0.89      0.89      0.89      3114
weighted avg       0.89      0.89      0.89      3114



### **Evaluate on test set**

In [16]:
y_test_pred = model.predict(x_test_bow)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy:.4f}')
print('Test Classification Report:')
print(classification_report(y_test, y_test_pred))

Test Accuracy: 0.8841
Test Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      1574
           1       0.91      0.85      0.88      1541

    accuracy                           0.88      3115
   macro avg       0.89      0.88      0.88      3115
weighted avg       0.89      0.88      0.88      3115



#### **Saving the model**

In [15]:
import joblib
import os


preprocessing_model_dir = '/content/drive/MyDrive/models/naive_bayes/'
os.makedirs(preprocessing_model_dir, exist_ok=True)
model_path = os.path.join(preprocessing_model_dir, 'naive_bayes_preprocessing.joblib')
vectorizer_path = os.path.join(preprocessing_model_dir, 'vectorizer_preprocessing.joblib')


joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)

print(f'Model and vectorizer saved to {model_path} and {vectorizer_path}')


Model and vectorizer saved to /content/drive/MyDrive/models/naive_bayes/naive_bayes_preprocessing.joblib and /content/drive/MyDrive/models/naive_bayes/vectorizer_preprocessing.joblib
