In [0]:
## Import packages required for classification
import os
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
## packages for deep learning
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [0]:
## change the 
os.chdir('/content/drive/My Drive/Fake_news_classifier_project')

In [3]:
os.getcwd()

'/content/drive/My Drive/Fake_news_classifier_project'

In [18]:
!unzip news.zip

Archive:  news.zip
replace news.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [4]:
!ls

news.csv  news.zip


In [5]:
df = pd.read_csv('news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [0]:
X = df['text']
y= df['label']

In [0]:
## Split the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size =0.2)

In [0]:
#Creating a pipeline that first creates bag of words(after applying stopwords) & then applies Multinomial Naive Bayes model
pipeline = Pipeline([('tfidf',TfidfVectorizer(stop_words='english')),
                     ('nbmodel',MultinomialNB())])

In [10]:
#Training our data
pipeline.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('nbmodel',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [0]:
#Predicting the label for the test data
pred_NB = pipeline.predict(X_test)

In [19]:
#Checking the performance of our model
from sklearn.metrics import confusion_matrix
print(classification_report(y_test, pred_NB))
print(confusion_matrix(y_test, pred_NB))
print(accuracy_score(y_test, pred_NB))


              precision    recall  f1-score   support

        FAKE       0.98      0.73      0.83       622
        REAL       0.79      0.98      0.87       645

    accuracy                           0.86      1267
   macro avg       0.88      0.85      0.85      1267
weighted avg       0.88      0.86      0.85      1267

[[451 171]
 [ 11 634]]
0.856353591160221


## Random Forest


In [16]:
from sklearn.ensemble import RandomForestClassifier
pipe_rf = Pipeline([('tfidf',TfidfVectorizer(stop_words='english')),
			('clf', RandomForestClassifier(random_state=42))])
pipe_rf.fit(X_train, y_train)


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [0]:
#Predicting the label for the test data
pred_rf = pipe_rf.predict(X_test)

In [23]:
print(classification_report(y_test, pred_rf))
print(accuracy_score(y_test, pred_rf))
print(confusion_matrix(y_test,pred_rf))

print(classification_report(y_test, pred_NB))
print(confusion_matrix(y_test, pred_NB))
print(accuracy_score(y_test, pred_NB))

              precision    recall  f1-score   support

        FAKE       0.92      0.92      0.92       622
        REAL       0.92      0.92      0.92       645

    accuracy                           0.92      1267
   macro avg       0.92      0.92      0.92      1267
weighted avg       0.92      0.92      0.92      1267

0.9187056037884768
[[572  50]
 [ 53 592]]
              precision    recall  f1-score   support

        FAKE       0.98      0.73      0.83       622
        REAL       0.79      0.98      0.87       645

    accuracy                           0.86      1267
   macro avg       0.88      0.85      0.85      1267
weighted avg       0.88      0.86      0.85      1267

[[451 171]
 [ 11 634]]
0.856353591160221


In [0]:
## pickle the random forest model
with open('model.pickle', 'wb') as handle:
    pickle.dump(pipe_rf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [25]:
!ls

model.pickle  news.csv	news.zip
