In [55]:
# Import all the necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import random
import re
from nltk.tokenize import word_tokenize
from nltk.corpus   import stopwords
from nltk.stem     import WordNetLemmatizer

In [56]:
# load the dataset
data = pd.read_csv('fakereal _1_.csv')

In [57]:
# print the data
print(data)

                                                  text label
0    Daniel Greenfield, a Shillman Journalism Fello...  FAKE
1    Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE
2    U.S. Secretary of State John F. Kerry said Mon...  REAL
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE
4    It's primary day in New York and front-runners...  REAL
..                                                 ...   ...
251  Wed, 26 Oct 2016 18:19 UTC © Jen Psaki Preside...  FAKE
252  When Donald Trump introduced his new universit...  REAL
253  Trump Has Forever Changed American Politics > ...  FAKE
254  With Western and Iranian negotiators racing to...  REAL
255  The role of the US and Nato in EU relations wi...  FAKE

[256 rows x 2 columns]


In [58]:
# shape of the dataset
print(data.shape)

(256, 2)


In [59]:
# display the first five rows of the dataset
print(data.head())

                                                text label
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE
2  U.S. Secretary of State John F. Kerry said Mon...  REAL
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE
4  It's primary day in New York and front-runners...  REAL


In [60]:
# display the last five rows of the dataset
print(data.tail())

                                                  text label
251  Wed, 26 Oct 2016 18:19 UTC © Jen Psaki Preside...  FAKE
252  When Donald Trump introduced his new universit...  REAL
253  Trump Has Forever Changed American Politics > ...  FAKE
254  With Western and Iranian negotiators racing to...  REAL
255  The role of the US and Nato in EU relations wi...  FAKE


In [61]:
# To check if there are any NULL Values in the dataset
print(data.isnull().sum())

text     0
label    0
dtype: int64


# Observation:

So from the above observation, it is clear that there are no NULL Values in the datset.

In [62]:
# create an object for the lemmatizer
stemmer = WordNetLemmatizer()

# Preprocessing the text

In [63]:
# define the empty corpus
corpus = []

for i in range(0,256):
    # remove all the whitespace characters
    document = re.sub(r'\W+','', data['text'][i])
    # remove all the single characters
    document = re.sub(r'\s+[a-zA-Z]\s+','',document)
    # remove all the single characters from the beginning of the text
    document = re.sub(r'\^[a-zA-Z]\s+','', document)
    # remove all the multi lines spaces
    document = re.sub(r'\s+','',document, flags = re.I)
    # convert the text into lower case
    document = document.lower()
    
    # split the document into words
    document = document.split()
    
    # lemmatize the document
    document = [stemmer.lemmatize(word) for word in document]
    
    # join the lemmetize words in the document
    document = ''.join(document)
    
    # append the document in the corpus
    corpus.append(document)

In [64]:
# print the entire text document from the corpus
print(corpus)



In [65]:
# print the second text document from the corpus
print(corpus[1])

googlepinterestdigglinkedinredditstumbleuponprintdeliciouspockettumblrtherearetwofundamentaltruthsinthisworldpaulryandesperatelywantstobepresidentandpaulryanwillneverbepresidenttodayproveditinaparticularlystaggeringexampleofpoliticalcowardicepaulryanrererereversedcourseandannouncedthathewasbackonthetrumptrainafterallthiswasanaboutfacefromwherehewasafewweeksagohehadpreviouslydeclaredhewouldnotbesupportingordefendingtrumpafteratapewasmadepublicinwhichtrumpbraggedaboutassaultingwomensuddenlyryanwasappearingataprotrumprallyandboldlydeclaringthathealreadysentinhisvotetomakehimpresidentoftheunitedstatesitwasasurrealmomentthefigureheadoftherepublicanpartydosedhimselfingasolinegotuponastageonachillyafternooninwisconsinandlitamatchspeakerryansayshevotedforrealdonaldtrumprepublicansitistimetocomehomehttpstcovytt49yvoepictwittercomwcvscg4a5iabcnewspoliticsabcpoliticsnovember52016thedemocraticpartycouldnthaveaskedforabettermomentoffilmryanschancesofeverbecomingpresidentwentdowntozeroinaninstantint

In [66]:
from sklearn.model_selection import train_test_split

In [67]:
# divide the dataset into training and testing data
X = corpus
Y = data.label
X_train,X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 0)

In [68]:
Y_train.shape

(204,)

# CONVERT THE TEXT INTO WORD VECTOR USING COUNTVECTORIZER METHOD

In [69]:
from sklearn.feature_extraction.text import CountVectorizer

# call the object for Count Vectorizer
countvectorizer = CountVectorizer(max_features = 1500, min_df = 3, max_df = 0.8, stop_words = stopwords.words("english"))

# using the obejct for CountVectorizer transform the training data
count_train = countvectorizer.fit_transform(X_train)

# using the object for countvectorizer transform the test data
count_test = countvectorizer.transform(X_test)

In [70]:
# To get all the feature names present in the vocabulary
text_input = countvectorizer.get_feature_names()
print(text_input)

['onthisdayin1973jfredbuzhardtalawyerdefendingpresidentrichardnixoninthewatergatecaserevealedthatakeywhitehousetapehadan18']


In [71]:
# To create the dataframe for the word count vector for the training dataset
pd.DataFrame(count_train.toarray(), columns = countvectorizer.get_feature_names())

Unnamed: 0,onthisdayin1973jfredbuzhardtalawyerdefendingpresidentrichardnixoninthewatergatecaserevealedthatakeywhitehousetapehadan18
0,0
1,0
2,0
3,0
4,0
...,...
199,0
200,0
201,0
202,0


In [72]:
# To create the dataframe for the word count vector for the test dataset
pd.DataFrame(count_test.toarray(), columns = countvectorizer.get_feature_names())

Unnamed: 0,onthisdayin1973jfredbuzhardtalawyerdefendingpresidentrichardnixoninthewatergatecaserevealedthatakeywhitehousetapehadan18
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


# CREATE THE TFIDF VECTOR FOR THE TRAINING AND TEST DATA

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the object for TfidfVectorizer
tfidf = TfidfVectorizer(max_features = 1500, min_df = 3, max_df = 0.8, stop_words = stopwords.words("english"))

# using the object for tfidf transform the training data
tfidf_train = tfidf.fit_transform(X_train)

# using the object for tfidf transform the test data
tfidf_test = tfidf.transform(X_test)

In [74]:
# get all the feature names for the tfidf vectors
tfidf_data = tfidf.get_feature_names()
print(tfidf_data)

['onthisdayin1973jfredbuzhardtalawyerdefendingpresidentrichardnixoninthewatergatecaserevealedthatakeywhitehousetapehadan18']


In [75]:
# create the dataframe for the tfidf training data
pd.DataFrame(tfidf_train.toarray(), columns = tfidf.get_feature_names())

Unnamed: 0,onthisdayin1973jfredbuzhardtalawyerdefendingpresidentrichardnixoninthewatergatecaserevealedthatakeywhitehousetapehadan18
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
...,...
199,0.0
200,0.0
201,0.0
202,0.0


In [76]:
# create the dataframe for the tfidf testing data
pd.DataFrame(tfidf_test.toarray(), columns = tfidf.get_feature_names())

Unnamed: 0,onthisdayin1973jfredbuzhardtalawyerdefendingpresidentrichardnixoninthewatergatecaserevealedthatakeywhitehousetapehadan18
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0
5,0.0
6,0.0
7,0.0
8,0.0
9,0.0


# TO DETERMINE THE ACCURACY OF WORD COUNT PERFORM NAIVE BAYES CLASSIFIER

In [77]:
# Import all the necessary libraries for text classification

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
import random
import re
from nltk.tokenize import word_tokenize
from nltk.corpus   import stopwords
from nltk.stem     import WordNetLemmatizer

# PERFORM THE MULTINOMIAL NAIVE BAYES USING WORD COUNT VECTOR

In [78]:
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics


# create the object for Multinomial
obj = MultinomialNB()


# using the object for MultinomialNB train the data
obj.fit(count_train, Y_train)

MultinomialNB()

In [82]:
# using the object for MultinomialNB , predict the model so as to give the labels in the test data
pred_y = obj.predict(count_test)
print(pred_y)

['REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL']


In [85]:
# get the accuracy score between the actual and predicted output
accuracy = metrics.accuracy_score(Y_test, pred_y)
print(accuracy*100.0)

51.92307692307693


# PERFORM THE MULTINOMIAL NAIVE BAYES USING TFIDF VECTORS

In [90]:
from sklearn.naive_bayes import MultinomialNB

# create the object fro MultinomialNB
obj = MultinomialNB()

In [91]:
# train the model using the obejct for MultinomialNB
obj.fit(tfidf_train, Y_train)

MultinomialNB()

In [92]:
# predict the model using the object for MultinomailNB
pred_y = obj.predict(tfidf_test)
print(pred_y)

['REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL' 'REAL'
 'REAL' 'REAL']


In [93]:
# using the accuracy score to get the accuracy between the actual output and predicted output
from sklearn.metrics import accuracy_score

In [94]:
accuracy = accuracy_score(pred_y, Y_test)
print(accuracy * 100.0)

51.92307692307693


In [95]:
# import all the libraries for the metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [96]:
print("Accuracy score is:", accuracy_score(pred_y, Y_test))
print("confusion matrix is:", confusion_matrix(pred_y, Y_test))
print("confusion matrix is:", classification_report(pred_y, Y_test))

Accuracy score is: 0.5192307692307693
confusion matrix is: [[ 0  0]
 [25 27]]
confusion matrix is:               precision    recall  f1-score   support

        FAKE       0.00      0.00      0.00         0
        REAL       1.00      0.52      0.68        52

    accuracy                           0.52        52
   macro avg       0.50      0.26      0.34        52
weighted avg       1.00      0.52      0.68        52



  _warn_prf(average, modifier, msg_start, len(result))
