# Importing Libraries

In [73]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Importing the Dataset

In [18]:
df= pd.read_csv("fake_or_real_news.csv")

In [19]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [20]:
df.count()

Unnamed: 0    6335
title         6335
text          6335
label         6335
dtype: int64

In [21]:
df.groupby(by='label').count()

Unnamed: 0_level_0,Unnamed: 0,title,text
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FAKE,3164,3164,3164
REAL,3171,3171,3171


# Train-Test Split

In [22]:
y = df['label']

In [26]:
df= df.drop('label', axis=1)

In [29]:
X_train, x_test, Y_train, y_test = train_test_split(df['text'], y, test_size= 0.33 ,random_state=53)

# Fitting the model using Count Vectorizer

In [33]:
count_vectorizer = CountVectorizer(stop_words= 'english')

In [34]:
count_train = count_vectorizer.fit_transform(X_train)

In [37]:
count_test = count_vectorizer.transform(x_test)

In [44]:
print(count_vectorizer.get_feature_names()[:10])

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


# Fitting the model using Tfid Vectorizer

In [52]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

In [55]:
tfidf_train = tfidf_vectorizer.fit_transform(X_train)

In [56]:
tfidf_test = tfidf_vectorizer.transform(x_test)

In [58]:
print(tfidf_vectorizer.get_feature_names()[:10])

['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


In [59]:
print(tfidf_train.A[:5])

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


# Converting Count Vectorizer array into DataFrames 

In [61]:
count_df= pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

In [63]:
count_df.head()

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Converting Tfid Vectorizer into DataFrames

In [66]:
tfidf_df= pd.DataFrame(tfidf_train.A, columns= tfidf_vectorizer.get_feature_names())

In [67]:
tfidf_df.head()

Unnamed: 0,00,000,0000,00000031,000035,00006,0001,0001pt,000ft,000km,...,حلب,عربي,عن,لم,ما,محاولات,من,هذا,والمرضى,ยงade
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Check if both the columns have same set of values 

In [71]:
difference = set(count_df.columns)- set(tfidf_df.columns)
print(difference)

set()


# Check if both DataFrames are equal

In [72]:
print(count_df.equals(tfdif_df))

False


# Training and Testing the "Fake News" model with CountVectorizer

### Instantiate multinomial naive baiyes classifier

In [74]:
nb_classifier= MultinomialNB()

In [78]:
nb_classifier.fit(count_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [79]:
pred = nb_classifier.predict(count_test)

In [81]:
score= metrics.accuracy_score(y_test, pred)

In [82]:
print(score)

0.893352462936394


### Plotting confusion matrix

In [85]:
cm = metrics.confusion_matrix(y_test, pred, labels=['REAL', 'FAKE'])

In [86]:
print(cm)

[[1003   80]
 [ 143  865]]


# Training and Testing the "Fake News" model with TfidfVectorizer

In [87]:
nb_classifier.fit(tfidf_train, Y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [89]:
pred= nb_classifier.predict(tfidf_test)

In [90]:
score = metrics.accuracy_score(y_test, pred)

In [91]:
print(score)

0.8565279770444764


In [92]:
cm = metrics.confusion_matrix(y_test, pred, labels=['REAL', 'FAKE'])

In [93]:
print(cm)

[[1052   31]
 [ 269  739]]


# Improving Model 

In [95]:
alphas = np.arange(0,1,0.1)

#### Create the function to train the model in all the values of alphas

In [99]:
def train_and_predict(alpha):
    # Instantiate the classifier
    nb_classifier = MultinomialNB(alpha)
    # fit the training data 
    nb_classifier.fit(tfidf_train, Y_train)
    # predict the values 
    pred= nb_classifier.predict(tfidf_test)
    # calculating the accuracy scores
    score= metrics.accuracy_score(y_test, pred)
    return score

In [108]:
for alpha in alphas:
    print('alpha:{0}'.format(alpha))
    print('score:{0}'.format(train_and_predict(alpha)))
    print()

alpha:0.0
score:0.8813964610234337

alpha:0.1
score:0.8976566236250598

alpha:0.2
score:0.8938307030129125

alpha:0.30000000000000004


  'setting alpha = %.1e' % _ALPHA_MIN)


score:0.8900047824007652

alpha:0.4
score:0.8857006217120995

alpha:0.5
score:0.8842659014825442

alpha:0.6000000000000001
score:0.874701099952176

alpha:0.7000000000000001
score:0.8703969392635102

alpha:0.8
score:0.8660927785748446

alpha:0.9
score:0.8589191774270684

