In [101]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
import pandas as pd
import string
import re
import numpy as np
import nltk
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

In [102]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test['label'] = 't'

In [103]:
test = test.fillna(' ')
train = train.fillna(' ')
test['total'] = test['title'] + ' ' + test['author'] + test['text']
train['total'] = train['title'] + ' ' + train['author'] + train['text']


In [104]:
from nltk import PorterStemmer
stemmer = PorterStemmer()
wl = WordNetLemmatizer()
def tokenize_and_stem(text):
    tokens = nltk.tokenize.word_tokenize(text)
    # strip out punctuation and make lowercase
    tokens = [token.lower().strip(string.punctuation)
              for token in tokens if token.isalnum()]

    # now stem the tokens
    tokens = [wl.lemmatize(token) for token in tokens]
    tokens = [word for word in tokens if len(word) >= 3 and word != ('said' or 'people')]
    #tokens = [stemmer.stem(token) for token in tokens]

    return tokens

In [105]:
transformer = TfidfTransformer(smooth_idf=False)
count_vectorizer = CountVectorizer(ngram_range=(1, 3), stop_words='english', tokenizer= tokenize_and_stem)
counts = count_vectorizer.fit_transform(train['total'].values)
tfidf = transformer.fit_transform(counts)
targets = train['label'].values

In [118]:
X_train, X_test, y_train, y_test = train_test_split(tfidf, targets, random_state=0)

# multiBayes
NB = MultinomialNB()
NB.fit(X_train, y_train)
print("multiBayes")
print('Accuracy of NB classifier on training set: {:.2f}'
      .format(NB.score(X_train, y_train)))
print('Accuracy of NB classifier on test set: {:.2f}'
      .format(NB.score(X_test, y_test)))

BNB = BernoulliNB()
BNB.fit(X_train, y_train)
print("Bernoulli")
print('Accuracy of NB classifier on training set: {:.2f}'
      .format(BNB.score(X_train, y_train)))
print('Accuracy of NB classifier on test set: {:.2f}'
      .format(BNB.score(X_test, y_test)))

multiBayes
Accuracy of NB classifier on training set: 0.97
Accuracy of NB classifier on test set: 0.83
Bernoulli
Accuracy of NB classifier on training set: 0.93
Accuracy of NB classifier on test set: 0.72


In [122]:
class_labels= NB.classes_
feature_names =count_vectorizer.get_feature_names()
topn_class1 = sorted(zip(NB.feature_count_[0], feature_names),reverse=True)[:30]
topn_class2 = sorted(zip(NB.feature_count_[1], feature_names),reverse=True)[:30]
print("Important words in real news articles")

for coef1, feat1 in topn_class1:
    b = 0
    for coef2, feat2 in topn_class2:
        if feat1 ==feat2:
            b = 1
        else:
            continue
    if b ==0:
        print(class_labels[0], coef1, feat1)
    else:
        continue
            
print("-----------------------------------------")
print("Important words in fake news articles")
for coef2, feat2 in topn_class2:
    b = 0
    for coef1, feat1 in topn_class1:
        if feat1 ==feat2:
            b = 1
        else:
            continue
            
    if b==0:
        print(class_labels[1], coef2, feat2) 
    else:
        continue

Important words in real news articles
0 59.329299871009006 breitbart
0 52.19754111636912 york
0 51.91481547074832 new york
0 47.766852245074574 news
0 47.128348756709165 republican
0 46.28007769074222 house
0 46.14874928013911 united
0 43.640527431134 company
0 43.55729329750984 official
0 43.54189651887907 police
0 42.19041080097914 city
0 40.997242772580535 twitter
0 40.94286730445308 woman
0 39.80787257371681 united state
0 39.70739393905273 say
-----------------------------------------
Important words in fake news articles
1 89.85681039193022 hillary
1 62.40370011846452 election
1 53.896212310112716 email
1 52.6133688150295 fbi
1 51.0531903699768 2016
1 47.95742854121367 hillary clinton
1 47.246185697867105 war
1 45.312378025328854 russia
1 43.08880991275489 world
1 36.950844723176125 campaign
1 36.377542827884824 medium
1 36.1397092607962 vote
1 33.75617373039203 russian
1 33.0982850696016 donald
1 32.57668032187472 know


In [120]:
from sklearn import metrics
predicted = NB.predict(X_test)
print(metrics.classification_report(y_test, predicted))

             precision    recall  f1-score   support

          0       0.74      1.00      0.85      2564
          1       1.00      0.66      0.79      2636

avg / total       0.87      0.83      0.82      5200



In [121]:
print(metrics.confusion_matrix(y_test, predicted))

[[2558    6]
 [ 893 1743]]


In [111]:
print(metrics.accuracy_score(y_test, predicted))

0.7201923076923077
