In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set()

In [25]:
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix

In [4]:
# datasets used
news = pd.read_csv('./fake_or_real_news.csv')

# Building word count vectors with `sklearn`

In [7]:
news.head(3)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL


## `CountVectorizer` for text classification

In [8]:
X = news['text']
y = news['label']

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.33,
                                                    random_state=53)

In [19]:
# initialize a vectorizer
cvec = CountVectorizer(stop_words='english')

# transform data
count_train = cvec.fit_transform(X_train)
count_test = cvec.transform(X_test)

# first 10 features of vec
print(f'first 10 features:\n{cvec.get_feature_names()[:10]}')

first 10 features:
['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']


## `TfidfVectorizer` for text classification

In [20]:
# initialize a tfidf vec
tvec = TfidfVectorizer(stop_words='english',
                      max_df=0.7)

# transform data
tfidf_train = tvec.fit_transform(X_train)
tfidf_test = tvec.transform(X_test)

print(f'first 10 features:\n{tvec.get_feature_names()[:10]}\n')

# print(f'first 2 vectors:\n{tfidf_train.toarray()[:2]}')

first 10 features:
['00', '000', '0000', '00000031', '000035', '00006', '0001', '0001pt', '000ft', '000km']



## Inspecting the vectors

In [21]:
# create df for count and tdidf training data
count_df = pd.DataFrame(count_train.toarray(),
                        columns=cvec.get_feature_names())
tfidf_df = pd.DataFrame(tfidf_train.toarray(),
                        columns=tvec.get_feature_names())

In [22]:
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

set()


In [23]:
print(count_df.equals(tfidf_df))

False


# Training and testing a classification model with `sklearn` 

## Training and testing with `CountVectorizer`

In [27]:
# instantiate a multinomial naive bayes classifier
nb = MultinomialNB()

# fit, transform, eval
nb.fit(count_train, y_train)

y_pred = nb.predict(count_test)

score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred,
                      labels=['FAKE', 'REAL'])
print(f'test acc: {score: .1%}')
print(cm)

test acc:  89.3%
[[ 865  143]
 [  80 1003]]


## Training and testing with `TfidfVectorizer`

In [33]:
# instantiate a multinomial naive bayes classifier
nb = MultinomialNB(alpha=0.1)

# fit, transform, eval
nb.fit(tfidf_train, y_train)

y_pred = nb.predict(tfidf_test)

score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred,
                      labels=['FAKE', 'REAL'])
print(f'test acc: {score: .1%}')
print(cm)

test acc:  89.8%
[[ 866  142]
 [  72 1011]]


# Simple NLP, complex problems

## Improving your model

In [32]:
# Create the list of alphas: alphas
alphas = np.arange(0, 1, 0.1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print(f'Alpha: {alpha}')
    print(f'Score: {train_and_predict(alpha): .1%}\n')

Alpha: 0.0
Score:  88.1%

Alpha: 0.1
Score:  89.8%

Alpha: 0.2
Score:  89.4%

Alpha: 0.30000000000000004
Score:  89.0%

Alpha: 0.4
Score:  88.6%

Alpha: 0.5


  'setting alpha = %.1e' % _ALPHA_MIN)


Score:  88.4%

Alpha: 0.6000000000000001
Score:  87.5%

Alpha: 0.7000000000000001
Score:  87.0%

Alpha: 0.8
Score:  86.6%

Alpha: 0.9
Score:  85.9%



In [34]:
# Get the class labels: class_labels
class_labels = nb.classes_

# Extract the features: feature_names
feature_names = tvec.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])

FAKE [(-12.641778440826338, '0000'), (-12.641778440826338, '000035'), (-12.641778440826338, '0001'), (-12.641778440826338, '0001pt'), (-12.641778440826338, '000km'), (-12.641778440826338, '0011'), (-12.641778440826338, '006s'), (-12.641778440826338, '007'), (-12.641778440826338, '007s'), (-12.641778440826338, '008s'), (-12.641778440826338, '0099'), (-12.641778440826338, '00am'), (-12.641778440826338, '00p'), (-12.641778440826338, '00pm'), (-12.641778440826338, '014'), (-12.641778440826338, '015'), (-12.641778440826338, '018'), (-12.641778440826338, '01am'), (-12.641778440826338, '020'), (-12.641778440826338, '023')]
REAL [(-6.790929954967984, 'states'), (-6.765360557845787, 'rubio'), (-6.751044290367751, 'voters'), (-6.701050756752027, 'house'), (-6.695547793099875, 'republicans'), (-6.670191249042969, 'bush'), (-6.661945235816139, 'percent'), (-6.589623788689861, 'people'), (-6.559670340096453, 'new'), (-6.489892292073902, 'party'), (-6.452319082422527, 'cruz'), (-6.452076515575875, '