In [1]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#read the data
messages = pd.read_csv('fake_or_real_news.csv')

In [3]:
messages.columns

Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')

In [4]:
messages.drop('Unnamed: 0',axis=1,inplace=True)

In [5]:
messages['length'] = messages['text'].apply(len)
messages.head()

Unnamed: 0,title,text,label,length
0,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,7518
1,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,2646
2,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,2543
3,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,2660
4,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,1840


In [6]:
messages.groupby('label').describe()

Unnamed: 0_level_0,length,length,length,length,length,length,length,length
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
FAKE,3164.0,4121.04646,5680.232733,1.0,1283.5,2558.0,5027.0,115372.0
REAL,3171.0,5292.160202,4348.288284,43.0,2729.5,4683.0,6829.5,44039.0


In [7]:
#split data into train and test sets
from sklearn.model_selection import train_test_split
msg_train, msg_test, label_train, label_test = train_test_split(messages['text'], messages['label'], test_size=0.2)

In [8]:
#import stopwords
import string
from nltk.corpus import stopwords

In [9]:
#Cache the stopwords in memory
StopWords = stopwords.words("english")

In [10]:
#define a function to remove punctuation and stopwords
def clean_message(mess):
    """
    Remove all punctuation and stopwords
    """
    #Filter out the punctuation
    nopunc = [char for char in mess if char not in string.punctuation]
    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    #remove stopwords
    return [word for word in nopunc.split() if word.lower() not in StopWords]

In [11]:
#test if the function works properly
chk = messages.head(5)
chk['text']

0    Daniel Greenfield, a Shillman Journalism Fello...
1    Google Pinterest Digg Linkedin Reddit Stumbleu...
2    U.S. Secretary of State John F. Kerry said Mon...
3    — Kaydee King (@KaydeeKing) November 9, 2016 T...
4    It's primary day in New York and front-runners...
Name: text, dtype: object

In [12]:
chk['text'].apply(clean_message)

0    [Daniel, Greenfield, Shillman, Journalism, Fel...
1    [Google, Pinterest, Digg, Linkedin, Reddit, St...
2    [US, Secretary, State, John, F, Kerry, said, M...
3    [—, Kaydee, King, KaydeeKing, November, 9, 201...
4    [primary, day, New, York, frontrunners, Hillar...
Name: text, dtype: object

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [14]:
#create a pipeline for the step by step procedure
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=clean_message)),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())])

In [15]:
pipeline.fit(msg_train,label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function clean_message at 0x0000023678E19B70>,
        binary=False, decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocess...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [16]:
predictions = pipeline.predict(msg_test)

In [17]:
from sklearn.metrics import classification_report

In [18]:
print(classification_report(predictions,label_test))

             precision    recall  f1-score   support

       FAKE       0.63      0.98      0.77       408
       REAL       0.99      0.73      0.84       859

avg / total       0.87      0.81      0.81      1267

