In [35]:
import re
import string

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [36]:
# Load the fake/real news dataset; the relative path is resolved against the working directory.
df = pd.read_csv("fake_or_real_news.csv")

In [37]:
# Preview the first rows to see which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [38]:
# Is the first column a unique row identifier?
# Compare its number of distinct values with the number of rows.
first_column = df[df.columns[0]]
len(first_column.unique()), len(df)

(6335, 6335)

In [39]:
# Promote the first column to the row index.
# NOTE: df is rebound here, so running this cell a second time would
# turn the next column into the index as well.
index_column = df.columns[0]
df = df.set_index(index_column)

In [40]:
# Inspect the raw label column before it is encoded as integers.
df.label

Unnamed: 0
8476     FAKE
10294    FAKE
3608     REAL
10142    FAKE
875      REAL
         ... 
4490     REAL
8062     FAKE
8622     FAKE
4021     REAL
4330     REAL
Name: label, Length: 6335, dtype: object

### Helper functions

In [41]:

def process_tweet(tweet):
    '''
    Some standard tweet processing techniques, which includes tokenizer and stemming

    Input:
        tweet: a string with the raw text to clean
    Output:
        a single string holding the cleaned, stemmed tokens separated by
        spaces. A string (rather than a token list) is returned because the
        result is stored in a DataFrame column that is later passed to
        CountVectorizer, which expects raw text documents.
    '''
    # Remove markup that carries no lexical signal before tokenizing.
    tweet = re.sub(r'\$\w*', '', tweet)          # stock-market tickers such as $GE
    tweet = re.sub(r'^RT[\s]+', '', tweet)       # old-style retweet marker
    tweet = re.sub(r'https?://\S+', '', tweet)   # hyperlinks
    tweet = re.sub(r'#', '', tweet)              # keep the hashtag word, drop the '#'

    # Lower-case, strip @handles and shorten exaggerated character runs.
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)

    try:
        stopwords_english = set(stopwords.words('english'))
    except LookupError:
        # The stop-word corpus is not installed yet; fetch it once and retry.
        nltk.download('stopwords', quiet=True)
        stopwords_english = set(stopwords.words('english'))

    stemmer = PorterStemmer()

    # Drop stop words and bare punctuation, then reduce each word to its stem.
    clean_tokens = [
        stemmer.stem(word)
        for word in tokenizer.tokenize(tweet)
        if word not in stopwords_english and word not in string.punctuation
    ]
    return ' '.join(clean_tokens)

In [42]:
def convert_label_tweet(label):
    """Encode a label string as an integer: 1 for 'real', 0 for anything else.

    Surrounding whitespace and letter case are ignored when matching.
    """
    normalized = label.strip().lower()
    return 1 if normalized == 'real' else 0

### Prepare Train and Test Data

In [43]:
# List the remaining columns now that the first one has become the index.
df.columns

Index(['title', 'text', 'label'], dtype='object')

In [44]:
# Encode the string labels as integers (REAL -> 1, everything else -> 0).
# NOTE: the column is overwritten in place, so re-running this cell fails
# because the already-encoded integers have no .strip().
df['label'] = df['label'].map(convert_label_tweet)

In [45]:
# Build one text field per article (title followed by body), then clean it
# with process_tweet.
title_text = df['title'].astype('str')
body_text = df['text'].astype('str')
df['all text'] = title_text + ' ' + body_text
df['all text'] = df['all text'].map(process_tweet)

In [46]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['all text'],
    df['label'],
    test_size=0.3,
    random_state=42,
)

In [47]:
# Vocabulary size cap handed to CountVectorizer(max_features=...) below.
MAX_FEATURES = 4000 # select 4000 most popular words

In [48]:
# Bag-of-words features: the vocabulary is learned on the training split only
# and then reused unchanged for the test split.
vectorizer = CountVectorizer(max_features=MAX_FEATURES)
train_counts = vectorizer.fit_transform(X_train)
test_counts = vectorizer.transform(X_test)

# Convert the sparse count matrices to dense arrays.
X_train = train_counts.toarray()
X_test = test_counts.toarray()

In [49]:
# Display the dense training count matrix (NumPy abbreviates large arrays).
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 4, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Train and Test 

In [50]:
# Fit a Gaussian Naive Bayes model on the dense training count features.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict Class
y_pred = classifier.predict(X_test)

# Accuracy: share of test samples whose predicted label matches y_test.
accuracy = accuracy_score(y_test, y_pred)

In [51]:
# Show the accuracy computed on the held-out test split.
accuracy

0.8453445554971067