# <span style="color:darkblue">Executive Summary</span> 

In this notebook, **Jonathan Serrano Barbosa** and **Paul Jacques Mignault** developed a binary text classifier able to distinguish between fake news and real news. The team tried different approaches in both the data pre-processing and the modelling. The approach that led to the highest accuracy is the following:

**TODO:** Explain the process here.

# 1. Importing libraries

The three cells below import the libraries required for the assignment and create short names for the stemmer, lemmatizer and stop-word objects used later in the notebook.

In [2]:
# Importing Libraries
import nltk
import pandas as pd
import numpy as np
import string
import re
import xgboost

from nltk import word_tokenize, pos_tag, pos_tag_sents, DefaultTagger, UnigramTagger
from nltk.util import ngrams
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from nltk.classify import MaxentClassifier
from nltk.classify import maxent

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups
from sklearn.ensemble import RandomForestClassifier
from nltk.tokenize.treebank import TreebankWordDetokenizer


from sklearn.model_selection import cross_val_score, GridSearchCV

from xgboost import XGBClassifier


from textblob import TextBlob


import warnings
warnings.filterwarnings('ignore')

In [3]:
#Run the below command if you haven't set up the textblob library
#!pip install -U textblob
from textblob import TextBlob

In [4]:
# Instantiating the stemmers / lemmatizer and loading the English stop words
# under short names; the pre-processing functions further down refer to these
# module-level objects directly.
pst = PorterStemmer()  # Porter stemmer, used by pst_stemmer
wnlt = WordNetLemmatizer()  # WordNet lemmatizer, used by lemmatize_sent / wnlt_lemmatizer
stop = stopwords.words('english')  # English stop words from the NLTK corpus, used by stop_words
stm = SnowballStemmer('english')  # English Snowball stemmer, used by stm_stemmer

# 2. Reading the CSV

In [7]:
# Reading the Training dataset
# NOTE(review): this is an absolute path on the author's machine; the cell fails
# on any other machine unless the CSV sits at the same location or the path is edited.
df = pd.read_csv('/Users/jonathanserrano/Desktop/Assignment_1/fake_or_real_news_training.csv')
# Preview the first rows
df.head()

Unnamed: 0,ID,title,text,label,X1,X2
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,,
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,,


In [8]:
# Replace every missing value with an empty string (the later cleaning steps
# call string methods on each cell of the title and text columns)
df = df.fillna('')

# 3. Creating a Baseline

We began the exercise by creating a baseline, that is, a basic classifier trained without any significant transformation of the data.

In [9]:
# Use the article ID as the index and discard the X1 / X2 columns
df = df.set_index('ID').drop(columns=['X1', 'X2'])

# The label column is the classification target
y = df['label']

In [None]:
# Make training and test sets: 70% train / 30% test on the raw article text,
# with a fixed random_state so the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(df['text'], y, test_size=0.3, random_state=666)

In [None]:
# Simple count vectorizer (bag-of-words term counts)
count_vectorizer = CountVectorizer()
# Fit and transform the training data (the vocabulary is learned on the training split only)
count_train = count_vectorizer.fit_transform(X_train)
# Transform the test set with the vocabulary learned above
count_test = count_vectorizer.transform(X_test)

# Initialize the `tfidf_vectorizer`; max_df=0.7 is passed so that terms present
# in more than 70% of the documents are left out of the vocabulary
tfidf_vectorizer = TfidfVectorizer(max_df=0.7) 
# Fit and transform the training data 
tfidf_train = tfidf_vectorizer.fit_transform(X_train) 
# Transform the test set with the fitted vectorizer
tfidf_test = tfidf_vectorizer.transform(X_test)

### <span style="color:dimgray">3.1. Using a count vectorizer</span> 

In [None]:
# Candidate classifiers, all with their default hyper-parameters
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Multinomial NB', MultinomialNB()),
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
]

results = []
names = []
scoring = 'accuracy'

# 10-fold cross-validation of every model on the count-vectorized training split.
# `scoring` is passed explicitly so the metric reported below is the one named here.
for name, model in models:
    cv_scores = cross_val_score(model, count_train, y_train, cv=10,
                                scoring=scoring, n_jobs=-1)
    mean_score = round(np.mean(cv_scores), 2)
    results.append(mean_score)
    names.append(name)

# Build the summary from a dict so the Accuracy column stays numeric
# (np.column_stack would coerce the scores to strings alongside the names)
Models_comparison = pd.DataFrame({'Model': names, 'Accuracy': results})

print(Models_comparison)

### <span style="color:dimgray">3.2. Using a tfidf vectorizer</span> 

In [None]:
# Candidate classifiers, all with their default hyper-parameters
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Multinomial NB', MultinomialNB()),
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
]

results = []
names = []
scoring = 'accuracy'

# 10-fold cross-validation of every model on the tf-idf training split.
# `scoring` is passed explicitly so the metric reported below is the one named here.
for name, model in models:
    cv_scores = cross_val_score(model, tfidf_train, y_train, cv=10,
                                scoring=scoring, n_jobs=-1)
    mean_score = round(np.mean(cv_scores), 2)
    results.append(mean_score)
    names.append(name)

# Build the summary from a dict so the Accuracy column stays numeric
# (np.column_stack would coerce the scores to strings alongside the names)
Models_comparison = pd.DataFrame({'Model': names, 'Accuracy': results})

print(Models_comparison)

# 4. Improving the Baseline

## <span style="color:gray">4.1. Iteration 1</span> 

In [18]:
# Creating a function to correct the spelling mistakes using the textblob library

#def correct_spelling(dataframe):
    #'Function that takes the dataframe as input and corrects spelling mistakes in text and title'
    #dataframe['title'] = dataframe['title'].apply(lambda x: str(TextBlob(x).correct()))
    #dataframe['text'] = dataframe['text'].apply(lambda x: str(TextBlob(x).correct()))

In [19]:
# Creating a function to clean the dataset: lower-case, remove punctuation, remove digits
def clean(dataframe):
    """Lower-case the title and text columns and strip ASCII punctuation and digits.

    The ``title`` and ``text`` columns of ``dataframe`` are overwritten in place
    and nothing is returned. Only the characters listed in ``string.punctuation``
    and ``string.digits`` are removed, so non-ASCII symbols (for example curly
    quotes or long dashes) are left untouched.
    """
    # Build one deletion table up front instead of rebuilding it for every row
    # and every pass; deleting punctuation and digits together gives the same
    # result as deleting them one after the other.
    removal_table = str.maketrans('', '', string.punctuation + string.digits)
    for column in ('title', 'text'):
        dataframe[column] = dataframe[column].apply(
            lambda value: value.lower().translate(removal_table))

In [20]:
# Creating a function to tokenize title and text
def tokenize(dataframe):
    """Tokenize the title and text columns with ``nltk.word_tokenize``.

    Adds two columns to ``dataframe`` in place: ``tokenized_titles`` (from
    ``title``) and ``tokenized_text`` (from ``text``), each holding the list of
    tokens for that row. Nothing is returned.
    """
    # Each result depends on a single column, so apply the tokenizer to that
    # column directly rather than building a row object for every record with
    # DataFrame.apply(axis=1).
    dataframe['tokenized_titles'] = dataframe['title'].apply(lambda value: nltk.word_tokenize(value))
    dataframe['tokenized_text'] = dataframe['text'].apply(lambda value: nltk.word_tokenize(value))

In [21]:
# Creating a function to remove stopwords
def stop_words(dataframe):
    """Remove stop words from the tokenized titles and texts.

    Filters the token lists in the ``tokenized_titles`` and ``tokenized_text``
    columns of ``dataframe`` in place, dropping every token found in the
    module-level ``stop`` collection. Nothing is returned.
    """
    # Operate on the frame that was passed in (not on the notebook-global `df`),
    # and test membership against a set rather than scanning the list per token.
    stop_set = set(stop)
    for column in ('tokenized_titles', 'tokenized_text'):
        dataframe[column] = dataframe[column].apply(
            lambda tokens: [token for token in tokens if token not in stop_set])

In [22]:
# Creating a function for stemming using Porter Stemming
def pst_stemmer(dataframe):
    """Apply Porter stemming to the tokenized titles and texts.

    Reads the token lists in ``tokenized_titles`` / ``tokenized_text`` and stores
    the stemmed token lists in ``titles_stemmed`` / ``text_stemmed`` on the same
    dataframe. Nothing is returned.
    """
    def stem_tokens(tokens):
        # Stem every token of one row with the module-level Porter stemmer
        return [pst.stem(token) for token in tokens]

    dataframe['titles_stemmed'] = dataframe['tokenized_titles'].apply(stem_tokens)
    dataframe['text_stemmed'] = dataframe['tokenized_text'].apply(stem_tokens)

In [23]:
# Creating a function to detokenize text
def detokenize(dataframe):
    """Join the stemmed token lists back into plain strings.

    Overwrites the ``titles_stemmed`` and ``text_stemmed`` columns of
    ``dataframe`` in place, replacing each token list with the string produced
    by ``TreebankWordDetokenizer().detokenize``. Nothing is returned.
    """
    # Create the detokenizer once and reuse it for every row, and work on the
    # single column that is needed rather than on whole rows (axis=1).
    detokenizer = TreebankWordDetokenizer()
    for column in ('titles_stemmed', 'text_stemmed'):
        dataframe[column] = dataframe[column].apply(
            lambda tokens: detokenizer.detokenize(tokens))

In [24]:
# Creating a function to remove special characters in case any are left after pre-processing
def spec_charact(dataframe):
    """Remove any remaining special characters from the stemmed titles and texts.

    Every character that is neither a word character nor whitespace is deleted
    from the ``titles_stemmed`` and ``text_stemmed`` columns of ``dataframe``,
    in place. Nothing is returned.
    """
    # regex=True must be explicit: without it, recent pandas versions treat the
    # pattern as a literal string and nothing would be removed. The raw string
    # avoids the invalid '\w' / '\s' escape sequences of a plain literal.
    for column in ('titles_stemmed', 'text_stemmed'):
        dataframe[column] = dataframe[column].str.replace(r'[^\w\s]', '', regex=True)

In [25]:
#Applying the function
#correct_spelling(df)
clean(df)
tokenize(df)
stop_words(df)
pst_stemmer(df)
detokenize(df)
spec_charact(df)

In [26]:
# Display the dataframe to check the columns added by the pre-processing steps
df

Unnamed: 0_level_0,title,text,label,tokenized_titles,tokenized_text,titles_stemmed,text_stemmed
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
8476,you in smell axillary’s dear,daniel greenfield a tillman journalism fellow ...,FAKE,"[smell, axillary, ’, dear]","[daniel, greenfield, tillman, journalism, fell...",smell axillari dear,daniel greenfield tillman journal fellow freed...
10294,watch the exact moment paul in committed polit...,google interest fig linkedin edit stumbleupon ...,FAKE,"[watch, exact, moment, paul, committed, politi...","[google, interest, fig, linkedin, edit, stumbl...",watch exact moment paul commit polit suicid pl...,googl interest fig linkedin edit stumbleupon p...
3608,merry to go to paris in gesture of sympathy,us secretary of state john f merry said monday...,REAL,"[merry, go, paris, gesture, sympathy]","[us, secretary, state, john, f, merry, said, m...",merri go pari gestur sympathi,us secretari state john f merri said monday st...
10142,erie supporters on twitter erupt in anger agai...,— kaydee king kaydeeking november the lesson...,FAKE,"[erie, supporters, twitter, erupt, anger, dnc,...","[—, kaydee, king, kaydeeking, november, lesson...",eri support twitter erupt anger dnc tri warn,kayde king kaydeek novemb lesson tonight dem ...
875,the battle of new work why his primary matters,its primary day in new work and frontrunners a...,REAL,"[battle, new, work, primary, matters]","[primary, day, new, work, frontrunners, axilla...",battl new work primari matter,primari day new work frontrunn axillari clinto...
6903,tehran usa,\ni’m not an immigrant but my grandparents a...,FAKE,"[tehran, usa]","[’, immigrant, grandparents, years, ago, arriv...",tehran usa,immigr grandpar year ago arriv new work citi ...
7341,girl horrified it that the patches boyfriend t...,share his baylee luciani left screenshot of wh...,FAKE,"[girl, horrified, patches, boyfriend, left, fa...","[share, baylee, luciani, left, screenshot, bay...",girl horrifi patch boyfriend left facetim,share bayle luciani left screenshot bayle caug...
95,‘britain’s schindler’ lies at,a czech stockbroker who saved more than jewis...,REAL,"[‘, britain, ’, schindler, ’, lies]","[czech, stockbroker, saved, jewish, children, ...",britain schindler lie,czech stockbrok save jewish children gaze germ...
4869,act check plump and clinton at the commanderin...,axillary clinton and donald plump made some in...,REAL,"[act, check, plump, clinton, commanderinchief,...","[axillary, clinton, donald, plump, made, inacc...",act check plump clinton commanderinchief forum,axillari clinton donald plump made inaccur cla...
2909,an reportedly makes new push for cranium conce...,cranial negotiator reportedly have made a last...,REAL,"[reportedly, makes, new, push, cranium, conces...","[cranial, negotiator, reportedly, made, lastdi...",reportedli make new push cranium concess nucle...,cranial negoti reportedli made lastditch push ...


In [27]:
# Make training and test sets from the stemmed text: 80% train / 20% test.
# NOTE(review): the baseline split above uses test_size=0.3, so the two
# experiments are not evaluated on the same split.
X_train, X_test, y_train, y_test = train_test_split(df['text_stemmed'], y, test_size=0.2, random_state=666)

### <span style="color:dimgray">4.1.1. Using a count vectorizer</span> 

In [28]:
# Setting the count vectorizer (bag-of-words term counts on the stemmed text)
vect = CountVectorizer()
# Fit and transform the training data (this overwrites the baseline count_train)
count_train = vect.fit_transform(X_train)
# Transform the test set with the vocabulary learned on the training split
count_test = vect.transform(X_test)

In [29]:
# Candidate classifiers, all with their default hyper-parameters
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Multinomial NB', MultinomialNB()),
    ('SVM', SVC()),
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
]

results = []
names = []
scoring = 'accuracy'

# 5-fold cross-validation of every model on the count-vectorized stemmed text.
# `scoring` is passed explicitly so the metric reported below is the one named here.
for name, model in models:
    cv_scores = cross_val_score(model, count_train, y_train, cv=5,
                                scoring=scoring, n_jobs=-1)
    mean_score = round(np.mean(cv_scores), 2)
    results.append(mean_score)
    names.append(name)

# Build the summary from a dict so the Accuracy column stays numeric
# (np.column_stack would coerce the scores to strings alongside the names)
Models_comparison = pd.DataFrame({'Model': names, 'Accuracy': results})

print(Models_comparison)

                 Model Accuracy
0  Logistic Regression      0.9
1       Multinomial NB     0.86
2                  SVM     0.78
3        Random Forest      0.8
4              XGBoost     0.87


### <span style="color:dimgray">4.1.2. Using tfidf vectorizer</span> 

# Functions to keep just in case

In [None]:
# Creating the function to tokenize title and text

def tokenize(dataframe):
    """Tokenize the title and text columns with ``nltk.word_tokenize``.

    Adds two columns to ``dataframe`` in place: ``tokenized_titles`` (from
    ``title``) and ``tokenized_text`` (from ``text``), each holding the list of
    tokens for that row. Nothing is returned.

    NOTE(review): a function with the same name is already defined in
    section 4.1; running this cell replaces it.
    """
    # Each result depends on a single column, so apply the tokenizer to that
    # column directly rather than building a row object for every record with
    # DataFrame.apply(axis=1).
    dataframe['tokenized_titles'] = dataframe['title'].apply(lambda value: nltk.word_tokenize(value))
    dataframe['tokenized_text'] = dataframe['text'].apply(lambda value: nltk.word_tokenize(value))

In [None]:
def stop_words(dataframe):
    """Remove stop words from the tokenized titles and texts.

    Filters the token lists in the ``tokenized_titles`` and ``tokenized_text``
    columns of ``dataframe`` in place, dropping every token found in the
    module-level ``stop`` collection. Nothing is returned.

    NOTE(review): a function with the same name is already defined in
    section 4.1; running this cell replaces it.
    """
    # Operate on the frame that was passed in (not on the notebook-global `df`),
    # and test membership against a set rather than scanning the list per token.
    stop_set = set(stop)
    for column in ('tokenized_titles', 'tokenized_text'):
        dataframe[column] = dataframe[column].apply(
            lambda tokens: [token for token in tokens if token not in stop_set])

In [None]:
def penn2morphy(penntag):
    """Convert a Penn Treebank POS tag to a WordNet POS tag.

    Only the first two characters of ``penntag`` are looked at: ``NN`` maps to
    ``'n'``, ``JJ`` to ``'a'``, ``VB`` to ``'v'`` and ``RB`` to ``'r'``.
    Any other tag, and any value that cannot be sliced or looked up, falls back
    to ``'n'``.
    """
    morphy_tag = {'NN': 'n', 'JJ': 'a',
                  'VB': 'v', 'RB': 'r'}
    try:
        return morphy_tag[penntag[:2]]
    except (KeyError, TypeError):
        # KeyError: tag outside the four mapped families.
        # TypeError: value that cannot be sliced or hashed (e.g. None).
        # Catching only these keeps the 'n' fallback without also swallowing
        # unrelated errors such as KeyboardInterrupt, as a bare except would.
        return 'n'
    
def lemmatize_sent(text):
    """Lemmatize a sentence given as a plain string.

    The text is tokenized and POS-tagged; every token is lower-cased and
    lemmatized with the module-level WordNet lemmatizer, using the tag
    converted by ``penn2morphy``. Returns the list of resulting lemmas.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, penn_tag in tagged_tokens:
        lemmas.append(wnlt.lemmatize(token.lower(), pos=penn2morphy(penn_tag)))
    return lemmas

# Quick check of lemmatize_sent on a sample sentence
lemmatize_sent('My name was Jonathan')

In [None]:
# Creating the function to apply stemming to the text

#Porter Stemmer
def pst_stemmer(dataframe):
    """Apply Porter stemming to the tokenized titles and texts.

    Reads the token lists in ``tokenized_titles`` / ``tokenized_text`` and stores
    the stemmed token lists in ``titles_stemmed`` / ``text_stemmed`` on the same
    dataframe. Nothing is returned.
    """
    column_pairs = (('tokenized_titles', 'titles_stemmed'),
                    ('tokenized_text', 'text_stemmed'))
    for source, target in column_pairs:
        dataframe[target] = dataframe[source].apply(
            lambda tokens: list(map(pst.stem, tokens)))

#Snowball Stemmer
def stm_stemmer(dataframe):
    """Apply Snowball stemming to the tokenized titles and texts.

    Reads the token lists in ``tokenized_titles`` / ``tokenized_text`` and stores
    the stemmed token lists in ``titles_stemmed`` / ``text_stemmed`` - the same
    output columns that ``pst_stemmer`` writes. Nothing is returned.
    """
    def snowball_stems(tokens):
        # Stem every token of one row with the module-level Snowball stemmer
        return [stm.stem(token) for token in tokens]

    dataframe['titles_stemmed'] = dataframe['tokenized_titles'].apply(snowball_stems)
    dataframe['text_stemmed'] = dataframe['tokenized_text'].apply(snowball_stems)

#WordNet Lemmatizer
def wnlt_lemmatizer(dataframe):
    """Apply the WordNet lemmatizer to the tokenized titles and texts.

    Reads the token lists in ``tokenized_titles`` / ``tokenized_text`` and stores
    the lemmatized token lists in the new columns ``titles_lem`` / ``text_lem``.
    Each token is lemmatized without a part-of-speech argument. Nothing is returned.
    """
    def lemmatize_tokens(tokens):
        lemmas = []
        for token in tokens:
            lemmas.append(wnlt.lemmatize(token))
        return lemmas

    dataframe['titles_lem'] = dataframe['tokenized_titles'].apply(lemmatize_tokens)
    dataframe['text_lem'] = dataframe['tokenized_text'].apply(lemmatize_tokens)

In [None]:
# Creating the function to generate bigrams out of the tokenized texts

def bigram(dataframe):
    """Create bigrams from the lemmatized token lists.

    Reads the ``text_lem`` and ``titles_lem`` columns of ``dataframe`` (the
    columns written by ``wnlt_lemmatizer``) and stores, for every row, the list
    of consecutive token pairs in ``bigrams_text`` and ``bigrams_title``.
    Nothing is returned.
    """
    # Work on the frame that was passed in, not on the notebook-global `df`
    dataframe['bigrams_text'] = dataframe['text_lem'].apply(
        lambda tokens: list(nltk.ngrams(tokens, 2)))
    dataframe['bigrams_title'] = dataframe['titles_lem'].apply(
        lambda tokens: list(nltk.ngrams(tokens, 2)))

In [None]:
# Function to correct spelling mistakes

def correct_spelling(dataframe):
    """Correct spelling mistakes in the title and text columns using TextBlob.

    Every value of the ``title`` and ``text`` columns of ``dataframe`` is
    replaced, in place, by ``str(TextBlob(value).correct())``. Nothing is returned.

    NOTE(review): the dataframe output saved in section 4.1 shows "axillary"
    where the raw title reads "Hillary", which looks like this correction
    rewriting proper nouns - confirm the effect before enabling it.
    """
    # Both columns are read from the `dataframe` argument; the original text
    # branch referenced an undefined name `dataframes` and raised NameError.
    for column in ('title', 'text'):
        dataframe[column] = dataframe[column].apply(
            lambda value: str(TextBlob(value).correct()))


In [None]:
# Run the spelling correction on df; the title and text columns are overwritten in place
correct_spelling(df)

# <span style="color:darkblue">Thank you!</span> 

In [None]:
from IPython.display import Image
from IPython.core.display import HTML 
# Closing image, referenced by URL (presumably needs network access to render)
Image(url= "https://media.giphy.com/media/l0Ex0GlhTAYnwL2Cs/giphy.gif")