# A small-scale version of our pipeline

### Imports

In [75]:
import pandas as pd
import regex as re
from cleantext import clean
import pyarrow.feather as feather
from multiprocessing import Pool
import gc
import nltk
import itertools
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import *
import time
from collections import Counter
from sklearn.model_selection import train_test_split    # splitting the data

### Getting a proper data set

In [76]:
import pandas as pd
import regex as re
from cleantext import clean
data = pd.read_csv('https://raw.githubusercontent.com/several27/FakeNewsCorpus/master/news_sample.csv', dtype={"content": "string"})
data.head()

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary
0,0,141,awm.com,unreliable,http://awm.com/church-congregation-brings-gift...,Sometimes the power of Christmas will make you...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Church Congregation Brings Gift to Waitresses ...,Ruth Harris,,[''],,,
1,1,256,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,Zurich Times,,[''],,,
2,2,700,cnnnext.com,unreliable,http://www.cnnnext.com/video/18526/never-hike-...,Never Hike Alone: A Friday the 13th Fan Film U...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Never Hike Alone - A Friday the 13th Fan Film ...,,,[''],Never Hike Alone: A Friday the 13th Fan Film ...,,
3,3,768,awm.com,unreliable,http://awm.com/elusive-alien-of-the-sea-caught...,"When a rare shark was caught, scientists were ...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Elusive ‘Alien Of The Sea ‘ Caught By Scientis...,Alexander Smith,,[''],,,
4,4,791,bipartisanreport.com,clickbait,http://bipartisanreport.com/2018/01/21/trumps-...,Donald Trump has the unnerving ability to abil...,2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Trump’s Genius Poll Is Complete & The Results ...,Gloria Christie,,[''],,,


In [77]:
# remove everything with null type and set all types to either reliable or false
data = data[data['type'].isnull() == False]

### Preprocessing

In [78]:
pattern = re.compile(r"([\d]{1,2}[\/|\-][\d]{1,2}(?:[\/|\-][\d]{2,4})?|[\d]{2,4}[\/|\-][\d]{1,2}[\/|\-][\d]{1,2}|(?:january|february|march|april|may|june|july|august|september|october|november|december)[\s][\d]{1,2}[a-z][a-z](?:\s[\d]{2,4})|[\d][\d]\w?\w?\sof\s(?:january|february|march|april|may|june|july|august|september|october|november|december)(?:\s[\d]{2,4})?|(?:january|february|march|april|may|june|july|august|september|october|november|december)\s\d\d?\w?\w?,?(?:\s\d{2,4})?)")
def clean_string(st):
    s1 = pattern.sub("date",st)
    return clean(s1, lower=True,
                    no_line_breaks=True,
                    no_emails=True,
                    no_urls=True,
                    no_numbers=True,
                    no_punct=True,
                    lang="en",
                    replace_with_number="num",
                    replace_with_email="email",
                    replace_with_url="url")
def clean_dataframe(dataframe):
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(clean_string)
    end = time.time()
    print("cleaning took " + str(end - start) + " seconds")

In [79]:
def remove_english_stopwords(stopwords):
    def remove_stopwords(tokenlist):
        return filter(lambda x : x not in stopwords, tokenlist)
    return remove_stopwords

def stem_tokens():
    stemmer = PorterStemmer()
    def stem_tokenlist(tokenlist):
        return map(stemmer.stem, tokenlist)
    return stem_tokenlist

def tokenize():
    def tokenize_text(s):
        return list((map(nltk.word_tokenize, s)))
    return tokenize_text

def to_list():
    def turn_to_list(it):
        return list(it)
    return turn_to_list

def preprocess(dataframe):
    # tokenize content column
    #print("Tokenizing...")
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(nltk.word_tokenize)
    end = time.time()
    print("tokenizing took " + str(end - start) + " seconds")
    #tokens = list(itertools.chain.from_iterable(dataframe['content']))
    #vocabulary = set(tokens)
    # remove stopwords
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(remove_english_stopwords(stopwords.words('english')))
    end = time.time()
    print("removing stopwords took " + str(end - start) + " seconds")
    #tokens_no_stopwords = list(itertools.chain.from_iterable(dataframe['content']))
    #vocabulary_no_stopwords = set(tokens_no_stopwords)
    #print("Reduction rate of removing stopwords: " + str(1 - len(vocabulary_no_stopwords) / len(vocabulary)))
    # stem tokens
    start = time.time()
    dataframe['content'] = dataframe['content'].apply(stem_tokens())
    end = time.time()
    print("stemming took " + str(end - start) + " seconds")

    start = time.time()
    dataframe['content'] = dataframe['content'].apply(to_list())
    end = time.time()
    print("converting to list took" + str(end - start) + " seconds")

    #tokens_stem = list(itertools.chain.from_iterable(dataframe['content']))
    #print("Stemmed tokens = " + str(tokens_stem))
    #vocabulary_stem = set(tokens_stem)
    #print("Reduction rate of stemming: " + str(1 - len(vocabulary_stem)/len(vocabulary_no_stopwords)))

In [80]:
clean_dataframe(data)

cleaning took 0.9628560543060303 seconds


In [81]:
preprocess(data)

tokenizing took 0.26261401176452637 seconds
removing stopwords took 0.0008192062377929688 seconds
stemming took 0.00016498565673828125 seconds
converting to list took0.9268679618835449 seconds


In [82]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,domain,type,url,content,scraped_at,inserted_at,updated_at,title,authors,keywords,meta_keywords,meta_description,tags,summary
0,0,141,awm.com,unreliable,http://awm.com/church-congregation-brings-gift...,"[sometim, power, christma, make, wild, wonder,...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Church Congregation Brings Gift to Waitresses ...,Ruth Harris,,[''],,,
1,1,256,beforeitsnews.com,fake,http://beforeitsnews.com/awakening-start-here/...,"[awaken, num, strand, dna, reconnect, movi, re...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,AWAKENING OF 12 STRANDS of DNA – “Reconnecting...,Zurich Times,,[''],,,
2,2,700,cnnnext.com,unreliable,http://www.cnnnext.com/video/18526/never-hike-...,"[never, hike, alon, friday, 13th, fan, film, u...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Never Hike Alone - A Friday the 13th Fan Film ...,,,[''],Never Hike Alone: A Friday the 13th Fan Film ...,,
3,3,768,awm.com,unreliable,http://awm.com/elusive-alien-of-the-sea-caught...,"[rare, shark, caught, scientist, left, blunder...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Elusive ‘Alien Of The Sea ‘ Caught By Scientis...,Alexander Smith,,[''],,,
4,4,791,bipartisanreport.com,clickbait,http://bipartisanreport.com/2018/01/21/trumps-...,"[donald, trump, unnerv, abil, abil, creat, rea...",2018-01-25 16:17:44.789555,2018-02-02 01:19:41.756632,2018-02-02 01:19:41.756664,Trump’s Genius Poll Is Complete & The Results ...,Gloria Christie,,[''],,,


### Simple Model - Naive Bayes

In [83]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=lambda x : x)
X = vectorizer.fit_transform(data['content'])
y = data['type']

In [84]:
X_train, X_val, y_train, y_val = train_test_split(X, data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [85]:
print(X_train.toarray())

[[ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0  0]
 [10  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 2  0  0 ...  0  0  0]]


In [86]:
print(X_train.toarray().shape)

(190, 10927)


In [87]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)

In [88]:
pred_val = model.predict(X_val)


In [89]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, pred_val)

0.875

In [90]:
pred_test = model.predict(X_test)

In [91]:
accuracy_score(y_test, pred_test)

0.7083333333333334

### Simple Model - Logistic Regression

In [92]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer=lambda x : x)
X = vectorizer.fit_transform(data['content'])
y = data['type']

In [93]:
X_train, X_val, y_train, y_val = train_test_split(X, data['type'], test_size=0.2, random_state=0)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=0)

In [95]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [96]:
pred_val = model.predict(X_val)

In [97]:
from sklearn.metrics import accuracy_score
accuracy_score(y_val, pred_val)

0.75

In [98]:
pred_test = model.predict(X_test)

In [99]:
accuracy_score(y_test, pred_test)

0.8333333333333334