### Feature Engineering

This notebook covers several ways to create features from the fake news data set, including basic text analytics, sentiment scores, and readability scores.

In [5]:
import pandas as pd
import numpy as np
import re
import math
import pickle
import string
import textstat
import time
import random

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import concurrent.futures

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
# Shared VADER sentiment analyzer; its polarity_scores method is mapped over the text columns below.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be installed — confirm on a fresh environment.
sia = SIA()


# Display the value of every top-level expression in a cell, not only the last one
# (cells such as `data.head()` followed by `data.columns` rely on this).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [6]:
# Read the combined article data set; the path is relative to the notebook's working directory.
data = pd.read_csv('data/combinedData.csv')

In [12]:
# Preview the first rows and list the column names. Both values are displayed because
# InteractiveShell.ast_node_interactivity is set to "all" in the setup cell.
data.head()
data.columns

Unnamed: 0,title,author,text,url,label
0,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real
1,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real
2,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real
3,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real
4,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real


Index(['title', 'author', 'text', 'url', 'label'], dtype='object')

##### Are there any NA's in the data?

Text features don't work well with missing values (NAs), so we replace any NA with an empty string "" for now.

In [10]:
# Per column: True when at least one value is missing.
data.isna().any()

title     False
author    False
text      False
url       False
label     False
dtype: bool

In [9]:
# Replace every missing value with an empty string so the text-based features can be computed.
data = data.fillna(value="")

##### Computing reading ease, reading grade, and sentiment
These computations are expensive, so we use parallel processing to speed them up.

In [13]:
%%time

# produces a generator operator
# running for the title of the data
# change max_workers depending on the number of cores in your CPU
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    values = data['title']
    reading_ease = executor.map(textstat.flesch_reading_ease, values, chunksize=100)
    reading_grade = executor.map(textstat.flesch_kincaid_grade, values, chunksize=100)
    sentiment = executor.map(sia.polarity_scores, values, chunksize=100)

Wall time: 6.2 s


In [14]:
%%time

# produces a generator operator
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    values = data['text']
    reading_ease_body = executor.map(textstat.flesch_reading_ease, values, chunksize=100)
    reading_grade_body = executor.map(textstat.flesch_kincaid_grade, values, chunksize=100)
    sentiment_body = executor.map(sia.polarity_scores, values, chunksize=100)

Wall time: 1min 15s


In [15]:
%%time

reading_ease = pd.DataFrame({'readability':list(reading_ease)})
reading_grade = pd.DataFrame({'read_grade':list(reading_grade)})
sentiment = pd.DataFrame(list(sentiment))

reading_ease_body = pd.DataFrame({'readability_text':list(reading_ease_body)})
reading_grade_body = pd.DataFrame({'read_grade_text':list(reading_grade_body)})
sentiment_body = pd.DataFrame(list(sentiment_body))

Wall time: 150 ms


In [16]:
# Give the body-text sentiment columns a `_text` suffix so they do not collide with the
# title sentiment columns. Rename by key rather than by position: the column order of a
# DataFrame built from a list of dicts depends on the pandas version (sorted alphabetically
# in older releases, dict insertion order in newer ones), so assigning a positional list of
# names can silently attach the wrong label to each score.
sentiment_body = sentiment_body.rename(columns={
    'compound': 'compound_text',
    'neg': 'neg_text',
    'neu': 'neu_text',
    'pos': 'pos_text',
})

In [18]:
# Put the original articles and all title/body feature frames side by side in one frame.
# NOTE(review): axis=1 aligns rows on the index, so this assumes `data` and every feature
# frame share the same default integer index — confirm if `data` is ever filtered first.
df = pd.concat([data, reading_ease, reading_grade, sentiment, reading_ease_body, reading_grade_body, sentiment_body], axis=1)

In [19]:
# Spot-check the combined frame: original columns followed by the readability and sentiment features.
df.head()

Unnamed: 0,title,author,text,url,label,readability,read_grade,compound,neg,neu,pos,readability_text,read_grade_text,compound_text,neg_text,neu_text,pos_text
0,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real,56.93,6.8,0.0,0.0,1.0,0.0,67.18,9.1,0.9682,0.061,0.851,0.089
1,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real,62.34,6.8,-0.6597,0.435,0.565,0.0,46.0,13.1,-0.9882,0.095,0.86,0.045
2,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real,51.85,8.8,0.0,0.0,1.0,0.0,68.7,8.5,-0.7769,0.086,0.887,0.028
3,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real,69.79,6.0,-0.8271,0.554,0.446,0.0,55.07,11.7,-0.9912,0.087,0.864,0.049
4,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real,46.44,8.8,0.0,0.0,1.0,0.0,52.12,12.8,-0.7303,0.074,0.859,0.067


##### Implementing Basic Text Features

In [22]:
%%time

df['punctuation_title'] = df['title'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
df['punctuation_text'] = df['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
df['count_text'] = df['text'].apply(lambda x: len(str(x).split(" ")))
df['count_title'] = df['title'].apply(lambda x: len(str(x).split(" ")))
df['punctuation_ratio_text'] = df['punctuation_text']/df['count_text']
df['punctuation_ratio_title'] = df['punctuation_title']/df['count_title']

Wall time: 11.6 s


In [23]:
# Spot-check the frame again, now including the punctuation and token-count features.
df.head()

Unnamed: 0,title,author,text,url,label,readability,read_grade,compound,neg,neu,...,compound_text,neg_text,neu_text,pos_text,punctuation_title,punctuation_text,count_text,count_title,punctuation_ratio_text,punctuation_ratio_title
0,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real,56.93,6.8,0.0,0.0,1.0,...,0.9682,0.061,0.851,0.089,2,256,1152,6,0.222222,0.333333
1,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real,62.34,6.8,-0.6597,0.435,0.565,...,-0.9882,0.095,0.86,0.045,0,110,640,9,0.171875,0.0
2,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real,51.85,8.8,0.0,0.0,1.0,...,-0.7769,0.086,0.887,0.028,0,41,213,11,0.192488,0.0
3,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real,69.79,6.0,-0.8271,0.554,0.446,...,-0.9912,0.087,0.864,0.049,1,200,1158,10,0.172712,0.1
4,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real,46.44,8.8,0.0,0.0,1.0,...,-0.7303,0.074,0.859,0.067,2,112,583,8,0.19211,0.25
