### Feature Engineering

This notebook covers various ways to create features from the fake news data set, including text analytics, sentiment, and readability scores.

In [1]:
import concurrent.futures
import math
import pickle
import random
import re
import string
import time
from collections import Counter

import numpy as np
import pandas as pd
import textstat

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
# One shared VADER sentiment analyzer; its polarity_scores method is mapped over
# the title and text columns in the scoring cells below.
sia = SIA()


# this allows jupyter to output more than one line in the notebook
from IPython.core.interactiveshell import InteractiveShell
# "all": every bare expression in a cell is displayed, not only the last one.
InteractiveShell.ast_node_interactivity = "all"



In [82]:
# Load the articles to featurize.
# Alternate input, currently disabled: pd.read_csv('data/combinedData.csv')
data = pd.read_csv(filepath_or_buffer='data_wang.csv')

In [83]:
# Preview the first rows and list the column names; both lines render because
# ast_node_interactivity is set to "all" in the first cell.
data.head()
data.columns

Unnamed: 0,dataset,title,author,text,url,label
0,Buzzfeed,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real
1,Buzzfeed,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real
2,Buzzfeed,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real
3,Buzzfeed,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real
4,Buzzfeed,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real


Index(['dataset', 'title', 'author', 'text', 'url', 'label'], dtype='object')

##### Are there any NA's in the data?

Text features don't work very well with NA's. We'll replace NA's with an empty string "" for now.

In [84]:
# Per-column flag: True when the column holds at least one missing value.
data.isnull().any()

dataset    False
title       True
author      True
text        True
url         True
label      False
dtype: bool

In [85]:
# Swap every missing cell for an empty string so the text-processing steps
# below are handed a string instead of NaN.
data = data.fillna(value="")

##### Creating the readability ease, grade, and sentiment. 
These computations are expensive, so we use parallel processing to improve the speed

In [86]:
%%time

# produces a generator operator
# running for the title of the data
# change max_workers depending on the number of cores in your CPU
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    values = data['title']
    reading_ease = executor.map(textstat.flesch_reading_ease, values, chunksize=100)
    reading_grade = executor.map(textstat.flesch_kincaid_grade, values, chunksize=100)
    sentiment = executor.map(sia.polarity_scores, values, chunksize=100)

Wall time: 5.71 s


In [87]:
%%time

# produces a generator operator
with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    values = data['text']
    reading_ease_body = executor.map(textstat.flesch_reading_ease, values, chunksize=100)
    reading_grade_body = executor.map(textstat.flesch_kincaid_grade, values, chunksize=100)
    sentiment_body = executor.map(sia.polarity_scores, values, chunksize=100)

Wall time: 1min 15s


In [88]:
%%time

reading_ease = pd.DataFrame({'readability':list(reading_ease)})
reading_grade = pd.DataFrame({'read_grade':list(reading_grade)})
sentiment = pd.DataFrame(list(sentiment))

reading_ease_body = pd.DataFrame({'readability_text':list(reading_ease_body)})
reading_grade_body = pd.DataFrame({'read_grade_text':list(reading_grade_body)})
sentiment_body = pd.DataFrame(list(sentiment_body))

Wall time: 42.9 ms


In [89]:
# Give the body-sentiment columns a "_text" suffix so they do not collide with
# the headline-sentiment columns when the frames are concatenated.
# Rename by column NAME, not by position: the column order of a frame built
# from a list of dicts depends on the pandas version (sorted keys in older
# releases, dict insertion order in newer ones), so assigning a positional list
# of names can silently attach the wrong label to each score.
sentiment_body = sentiment_body.rename(columns={
    'compound': 'compound_text',
    'neg': 'neg_text',
    'neu': 'neu_text',
    'pos': 'pos_text',
})

In [90]:
# Place the raw articles and all score frames side by side; rows are matched on the index.
feature_frames = [
    data,
    reading_ease,
    reading_grade,
    sentiment,
    reading_ease_body,
    reading_grade_body,
    sentiment_body,
]
df = pd.concat(feature_frames, axis=1)

In [91]:
# Check that the readability and sentiment columns were appended to the articles.
df.head()

Unnamed: 0,dataset,title,author,text,url,label,readability,read_grade,compound,neg,neu,pos,readability_text,read_grade_text,compound_text,neg_text,neu_text,pos_text
0,Buzzfeed,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real,56.93,6.8,0.0,0.0,1.0,0.0,67.18,9.1,0.9682,0.061,0.851,0.089
1,Buzzfeed,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real,62.34,6.8,-0.6597,0.435,0.565,0.0,46.0,13.1,-0.9882,0.095,0.86,0.045
2,Buzzfeed,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real,51.85,8.8,0.0,0.0,1.0,0.0,68.7,8.5,-0.7769,0.086,0.887,0.028
3,Buzzfeed,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real,69.79,6.0,-0.8271,0.554,0.446,0.0,55.07,11.7,-0.9912,0.087,0.864,0.049
4,Buzzfeed,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real,46.44,8.8,0.0,0.0,1.0,0.0,52.12,12.8,-0.7303,0.074,0.859,0.067


##### Implementing Basic Text Features

In [92]:
%%time

df['punctuation_title'] = df['title'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
df['punctuation_text'] = df['text'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
df['count_text'] = df['text'].apply(lambda x: len(str(x).split(" ")))
df['count_title'] = df['title'].apply(lambda x: len(str(x).split(" ")))
df['punctuation_ratio_text'] = df['punctuation_text']/df['count_text']
df['punctuation_ratio_title'] = df['punctuation_title']/df['count_title']

Wall time: 7.3 s


In [93]:
# Preview the frame and list its columns after the punctuation/count features were added.
df.head()
df.columns

Unnamed: 0,dataset,title,author,text,url,label,readability,read_grade,compound,neg,...,compound_text,neg_text,neu_text,pos_text,punctuation_title,punctuation_text,count_text,count_title,punctuation_ratio_text,punctuation_ratio_title
0,Buzzfeed,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real,56.93,6.8,0.0,0.0,...,0.9682,0.061,0.851,0.089,2,256,1152,6,0.222222,0.333333
1,Buzzfeed,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real,62.34,6.8,-0.6597,0.435,...,-0.9882,0.095,0.86,0.045,0,110,640,9,0.171875,0.0
2,Buzzfeed,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real,51.85,8.8,0.0,0.0,...,-0.7769,0.086,0.887,0.028,0,41,213,11,0.192488,0.0
3,Buzzfeed,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real,69.79,6.0,-0.8271,0.554,...,-0.9912,0.087,0.864,0.049,1,200,1158,10,0.172712,0.1
4,Buzzfeed,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real,46.44,8.8,0.0,0.0,...,-0.7303,0.074,0.859,0.067,2,112,583,8,0.19211,0.25


Index(['dataset', 'title', 'author', 'text', 'url', 'label', 'readability',
       'read_grade', 'compound', 'neg', 'neu', 'pos', 'readability_text',
       'read_grade_text', 'compound_text', 'neg_text', 'neu_text', 'pos_text',
       'punctuation_title', 'punctuation_text', 'count_text', 'count_title',
       'punctuation_ratio_text', 'punctuation_ratio_title'],
      dtype='object')

#### Specific Keyword/Phrase Types of Linguistic Features

First-person pronouns:
I, me, my, mine, we, us, our, ours.
We don't include "he" or "they", because the assumption is that many fake news articles take an "us against them" approach.


Contrasting conjunctions:
But, However, Nonetheless, Yet, Even so, Nevertheless, Still, Notwithstanding, Although, Though, Even though, Much as, No matter, Despite, In spite of, For all, Regardless of

In [94]:
%%time
# Tokenize the columns


text_columns = df[['title' ,'text']]
text_columns['title'] = text_columns['title'].str.lower()
text_columns['text'] = text_columns['text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Wall time: 1.58 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [95]:
%%time

with concurrent.futures.ProcessPoolExecutor(max_workers=4) as executor:
    titles_tokenize = text_columns['title'] 
    text_tokeinze = text_columns['text']
    titles_tokens = executor.map(nltk.word_tokenize, titles_tokenize, chunksize=100)
    text_tokens = executor.map(nltk.word_tokenize, text_tokeinze, chunksize=100)

Wall time: 16.4 s


In [96]:
%%time
titles_tokens = pd.DataFrame({'titles_tokens':list(titles_tokens)})
text_tokens = pd.DataFrame({'text_tokens':list(text_tokens)})

text_columns = pd.concat([titles_tokens, text_tokens], axis=1)

Wall time: 11 ms


In [97]:
# Inspect the token lists produced for the first few headlines and bodies.
text_columns.head()

Unnamed: 0,titles_tokens,text_tokens
0,"[the, impact, of, debates, ?, it, 's, debatable]","[with, the, hillary, clinton-donald, trump, de..."
1,"[details, emerge, about, nyc, bomb, suspect, a...","[as, police, today, captured, the, man, wanted..."
2,"[donald, trump, repeats, calls, for, police, p...","[one, day, after, explosive, devices, were, di..."
3,"[ny, ,, nj, bombings, suspect, charged, with, ...","[ahmad, khan, rahami, ,, earlier, named, a, pe..."
4,"[trump, surrogates, push, narrative, that, cli...","[donald, trump, 's, surrogates, and, leading, ..."


In [98]:
%%time

text_columns['title_counts'] = text_columns['titles_tokens'].apply(lambda x: Counter(x))
text_columns['text_counts'] = text_columns['text_tokens'].apply(lambda x: Counter(x))

Wall time: 2.33 s


In [99]:
# Single-token markers for contrasting conjunctions. Multi-word phrases from
# the list above are represented by one distinctive token: 'even' stands in
# for "even so"/"even though", 'spite' for "in spite of", 'regardless' for
# "regardless of".
# Every token appears exactly once. The ratio cell sums the count of each list
# entry, so a repeated entry would be counted once per repetition
# ('notwithstanding' used to be listed three times and was triple-counted).
string_list = ['but', 'however', 'nonetheless', 'yet',
               'even', 'nevertheless', 'still',
               'notwithstanding', 'although', 'though',
               'despite', 'spite',
               'regardless']

# First-person pronouns, singular and plural.
possessive_list = ['i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours']

In [100]:
%%time
text_columns['text_totals'] = text_columns['text_counts'].apply(lambda x: sum(x.values()))
text_columns['text_exclusive_ratio'] = text_columns['text_counts'].apply(lambda x: (sum(x.get(w, 0) for w in string_list)))
text_columns['text_exclusive_ratio'] = text_columns['text_exclusive_ratio']/text_columns['text_totals']

Wall time: 308 ms


In [101]:
# Number of first-person pronoun tokens in each body.
possessive_hits = text_columns['text_counts'].apply(
    lambda counts: sum(counts.get(w, 0) for w in possessive_list)
)

# Share of body tokens that are first-person pronouns.
# A body with no tokens gives 0/0 = NaN; report 0.0 instead, matching the
# punctuation ratios, which are also 0.0 for an empty text.
text_columns['possessive_ratio'] = (possessive_hits / text_columns['text_totals']).fillna(0.0)

In [102]:
# Show the token-level features next to the main feature frame before they are combined.
text_columns.head()
df.head()

Unnamed: 0,titles_tokens,text_tokens,title_counts,text_counts,text_totals,text_exclusive_ratio,possessive_ratio
0,"[the, impact, of, debates, ?, it, 's, debatable]","[with, the, hillary, clinton-donald, trump, de...","{'the': 1, 'impact': 1, 'of': 1, 'debates': 1,...","{'with': 2, 'the': 66, 'hillary': 2, 'clinton-...",1355,0.014022,0.002952
1,"[details, emerge, about, nyc, bomb, suspect, a...","[as, police, today, captured, the, man, wanted...","{'details': 1, 'emerge': 1, 'about': 1, 'nyc':...","{'as': 4, 'police': 5, 'today': 2, 'captured':...",717,0.008368,0.001395
2,"[donald, trump, repeats, calls, for, police, p...","[one, day, after, explosive, devices, were, di...","{'donald': 1, 'trump': 1, 'repeats': 1, 'calls...","{'one': 1, 'day': 1, 'after': 2, 'explosive': ...",252,0.003968,0.019841
3,"[ny, ,, nj, bombings, suspect, charged, with, ...","[ahmad, khan, rahami, ,, earlier, named, a, pe...","{'ny': 1, ',': 1, 'nj': 1, 'bombings': 1, 'sus...","{'ahmad': 1, 'khan': 1, 'rahami': 9, ',': 59, ...",1307,0.005356,0.010712
4,"[trump, surrogates, push, narrative, that, cli...","[donald, trump, 's, surrogates, and, leading, ...","{'trump': 1, 'surrogates': 1, 'push': 1, 'narr...","{'donald': 1, 'trump': 9, ''s': 8, 'surrogates...",691,0.004342,0.014472


Unnamed: 0,dataset,title,author,text,url,label,readability,read_grade,compound,neg,...,compound_text,neg_text,neu_text,pos_text,punctuation_title,punctuation_text,count_text,count_title,punctuation_ratio_text,punctuation_ratio_title
0,Buzzfeed,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real,56.93,6.8,0.0,0.0,...,0.9682,0.061,0.851,0.089,2,256,1152,6,0.222222,0.333333
1,Buzzfeed,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real,62.34,6.8,-0.6597,0.435,...,-0.9882,0.095,0.86,0.045,0,110,640,9,0.171875,0.0
2,Buzzfeed,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real,51.85,8.8,0.0,0.0,...,-0.7769,0.086,0.887,0.028,0,41,213,11,0.192488,0.0
3,Buzzfeed,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real,69.79,6.0,-0.8271,0.554,...,-0.9912,0.087,0.864,0.049,1,200,1158,10,0.172712,0.1
4,Buzzfeed,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real,46.44,8.8,0.0,0.0,...,-0.7303,0.074,0.859,0.067,2,112,583,8,0.19211,0.25


In [103]:
# Append the two ratio features to the main feature frame; rows are matched on the index.
ratio_features = text_columns[['text_exclusive_ratio', 'possessive_ratio']]
df2 = pd.concat([df, ratio_features], axis=1)

In [105]:
# Inspect the last rows of the combined frame, including the two new ratio columns.
df2.tail()

Unnamed: 0,dataset,title,author,text,url,label,readability,read_grade,compound,neg,...,neu_text,pos_text,punctuation_title,punctuation_text,count_text,count_title,punctuation_ratio_text,punctuation_ratio_title,text_exclusive_ratio,possessive_ratio
11437,Wang-PolitiFact,"U.S. Representative, Florida District 23",debbie-wasserman-schultz,President Obama has the most border patrols an...,an interview on MSNBC,real,-1.29,14.7,0.0,0.0,...,0.876,0.124,3,1,18,5,0.055556,0.6,0.0,0.0
11438,Wang-PolitiFact,Governor,rick-scott,Most of the newspapers that endorsed Alex Sink...,a debate,real,-47.99,20.2,0.0,0.0,...,0.714,0.286,0,1,12,1,0.083333,0.0,0.0,0.0
11439,Wang-PolitiFact,U.S. Senator,jeff-merkley,60 percent of the jobs lost in the 2008 recess...,a talk to the Human Services Coalition of Oregon,real,35.61,8.8,0.0,0.0,...,0.812,0.0,2,5,24,2,0.208333,1.0,0.0,0.0
11440,Wang-PolitiFact,U.S. House of Representatives,john-carter,About 40 percent of U.S. illegal immigrants ca...,a news article,real,33.58,9.6,0.0,0.0,...,0.819,0.053,2,5,26,4,0.192308,0.5,0.0,0.0
11441,Wang-PolitiFact,U.S. Senator,tim-kaine,"Gov. Pence said, inarguably, Vladimir Putin is...","the vice presidential debate in Farmville, Va.",real,35.61,8.8,0.0,0.0,...,0.791,0.209,2,4,13,2,0.307692,1.0,0.0,0.0


In [106]:
# df2.to_csv('data_Wang_expandedfeatures.csv', index=False) # this has the new ratios for possessive and exclusive words

In [7]:
# df = pd.read_csv('data12-05-2018.csv')

In [126]:
# Reload the saved feature table that includes the Wang-PolitiFact rows.
# The Wang-free subset (no_wang) is derived from this frame a few cells below
# and then written to 'data_expandedfeatures.csv'. It is therefore not read
# back here: that read was overwritten before any use and failed whenever the
# output file did not exist yet.
wang = pd.read_csv('data_Wang_expandedfeatures.csv')

In [127]:
# Look at rows 1000 to 2999 (by position) of the reloaded table.
wang.iloc[1000:3000]

Unnamed: 0,dataset,title,author,text,url,label,readability,read_grade,compound,neg,...,neu_text,pos_text,punctuation_title,punctuation_text,count_text,count_title,punctuation_ratio_text,punctuation_ratio_title,text_exclusive_ratio,possessive_ratio
1000,Buzzfeed,How to fix our disaster recovery strategy,,Six months after Hurricane Katrina struck New ...,http://www.politico.com/agenda/story/2016/09/h...,real,38.99,9.6,-0.6249,0.406,...,0.779,0.061,0,156,1248,7,0.125000,0.000000,0.007117,0.009964
1001,Buzzfeed,Schumer transfers millions to Dems in bid for ...,,Chuck Schumer is sitting on a mountain of cash...,http://www.politico.com/story/2016/09/chuck-sc...,real,86.71,3.7,0.0000,0.000,...,0.881,0.092,0,138,935,10,0.147594,0.000000,0.013274,0.007965
1002,Buzzfeed,"If you can't remember all your lies, you're te...",,"Truth be told, it isn’t really about Hillary a...",http://www.politico.com/story/2016/09/simon-sa...,real,94.15,2.9,-0.4215,0.219,...,0.811,0.100,3,130,914,11,0.142232,0.272727,0.013624,0.013624
1003,Buzzfeed,Government funding battle could SLIP to next w...,,Politico POLITICO's must-read briefing on what...,http://www.politico.com/tipsheets/playbook/201...,real,56.59,11.1,0.2960,0.086,...,0.865,0.094,7,1146,4043,26,0.283453,0.269231,0.003259,0.007130
1004,Buzzfeed,"George Soros to give $500 million to migrant, ...",,George Soros is pumping $500 million into migr...,http://www.politico.com/story/2016/09/george-s...,real,78.25,4.8,0.0000,0.000,...,0.773,0.170,2,47,387,10,0.121447,0.200000,0.002237,0.017897
1005,Buzzfeed,Worst. President. Ever.,,"As my 25th wedding anniversary approached, I t...",http://www.politico.com/magazine/story/2016/09...,real,34.59,9.2,-0.6249,0.672,...,0.815,0.091,3,290,2083,3,0.139222,1.000000,0.009209,0.008372
1006,Buzzfeed,"New York New Siena poll finds Clinton, Schumer...",,ALBANY — While Donald Trump and his supporters...,http://www.politico.com/states/new-york/albany...,real,99.57,2.9,0.0000,0.000,...,0.910,0.071,1,56,481,14,0.116424,0.071429,0.005405,0.000000
1007,Buzzfeed,Trump hits Clinton on Islamic State: ‘It is ti...,,Donald Trump called for a change Tuesday in ho...,http://www.politico.com/story/2016/09/trump-is...,real,85.69,4.0,0.0000,0.000,...,0.756,0.018,1,25,245,11,0.102041,0.090909,0.000000,0.013937
1008,Buzzfeed,Bono: ‘Trump is potentially the worst idea tha...,,Donald Trump is the antithesis of the United S...,http://www.politico.com/story/2016/09/bono-tru...,real,59.30,8.0,-0.6249,0.272,...,0.823,0.148,1,19,161,12,0.118012,0.083333,0.009852,0.024631
1009,Buzzfeed,Jack Welch outlines why he's supporting Trump,,Jack Welch may have to think twice about backi...,http://www.politico.com/story/2016/09/jack-wel...,real,81.29,3.7,0.4404,0.000,...,0.782,0.153,1,39,326,7,0.119632,0.142857,0.007481,0.017456


In [129]:
# Keep every row that does not come from the Wang-PolitiFact source.
# Filtering on the 'dataset' label replaces the hard-coded wang[:-2500], which
# was only correct while those rows were exactly the last 2500 of the file.
no_wang = wang[wang['dataset'] != 'Wang-PolitiFact']

In [135]:
# Persist the Wang-free feature table, without writing the index as a column.
no_wang.to_csv(path_or_buf='data_expandedfeatures.csv', index=False)

In [134]:
# Rows per source in each table; the two outputs should differ only by the Wang-PolitiFact rows.
no_wang['dataset'].value_counts()
wang['dataset'].value_counts()

Risdal-McIntire    6335
Buzzfeed           1627
Rosas-Celebrity     500
Rosas-News          480
Name: dataset, dtype: int64

Risdal-McIntire    6335
Wang-PolitiFact    2500
Buzzfeed           1627
Rosas-Celebrity     500
Rosas-News          480
Name: dataset, dtype: int64

In [132]:
# Element-wise comparison of the column labels of the two tables.
wang.columns == no_wang.columns


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True], dtype=bool)

In [125]:
# Show the first rows of df and df2 one after the other for comparison.
df.head()
df2.head()

Unnamed: 0,dataset,title,author,text,url,label,readability,read_grade,compound,neg,...,neu_text,pos_text,punctuation_title,punctuation_text,count_text,count_title,punctuation_ratio_text,punctuation_ratio_title,text_exclusive_ratio,possessive_ratio
0,Buzzfeed,The Impact of Debates? It's Debatable,GARY LANGER,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real,56.93,6.8,0.0,0.0,...,0.851,0.089,2,256,1152,6,0.222222,0.333333,0.014022,0.002952
1,Buzzfeed,Details Emerge About NYC Bomb Suspect Ahmad Kh...,Brian Ross Rhonda Schwartz Mike Levine Stephan...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real,62.34,6.8,-0.6597,0.435,...,0.86,0.045,0,110,640,9,0.171875,0.0,0.008368,0.001395
2,Buzzfeed,Donald Trump Repeats Calls for Police Profilin...,ALANA ABRAMSON,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real,51.85,8.8,0.0,0.0,...,0.887,0.028,0,41,213,11,0.192488,0.0,0.003968,0.019841
3,Buzzfeed,"NY, NJ Bombings Suspect Charged With Attempted...",EMILY SHAPIRO Aaron Katersky Josh Margolin Mik...,"Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real,69.79,6.0,-0.8271,0.554,...,0.864,0.049,1,200,1158,10,0.172712,0.1,0.005356,0.010712
4,Buzzfeed,Trump Surrogates Push Narrative That Clinton S...,Candace Smith,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real,46.44,8.8,0.0,0.0,...,0.859,0.067,2,112,583,8,0.19211,0.25,0.004342,0.014472


Unnamed: 0,dataset,title,text,url,label,readability,read_grade,compound,neg,neu,...,neu_text,pos_text,punctuation_title,punctuation_text,count_text,count_title,punctuation_ratio_text,punctuation_ratio_title,text_exclusive_ratio,possessive_ratio
0,Buzzfeed,The Impact of Debates? It's Debatable,With the Hillary Clinton-Donald Trump debates ...,http://abcnews.go.com/Politics/impact-debates-...,real,56.93,6.8,0.0,0.0,1.0,...,0.851,0.089,2,256,1152,6,0.222222,0.333333,0.014022,0.002952
1,Buzzfeed,Details Emerge About NYC Bomb Suspect Ahmad Kh...,As police today captured the man wanted for qu...,http://abcnews.go.com/US/source-suspect-wanted...,real,62.34,6.8,-0.6597,0.435,0.565,...,0.86,0.045,0,110,640,9,0.171875,0.0,0.008368,0.001395
2,Buzzfeed,Donald Trump Repeats Calls for Police Profilin...,One day after explosive devices were discovere...,http://abcnews.go.com/Politics/donald-trump-re...,real,51.85,8.8,0.0,0.0,1.0,...,0.887,0.028,0,41,213,11,0.192488,0.0,0.003968,0.019841
3,Buzzfeed,"NY, NJ Bombings Suspect Charged With Attempted...","Ahmad Khan Rahami, earlier named a person of i...",http://abcnews.go.com/US/bombing-incidences-ny...,real,69.79,6.0,-0.8271,0.554,0.446,...,0.864,0.049,1,200,1158,10,0.172712,0.1,0.005356,0.010712
4,Buzzfeed,Trump Surrogates Push Narrative That Clinton S...,Donald Trump's surrogates and leading supporte...,http://abcnews.go.com/Politics/trump-surrogate...,real,46.44,8.8,0.0,0.0,1.0,...,0.859,0.067,2,112,583,8,0.19211,0.25,0.004342,0.014472
