# Predicting the political bias of news articles

## Text Processing with the Webhose.io Crawl

In [None]:
# https://github.com/shivam5992/textstat
!pip install textstat

### Load Text

In [49]:
# import pandas
import pandas as pd

# define csv read
def read_data(filename):
    # read in csv
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')
    
    # convert bias label to numbers
    df['bias'] = df['bias'].replace({'Left': 1, 'Center': 2, 'Right': 3})
    
    # add sentence num count
    df['sentences'] = [t.count('.') for t in df['text']]
    
    #drop first column (index)
    df = df.drop(df.columns[0], axis=1)
    
    return df

In [51]:
combine = read_data('webhose_crawl_2018-06-22.csv')
combine.head()

Unnamed: 0,bias,source,headline,text,date,sentences
0,1,New York Times,Libyan Forces Advance Into Eastern City of Der...,Libyan Forces Advance Into Eastern City of Der...,2018-06-04,16
1,1,New York Times,First Saudi Women Receive Driving Licenses Ami...,First Saudi Women Receive Driving Licenses Ami...,2018-06-04,35
2,1,New York Times,Boulder-Size Asteroid Disintegrated Harmlessly...,Boulder-Size Asteroid Disintegrated Harmlessly...,2018-06-04,24
3,1,New York Times,‘There There’ is an Energetic Revelation of a ...,Image Credit Alessandra Montalto/The New York ...,2018-06-05,58
4,1,New York Times,"Transcript of ‘Charm City,’ Part 1: Baltimore ...","Transcript of ‘Charm City,’ Part 1: Baltimore ...",2018-06-04,541


In [52]:
# check the size of the current df
combine.shape

(31184, 6)

### Add Text Readability Measure

In [53]:
# get text readability
from textstat.textstat import textstat

def read_score(text):
  text = textstat.automated_readability_index(text.replace('\n',''))
  return text

In [54]:
read_score(combine['text'][1])

15.4

In [55]:
# add reading column
reading_list = [read_score(t) for t in combine.text]
combine["reading"] = reading_list

In [56]:
#add text length column
combine['length'] = [len(t.split()) for t in combine['text']]

In [57]:
combine.head()

Unnamed: 0,bias,source,headline,text,date,sentences,reading,length
0,1,New York Times,Libyan Forces Advance Into Eastern City of Der...,Libyan Forces Advance Into Eastern City of Der...,2018-06-04,16,14.8,403
1,1,New York Times,First Saudi Women Receive Driving Licenses Ami...,First Saudi Women Receive Driving Licenses Ami...,2018-06-04,35,15.4,800
2,1,New York Times,Boulder-Size Asteroid Disintegrated Harmlessly...,Boulder-Size Asteroid Disintegrated Harmlessly...,2018-06-04,24,11.9,418
3,1,New York Times,‘There There’ is an Energetic Revelation of a ...,Image Credit Alessandra Montalto/The New York ...,2018-06-05,58,10.7,1078
4,1,New York Times,"Transcript of ‘Charm City,’ Part 1: Baltimore ...","Transcript of ‘Charm City,’ Part 1: Baltimore ...",2018-06-04,541,6.5,6620


In [61]:
#sources
set(combine.source.tolist())

{'Breitbart',
 'CNN',
 'Fox News',
 'Huffington Post',
 'NPR',
 'New York Times',
 'Reuters',
 'USA Today',
 'Washington Times'}

## Text Preprocessing on DF

In [58]:
# import key modules
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [62]:
# strip texts of punctuation, boilerplate, and stop words
def text_prepare(text):
    """
        text: a string
        return: modified initial string
    """
    text = text.lower()
    text = text.replace('\n',' ')
    
    letters = list(string.ascii_lowercase)
    numbers = ['0','1','2','3','4','5','6','7','8','9']
    banned = ["’","’","“","—","”","‘","–",'#','[','/','(',')','{','}','\\','[',']','|','@',',',';','+','-']
    banned = ''.join(banned) + string.punctuation + ''.join(numbers)
    boilerplate = ['  ','https','http','www', '’s', '―', '/', 'playback', 'image', 'credit', 'transcript', 'get', 'mr', 'mrs', 'ms', 'dr', 'prof', 'news', 'report', 'unsubscribe', 'they', 'must', 'share', 'that', 'view', 'hide', 'copy', 'something', 'enlarge', 'reprint', 'read', '_', 'videos', 'autoplay', 'watched', 'press', '’ve', 'toggle', 'around', 'the', 's.', 'said', 'here©', 'ad', '#', 'andhis', 'click', 'r', 'device', 'contributed', 'advertisement', 'the washington', '&', 'follow', 'copyright', 'mrs.', 'photo', 'to', 'also', 'times', 'for', 'however', 'fox', 'this', 'copyright ©', 'ofs', 'just', 'wait', 'n’t', 'told', 'unsupported', 'i', 'caption', 'ms.', '’m', 'paste', '’re', 'replay', 'photos', 'mr.', '©', 'skip', 'watch', '2018', 'cut', 'llc', 'more', 'post', 'embed', 'blog', 'b.', 'associated', 'permission']
    sources = ['breitbart', 'cnn', 'npr', 'huffington', 'post', 'new', 'york', 'times', 'usa', 'today', 'reuters', 'washington', 'associated', 'press']
    stop_list = set(stopwords.words('english') + boilerplate + sources + letters)
    
    translation_table = dict.fromkeys(map(ord, banned), ' ')
    text = text.translate(translation_table)
    text = re.sub(' +',' ',text)
    text = ' '.join([word for word in text.split() if word not in stop_list])
    return text

In [63]:
# shuffle df for random sampling
new = combine
new = new.sample(frac=1).reset_index(drop=True)
new.head()

Unnamed: 0,bias,source,headline,text,date,sentences,reading,length
0,3,Washington Times,Group sues over county permit for oil refinery...,"Print By - Associated Press - Thursday, June 1...",2018-06-15,10,13.2,145
1,3,Breitbart,Man Claims He Was Asked to Leave Restaurant fo...,Breitbart: Man Claims He Was Asked to Leave Re...,2018-06-10,17,10.8,360
2,1,New York Times,Will AT&T Be Able to Handle HBO?,Media analysts and tech-industry prognosticato...,2018-06-14,73,10.2,1231
3,1,New York Times,5 Summer Skiing Getaways in the Southern Hemis...,"Summer is a popular time for beach vacations, ...",2018-06-05,27,11.1,513
4,1,New York Times,"Fact Check: Corey Stewart, Republicans and the...","Fact Check: Corey Stewart, Republicans and the...",2018-06-14,80,11.5,1173


In [67]:
# rewrite df with cleaned text
for i in range(0, len(new)):
  new.at[i,'text'] = text_prepare(new.at[i,'text'])
  new.at[i,'headline'] = text_prepare(str(new.at[i,'headline']))
  
new.head()

Unnamed: 0,bias,source,headline,text,date,sentences,reading,length
0,3,Washington Times,group sues county permit oil refinery near park,print thursday june bismarck ap dakota resourc...,2018-06-15,10,13.2,145
1,3,Breitbart,man claims asked leave restaurant wearing larg...,man claims asked leave restaurant wearing larg...,2018-06-10,17,10.8,360
2,1,New York Times,able handle hbo,media analysts tech industry prognosticators l...,2018-06-14,73,10.2,1231
3,1,New York Times,summer skiing getaways southern hemisphere,summer popular time beach vacations sun water ...,2018-06-05,27,11.1,513
4,1,New York Times,fact check corey stewart republicans fringe right,fact check corey stewart republicans fringe ri...,2018-06-14,80,11.5,1173


In [68]:
# save cleaned file to csv
new.to_csv('webhose-corpus-df-clean.csv', sep='\t', encoding='utf-8')

## Separate Clean for ShortText

### Load Text

In [80]:
# import pandas
import pandas as pd

# define csv read
def read_data(filename):
    # read in csv
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')

    #limit df content to bias, text, headline, and source
    df = df.loc[:, ['bias', 'text']]    
    
    return df

In [81]:
newer = read_data('webhose_crawl_2018-07-04B.csv')
newer.head()

Unnamed: 0,bias,text
0,Left,Libyan Forces Advance Into Eastern City of Der...
1,Left,First Saudi Women Receive Driving Licenses Ami...
2,Left,Boulder-Size Asteroid Disintegrated Harmlessly...
3,Left,Image Credit Alessandra Montalto/The New York ...
4,Left,"Transcript of ‘Charm City,’ Part 1: Baltimore ..."


### Text Processing

In [82]:
# strip texts of punctuation, boilerplate, and stop words
def text_prepare_var(text):
    """
        text: a string
        return: modified initial string
    """
    text = text.replace('\n',' ')
    
    letters = list(string.ascii_lowercase)
    numbers = ['0','1','2','3','4','5','6','7','8','9']
    banned = ["’","’","“","—","”","‘","–",'#','[','/','(',')','{','}','\\','[',']','|','@',',',';','+','-']
    banned = ''.join(banned) + ''.join(numbers)
    boilerplate = ['  ','https','http','www', '’s', '―', '/', 'playback', 'image', 'credit', 'transcript', 'get', 'by', 'mr', 'mrs', 'ms', 'dr', 'prof', 'news', 'report', 'unsubscribe', 'they', 'must', 'share', 'that', 'view', 'hide', 'copy', 'reporting', 'writing', 'editing', 'advertisement', 'something', 'enlarge', 'reprint', 'read', '_', 'videos', 'autoplay', 'watched', 'press', '’ve', 'toggle', 'around', 'the', 's.', 'said', 'here©', 'ad', '#', 'andhis', 'click', 'r', 'device', 'contributed', 'advertisement', 'the washington', '&', 'follow', 'copyright', 'mrs.', 'photo', 'to', 'also', 'times', 'for', 'however', 'fox', 'this', 'copyright ©', 'ofs', 'just', 'wait', 'n’t', 'told', 'unsupported', 'i', 'caption', 'ms.', '’m', 'paste', '’re', 'replay', 'photos', 'mr.', '©', 'skip', 'watch', '2018', 'cut', 'llc', 'more', 'post', 'embed', 'blog', 'b.', 'associated', 'permission']
    sources = ['Breitbart', 'CNN', 'NPR', 'Huffington', 'Post', 'New', 'York', 'Times', 'USA', 'Today', 'Reuters', 'Washington', 'Associated', 'Press']
    stop_list = set(stopwords.words('english') + boilerplate + sources + letters)
    
    translation_table = dict.fromkeys(map(ord, banned), ' ')
    text = text.translate(translation_table)
    text = re.sub(' +',' ',text)
    text = ' '.join([word for word in text.split() if word not in stop_list])
    text = text.split('.')
    return text

In [83]:
# rewrite df with cleaned text
for i in range(0, len(newer)):
  newer.at[i,'text'] = text_prepare_var(newer.at[i,'text'])
  
newer.head()

Unnamed: 0,bias,text
0,Left,[Libyan Forces Advance Into Eastern City Derna...
1,Left,[First Saudi Women Receive Driving Licenses Am...
2,Left,[Boulder Size Asteroid Disintegrated Harmlessl...
3,Left,[Image Credit Alessandra Montalto The A modest...
4,Left,[Transcript Charm City Part : Baltimore After ...


In [84]:
#assess results
newer['text'][0]

['Libyan Forces Advance Into Eastern City Derna By June BENGHAZI Libya Libyan forces advanced several neighborhoods Derna Monday stepping military campaign oust rivals eastern city spokesman residents said',
 ' The advance followed heavy shelling air strikes recent weeks Libyan National Army LNA eastern based force loyal Khalifa Haftar launched ground campaign Derna',
 ' The LNA long encircled city last eastern Libya elude control',
 ' It held Derna Protection Forces formerly known Derna Mujahideen Shura Council coalition Islamists anti Haftar combatants',
 " Early Monday LNA forces entered Derna eastern western coastal roads took control Shiha Bab Tobruk districts advancing within one kilometer port one operation's commanders Salem al Rafadi Reuters",
 ' Photos shared resident appeared show soldiers military vehicles advancing along largely deserted streets',
 ' Haftar says men fighting rid Derna city km miles border Egypt "terrorists"',
 ' Opponents accuse subjecting city urban warfa

In [86]:
# save cleaned file to csv
newer.to_csv('webhose-shortText.csv', sep='\t', encoding='utf-8')