# Predicting the political bias of news articles

## Text Preprocessing Notebook

In [None]:
# https://github.com/shivam5992/textstat
!pip install textstat

### Load Text

In [50]:
# import pandas
import pandas as pd

# define csv read
def read_data(filename):
    # read in csv
    df = pd.read_csv(filename)
    
    # limit df content to bias, text, headline, and source
    df = df.loc[:, ['bias', 'text', 'headline', 'source', 'date']]
    
    # convert bias label to numbers
    df['bias'] = df['bias'].replace({'Left ': 1, 'Center ': 2, 'Right ': 3})
    
    # add sentence num count
    df['sentences'] = [t.count('.') for t in df['text']]
    
    # classify without center biased news
    # new = new[new.bias != '2']
    
    return df

In [51]:
new = read_data('news-corpus-df.csv')
new.head()

Unnamed: 0,bias,text,headline,source,date,sentences
0,1,"Jared Bernstein, a former chief economist to V...",b'How trend-riding Trump is taking credit for ...,Washington Post,2018-06-13,59
1,3,Liberals have opposed virtually every move Pre...,b'It\xe2\x80\x99s Trump\xe2\x80\x99s Economy Now',Wall Street Journal- Editorial,2018-06-13,9
2,2,CLOSE President Trump’s once bitter political ...,b'The Bubble: By undoing Obama accomplishments...,USA TODAY,2018-06-13,56
3,2,"The attorneys for Michael Cohen, President Don...",b'Trump Lawyer Michael Cohen\xe2\x80\x99s Atto...,Wall Street Journal- News,2018-06-13,7
4,1,Longtime Trump lawyer Michael Cohen is changin...,b'Reports suggest Michael Cohen is thinking of...,Vox,2018-06-13,47


In [52]:
# check the size of the current df
new.shape

(3162, 6)

### Check DF with Sentiment Data

In [53]:
# define csv read
def read_data2(filename):
    # read in csv
    df = pd.read_csv(filename, sep='\t', encoding='utf-8')
    
    #limit df content to bias, text, headline, and source
    df = df.loc[:, ['bias', 'text', 'headline', 'source', 'sentiment', 'magnitude']]
    
    return df

# load cleaned file with sentiment data
sent = read_data2('news-corpus-df-sent.csv')
sent.head()

Unnamed: 0,bias,text,headline,source,sentiment,magnitude
0,1,"Jared Bernstein, a former chief economist to V...",b'How trend-riding Trump is taking credit for ...,Washington Post,0.0,12.4
1,3,Liberals have opposed virtually every move Pre...,b'It\xe2\x80\x99s Trump\xe2\x80\x99s Economy Now',Wall Street Journal- Editorial,-0.1,1.0
2,2,CLOSE President Trump’s once bitter political ...,b'The Bubble: By undoing Obama accomplishments...,USA TODAY,0.0,17.799999
3,2,"The attorneys for Michael Cohen, President Don...",b'Trump Lawyer Michael Cohen\xe2\x80\x99s Atto...,Wall Street Journal- News,0.0,0.5
4,1,Longtime Trump lawyer Michael Cohen is changin...,b'Reports suggest Michael Cohen is thinking of...,Vox,0.0,13.2


In [54]:
# check the size of the current df
sent.shape

(3162, 6)

### Combine Date and Sentiment Information

In [55]:
def combine_df(df1, df2):
    # merge dfs
    combine = pd.merge(new, sent, how="outer", on="headline")
    
    #limit df content to bias, text, headline, and source
    combine = combine.loc[:, ['bias_x', 'text_x', 'headline', 'source_x', 'date', 'sentiment', 'magnitude', 'sentences']]
    
    #rename columns
    combine = combine.rename(index=str, columns={"bias_x": "bias", "text_x":"text", "source_x":"source"})
    
    #add text length column
    combine['length'] = [len(t.split()) for t in combine['text']]
    
    return combine
  
combine = combine_df(new, sent)
combine.head()

Unnamed: 0,bias,text,headline,source,date,sentiment,magnitude,sentences,length
0,1,"Jared Bernstein, a former chief economist to V...",b'How trend-riding Trump is taking credit for ...,Washington Post,2018-06-13,0.0,12.4,59,960
1,3,Liberals have opposed virtually every move Pre...,b'It\xe2\x80\x99s Trump\xe2\x80\x99s Economy Now',Wall Street Journal- Editorial,2018-06-13,-0.1,1.0,9,90
2,2,CLOSE President Trump’s once bitter political ...,b'The Bubble: By undoing Obama accomplishments...,USA TODAY,2018-06-13,0.0,17.799999,56,1154
3,2,"The attorneys for Michael Cohen, President Don...",b'Trump Lawyer Michael Cohen\xe2\x80\x99s Atto...,Wall Street Journal- News,2018-06-13,0.0,0.5,7,82
4,1,Longtime Trump lawyer Michael Cohen is changin...,b'Reports suggest Michael Cohen is thinking of...,Vox,2018-06-13,0.0,13.2,47,1115


In [56]:
# check for missing scores
missing_scores = combine[combine.text.isnull()]

# display results
missing_scores.reset_index(inplace=True,drop='index')
missing_scores.head()

# None!

Unnamed: 0,bias,text,headline,source,date,sentiment,magnitude,sentences,length


### Add Text Readability Measure

In [68]:
# get text readability
from textstat.textstat import textstat

def read_score(text):
  text = textstat.automated_readability_index(text.replace('\n',''))
  return text

In [62]:
read_score(combine['text'][1])

9.6

In [73]:
# add reading column
reading_list = [read_score(t) for t in combine.text]
combine["reading"] = reading_list
combine.head()

Unnamed: 0,bias,text,headline,source,date,sentiment,magnitude,sentences,length,reading
0,1,"Jared Bernstein, a former chief economist to V...",b'How trend-riding Trump is taking credit for ...,Washington Post,2018-06-13,0.0,12.4,59,960,12.4
1,3,Liberals have opposed virtually every move Pre...,b'It\xe2\x80\x99s Trump\xe2\x80\x99s Economy Now',Wall Street Journal- Editorial,2018-06-13,-0.1,1.0,9,90,9.6
2,2,CLOSE President Trump’s once bitter political ...,b'The Bubble: By undoing Obama accomplishments...,USA TODAY,2018-06-13,0.0,17.799999,56,1154,13.5
3,2,"The attorneys for Michael Cohen, President Don...",b'Trump Lawyer Michael Cohen\xe2\x80\x99s Atto...,Wall Street Journal- News,2018-06-13,0.0,0.5,7,82,15.3
4,1,Longtime Trump lawyer Michael Cohen is changin...,b'Reports suggest Michael Cohen is thinking of...,Vox,2018-06-13,0.0,13.2,47,1115,15.0


## Text Preprocessing on DF

In [74]:
# import key modules
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [75]:
# strip texts of punctuation, boilerplate, and stop words
def text_prepare(text):
    """
        text: a string
        return: modified initial string
    """
    text = text.lower()
    text = text.replace('\n',' ')
    
    letters = list(string.ascii_lowercase)
    numbers = ['0','1','2','3','4','5','6','7','8','9']
    banned = ["’","’","“","—","”","‘","–",'#','[','/','(',')','{','}','\\','[',']','|','@',',',';','+','-']
    banned = ''.join(banned) + string.punctuation + ''.join(numbers)
    boilerplate = ['  ','https','http','www', '’s', '―', '/', 'playback', 'get', 'mr', 'mrs', 'ms', 'dr', 'prof', 'news', 'report', 'unsubscribe', 'they', 'must', 'share', 'that', 'view', 'hide', 'copy', 'something', 'enlarge', 'reprint', 'read', '_', 'videos', 'autoplay', 'watched', 'press', '’ve', 'toggle', 'around', 'the', 's.', 'said', 'here©', 'ad', '#', 'andhis', 'click', 'r', 'device', 'contributed', 'advertisement', 'the washington', '&', 'follow', 'copyright', 'mrs.', 'photo', 'to', 'also', 'times', 'for', 'however', 'fox', 'this', 'copyright ©', 'ofs', 'just', 'wait', 'n’t', 'told', 'unsupported', 'i', 'caption', 'ms.', '’m', 'paste', '’re', 'replay', 'photos', 'mr.', '©', 'skip', 'watch', '2018', 'cut', 'llc', 'more', 'post', 'embed', 'blog', 'b.', 'associated', 'permission']
    stop_list = set(stopwords.words('english') + boilerplate + letters)
    
    translation_table = dict.fromkeys(map(ord, banned), ' ')
    text = text.translate(translation_table)
    text = re.sub(' +',' ',text)
    text = ' '.join([word for word in text.split() if word not in stop_list])
    return text

In [76]:
# shuffle df for random sampling
new = combine
new = new.sample(frac=1).reset_index(drop=True)
new.head()

Unnamed: 0,bias,text,headline,source,date,sentiment,magnitude,sentences,length,reading
0,1,Melania Trump appears to have plagiarized her ...,"b'Melania Trump Plagiarized Michelle Obama, a ...",Daily Beast,2016-07-19,0.0,13.1,51,901,11.4
1,1,The Federal Communications Commission took aim...,b'FCC plan would give Internet providers power...,Washington Post,2017-11-21,-0.1,11.0,39,957,16.4
2,1,This is not a radical idea. I live 50 miles so...,b'OPINION: Bernie Sanders: Why We Need Medicar...,New York Times,2017-09-13,0.0,3.0,18,379,12.6
3,3,Democrats indicated Sunday that the decision b...,b'Attorney General Jeff Sessions to testify; S...,Washington Times,2017-06-12,-0.1,9.9,86,1096,11.7
4,1,"""The California Highway Patrol takes pride in ...","b'LAPD chief, mayor warn against taking anti-T...",Los Angeles Times,2016-11-10,0.0,0.6,2,62,19.8


In [77]:
# rewrite df with cleaned text
for i in range(0, len(new)):
  new.at[i,'text'] = text_prepare(new.at[i,'text'])
  new.at[i,'headline'] = text_prepare(new.at[i,'headline'])
  
new.head()

Unnamed: 0,bias,text,headline,source,date,sentiment,magnitude,sentences,length,reading
0,1,melania trump appears plagiarized speech openi...,melania trump plagiarized michelle obama woman...,Daily Beast,2016-07-19,0.0,13.1,51,901,11.4
1,1,federal communications commission took aim sig...,fcc plan would give internet providers power c...,Washington Post,2017-11-21,-0.1,11.0,39,957,16.4
2,1,radical idea live miles south canadian border ...,opinion bernie sanders need medicare,New York Times,2017-09-13,0.0,3.0,18,379,12.6
3,3,democrats indicated sunday decision attorney g...,attorney general jeff sessions testify senate ...,Washington Times,2017-06-12,-0.1,9.9,86,1096,11.7
4,1,california highway patrol takes pride able saf...,lapd chief mayor warn taking anti trump protes...,Los Angeles Times,2016-11-10,0.0,0.6,2,62,19.8


In [78]:
# save cleaned file to csv
new.to_csv('news-corpus-df-clean.csv', sep='\t', encoding='utf-8')