# Predict political bias of news articles with linear models

## Text Preprocessing Notebook

### Load Text

In [5]:
# import pandas
import pandas as pd

# define csv read
def read_data(filename, min_len=500, max_len=10000):
    """Load the raw news corpus and keep only articles of moderate length.

    Parameters
    ----------
    filename : str or path-like
        CSV file with at least the columns ``text_len``, ``bias``, ``text``,
        ``headline`` and ``source``.
    min_len : int, default 500
        Rows whose ``text_len`` is below this value are dropped.
    max_len : int, default 10000
        Rows whose ``text_len`` is above this value are dropped.

    Returns
    -------
    pandas.DataFrame
        Columns ``bias``, ``text``, ``headline`` and ``source``. The labels
        ``'Left '``, ``'Center '`` and ``'Right '`` (stored with a trailing
        space) become 1, 2 and 3; any other label is left untouched. The
        surviving rows keep the index they had in the CSV.
    """
    # read in csv
    df = pd.read_csv(filename)

    # drop texts shorter than min_len or longer than max_len
    # (negating the "out of range" test keeps rows whose text_len is missing,
    # exactly like dropping the out-of-range rows by index would)
    too_short = df['text_len'] < min_len
    too_long = df['text_len'] > max_len
    df = df.loc[~(too_short | too_long)]

    # limit df content to bias, text, headline, and source; copy so that the
    # assignment below never writes into a slice of the full frame
    df = df.loc[:, ['bias', 'text', 'headline', 'source']].copy()

    # convert bias label to numbers; an explicit map avoids relying on the
    # silent object -> int downcasting that Series.replace is deprecating
    label_to_id = {'Left ': 1, 'Center ': 2, 'Right ': 3}
    df['bias'] = df['bias'].map(lambda label: label_to_id.get(label, label))

    # To classify without center-biased news, filter the result afterwards:
    #     new = new[new.bias != 2]

    return df

In [6]:
# load the raw corpus and preview its first rows
new = read_data(filename='news-corpus-df.csv')
new.head(n=5)

Unnamed: 0,bias,text,headline,source
0,1,"Jared Bernstein, a former chief economist to V...",b'How trend-riding Trump is taking credit for ...,Washington Post
1,3,Liberals have opposed virtually every move Pre...,b'It\xe2\x80\x99s Trump\xe2\x80\x99s Economy Now',Wall Street Journal- Editorial
2,2,CLOSE President Trump’s once bitter political ...,b'The Bubble: By undoing Obama accomplishments...,USA TODAY
3,2,"The attorneys for Michael Cohen, President Don...",b'Trump Lawyer Michael Cohen\xe2\x80\x99s Atto...,Wall Street Journal- News
4,1,Longtime Trump lawyer Michael Cohen is changin...,b'Reports suggest Michael Cohen is thinking of...,Vox


In [87]:
# check the size of the current df as (rows, columns)
new.shape

(2924, 4)

In [88]:
# shuffle df for random sampling
# a fixed random_state makes the shuffle, and every result derived from it,
# reproducible after a kernel restart
new = new.sample(frac=1, random_state=42).reset_index(drop=True)
new.head()

Unnamed: 0,bias,text,headline,source
0,3,"On Wednesday, Laura Ingraham tweeted out an il...",b'The Laura Ingraham Boycott Has Nothing To Do...,The Daily Wire
1,3,EDITOR'S NOTE: Orlando's mayor on Monday revis...,b'50 killed in shooting at Florida nightclub i...,Fox News
2,2,Ohio officials announced Wednesday that Libert...,b'How much hope for third-party presidential c...,Christian Science Monitor
3,1,Washington (CNN) A top White House aide called...,"b""WH aide: Bannon's comments 'grotesque'""",CNN (Web News)
4,3,As President Trump headed Tuesday to Puerto Ri...,b'Trump: San Juan mayor coming around on hurri...,Washington Times


In [89]:
# count the stories labelled center (bias == 2)
new[new['bias'].eq(2)].shape

(749, 4)

In [90]:
# cap every bias class at 800 rows so that no class dominates
# NOTE: a class with fewer than 800 rows is kept whole, so the classes are
# only exactly equal when each of them has at least 800 rows.
center = new.loc[new['bias'] == 2].iloc[:800]
right = new.loc[new['bias'] == 3].iloc[:800]
left = new.loc[new['bias'] == 1].iloc[:800]

# DataFrame.append was removed in pandas 2.0; pd.concat builds the same
# frame (center, then right, then left) with a fresh 0..n-1 index
new = pd.concat([center, right, left], ignore_index=True)

# check new df size
new.shape

(2349, 4)

## Text Preprocessing on DF

In [95]:
# import key modules
import ast
import functools
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [96]:
# strip texts of punctuation, boilerplate, and stop words
@functools.lru_cache(maxsize=None)
def _cleaning_resources():
    """Build, once, the translation table and stop-word set used by text_prepare."""
    # Characters replaced by a space: typographic quotes and dashes, every
    # ASCII punctuation mark and every ASCII digit.
    banned = "’“—”‘–" + string.punctuation + string.digits
    translation_table = dict.fromkeys(map(ord, banned), ' ')

    # Site boilerplate and filler words. Entries that contain a space, a
    # banned character or a digit can never equal a token produced after
    # translation and splitting; they are kept verbatim and are harmless.
    boilerplate = [
        '  ', 'https', 'http', 'www', '’s', '―', '/', 'playback', 'get', 'mr',
        'mrs', 'ms', 'dr', 'prof', 'news', 'report', 'unsubscribe', 'they',
        'must', 'share', 'that', 'view', 'hide', 'copy', 'something',
        'enlarge', 'reprint', 'read', '_', 'videos', 'autoplay', 'watched',
        'press', '’ve', 'toggle', 'around', 'the', 's.', 'said', 'here©',
        'ad', '#', 'andhis', 'click', 'r', 'device', 'contributed',
        'advertisement', 'the washington', '&', 'follow', 'copyright',
        'mrs.', 'photo', 'to', 'also', 'times', 'for', 'however', 'fox',
        'this', 'copyright ©', 'ofs', 'just', 'wait', 'n’t', 'told',
        'unsupported', 'i', 'caption', 'ms.', '’m', 'paste', '’re', 'replay',
        'photos', 'mr.', '©', 'skip', 'watch', '2018', 'cut', 'llc', 'more',
        'post', 'embed', 'blog', 'b.', 'associated', 'permission',
    ]
    letters = list(string.ascii_lowercase)
    stop_list = frozenset(stopwords.words('english') + boilerplate + letters)
    return translation_table, stop_list


def _decode_bytes_repr(text):
    """Return real text for bytes, or for a string holding the repr of bytes."""
    if isinstance(text, bytes):
        return text.decode('utf-8', errors='replace')
    # A string such as  b'It\xe2\x80\x99s ...'  is what str(bytes) produces.
    # Cleaning it as-is would leave the hex escapes behind as junk tokens
    # ("xe"), so parse the literal and decode the bytes it denotes.
    if re.fullmatch(r"""b(['"]).*\1""", text, flags=re.DOTALL):
        try:
            value = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # not a valid bytes literal after all: treat it as ordinary text
            return text
        if isinstance(value, bytes):
            return value.decode('utf-8', errors='replace')
    return text


def text_prepare(text):
    """
        text: a string; bytes, or a string holding the repr of a bytes
              object (text wrapped in b'...'), is decoded as UTF-8 first
        return: the lower-cased text with punctuation and digits removed and
                with stop words, boilerplate words and single letters
                dropped; the remaining words are joined by single spaces
    """
    translation_table, stop_list = _cleaning_resources()

    text = _decode_bytes_repr(text).lower()
    text = text.translate(translation_table)
    # split() already treats newlines and runs of spaces as one separator
    return ' '.join(word for word in text.split() if word not in stop_list)

In [97]:
# rewrite df with cleaned text
# Series.map cleans each column in one pass and, unlike a positional
# new.at[i, ...] loop, does not require the index to be exactly 0..len(new)-1
new['text'] = new['text'].map(text_prepare)
new['headline'] = new['headline'].map(text_prepare)

new.head()

Unnamed: 0,bias,text,headline,source
0,2,ohio officials announced wednesday libertarian...,much hope third party presidential candidates,Christian Science Monitor
1,2,british police arrest investigation london att...,parliament lockdown firearms incident,NPR News
2,2,close harvey weinstein lawyer usa today client...,harvey weinstein formally charged rape crimina...,USA TODAY
3,2,koch network using libre initiative thank numb...,koch brothers fund political ads praising dems...,The Hill
4,2,elected officials chaired dnc past many party ...,keith ellison future dnc,Christian Science Monitor


In [98]:
# save cleaned file to csv (tab-separated, UTF-8 encoded)
new.to_csv(
    'news-corpus-df-clean.csv',
    sep='\t',
    encoding='utf-8',
)

### Check DF with Sentiment Data

In [7]:
# define csv read
def read_data2(filename):
    """Read a tab-separated, UTF-8 corpus file and keep six columns.

    Returns a DataFrame holding, in this order, the columns ``bias``,
    ``text``, ``headline``, ``source``, ``sentiment`` and ``magnitude``;
    every other column found in the file is discarded.
    """
    wanted_columns = ['bias', 'text', 'headline', 'source', 'sentiment', 'magnitude']

    # read in csv
    frame = pd.read_csv(filename, sep='\t', encoding='utf-8')

    # limit df content to the wanted columns
    return frame[wanted_columns]

# load the cleaned corpus that carries sentiment data and preview it
sent = read_data2(filename='news-corpus-df-sent.csv')
sent.head(n=5)

Unnamed: 0,bias,text,headline,source,sentiment,magnitude
0,2,close president trump bitter political enemy m...,bubble undoing obama accomplishments trump let...,USA TODAY,0.0,17.799999
1,2,attorneys michael cohen president donald trump...,trump lawyer michael cohen xe attorneys expect...,Wall Street Journal- News,0.0,0.5
2,2,canada house commons monday unanimously condem...,canadian parliament condemns trump attacks tru...,The Hill,-0.3,6.5
3,2,washington reuters senate vote soon week legis...,senate vote zte ban defense bill week,Reuters,-0.1,1.4
4,2,unprecedented summit singapore donald trump ki...,trump kim begin new phase diplomacy,Wall Street Journal- News,0.1,1.4
