# Topic Modeling of Political News Articles

### Text Preprocessing

In [135]:
# import required modules
import nltk
import pandas as pd
from gensim import corpora, models, similarities

In [117]:
# Load the news article corpus from CSV into a DataFrame.
# The path is relative, so the file must sit in the notebook's working directory.
df = pd.read_csv('news-corpus-df.csv')

In [118]:
# Minimum value of the text_len column an article needs to be kept.
MIN_TEXT_LEN = 250

# Drop articles whose text_len is below MIN_TEXT_LEN and keep only the
# article publication date, source bias, and article text columns.
# Rows with a missing text_len compare as "not too short" and are kept.
too_short = df['text_len'] < MIN_TEXT_LEN
new_df = df.loc[~too_short, ['date', 'bias', 'text']]
new_df.head()

Unnamed: 0,date,bias,text
0,2018-06-13,Left,"Jared Bernstein, a former chief economist to V..."
1,2018-06-13,Right,Liberals have opposed virtually every move Pre...
2,2018-06-13,Center,CLOSE President Trump’s once bitter political ...
3,2018-06-13,Center,"The attorneys for Michael Cohen, President Don..."
4,2018-06-13,Left,Longtime Trump lawyer Michael Cohen is changin...


In [182]:
# import modules from nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Build the stop word list: NLTK's English stop words plus corpus-specific
# terms (names, places, filler) that should not end up in any topic.
# NOTE(review): 'bill' and 'un' are also ordinary tokens in this corpus,
# not only parts of names -- confirm that removing them is intended.
political_names = ['rosenstein','assad', 'conway', 'cruz', 'jones', 'moore', 'franken', 'sanders', 'roy', 'conyers', 'putin', 'mccain', 'obama', 'jong', 'un', 'gates', 'nunes', 'powell', 'clinton', 'bill', 'trump', 'comey', 'kim', 'sessions', 'flynn', 'mueller', 'mr', 'mrs', 'hillary', 'tillerson', 'gorsuch', 'bannon', 'manafort', 'mcconnell', 'spicer', 'pence', 'cohen']
geography = ['south','north','korea','puerto','rico','russia','russian','pyongyang']
specific = ['percent']
# NOTE(review): jockers_stop is not defined in any cell visible here; it must
# already exist in the kernel as a list for this concatenation to work.
# A set is used so membership tests during preprocessing are cheap.
stop_list = set(stopwords.words('english') + jockers_stop + political_names + geography + specific)

# Create a tokenizer that splits text into runs of word characters (\w+),
# which discards punctuation and whitespace.
re_tokenizer = RegexpTokenizer(r'\w+')
    
# very simple preprocessing 
def preprocess(text):
    """Lower-case, tokenize, and stop-word-filter a collection of documents.

    Parameters
    ----------
    text : iterable of str
        The documents to process. Any iterable works (list, tuple,
        pandas Series, generator); items are visited in iteration order.

    Returns
    -------
    list of list of str
        One token list per input document, in input order. Tokens come from
        the module-level ``re_tokenizer``; any token present in the
        module-level ``stop_list`` is removed.
    """
    collection = []

    # Iterate over the documents themselves rather than over positions:
    # ``text[i]`` is a label lookup on a pandas Series (and fails once rows
    # have been filtered out), and ``len()`` rules out generators.
    for document in text:
        tokens = re_tokenizer.tokenize(document.lower())
        collection.append([token for token in tokens if token not in stop_list])

    return collection

In [183]:
class TopicModel:
    """Prepares data for topic modeling with gensim for one source bias.

    Intended call order: ``edit()`` -> ``create(name)`` -> ``tfidf_model()``
    -> ``lsi(num)``. Each step stores its result on the instance so the next
    step can use it.

    Results live in attributes whose names differ from the method names
    (``tokens``, ``lsi_model``). Assigning a result to an attribute named
    like the method would replace the bound method on the instance and make
    a second call fail with ``TypeError``.
    """

    def __init__(self, init_bias, data=None):
        """Select the rows of one bias group and collect their texts.

        Parameters
        ----------
        init_bias : str
            Bias label to select, e.g. ``'Left'``.
        data : pandas.DataFrame, optional
            Frame with ``bias`` and ``text`` columns. Defaults to the
            notebook-level ``new_df``.
        """
        source = new_df if data is None else data
        # Compare whitespace-stripped labels on both sides so that stored
        # values such as 'Left ' and 'Left' select the same rows.
        self.criteria = source['bias'].str.strip() == init_bias.strip()
        self.bias = source[self.criteria]
        self.text = self.bias['text'].tolist()

    def get_info(self):
        """Return a preview: head of the row mask, head of the subset, first two texts."""
        return self.criteria.head(), self.bias.head(), self.text[0:2]

    def _require(self, attribute, step):
        """Raise a clear error when a pipeline step is called out of order."""
        if getattr(self, attribute, None) is None:
            raise RuntimeError('call ' + step + ' before this step')

    # Run preprocessing over texts
    def edit(self):
        """Run ``preprocess`` over the selected texts and store the token lists in ``self.tokens``."""
        self.tokens = preprocess(self.text)

    def create(self, init_bias):
        """Build the gensim dictionary and bag-of-words corpus.

        The dictionary is also saved to ``'/tmp/' + init_bias + 'dict'``
        (hard-coded location). Returns the bag-of-words corpus, a list with
        one ``doc2bow`` result per document.
        """
        self._require('tokens', 'edit()')
        self.dict = corpora.Dictionary(self.tokens)
        self.dict.save('/tmp/'+init_bias+'dict')
        self.corpus = [self.dict.doc2bow(text) for text in self.tokens]
        return self.corpus

    # initialize a TFIDF model
    def tfidf_model(self):
        """Fit a TF-IDF model on the corpus; return ``(model, transformed corpus)``."""
        self._require('corpus', 'create()')
        self.tfidf = models.TfidfModel(self.corpus)
        self.corpus_tfidf = self.tfidf[self.corpus]
        return self.tfidf, self.corpus_tfidf

    # initialize an LSI transformation
    def lsi(self, num):
        """Fit an LSI model with ``num`` topics on the TF-IDF corpus.

        Returns ``(model, transformed corpus)``; the model is also stored
        as ``self.lsi_model`` so this method can be called again with a
        different ``num``.
        """
        self._require('corpus_tfidf', 'tfidf_model()')
        self.lsi_model = models.LsiModel(self.corpus_tfidf, id2word=self.dict, num_topics=num)
        self.corpus_lsi = self.lsi_model[self.corpus_tfidf]
        return self.lsi_model, self.corpus_lsi

In [185]:
# Topic Model for Liberal Media
# Recurring themes in the printed topics: health care, china, iran
# The steps must run in this order: tokenize -> dictionary and bag-of-words
# corpus -> TF-IDF -> 10-topic LSI model.
liberal = TopicModel('Left')
liberal.edit()
liberal.create('Left')
liberal.tfidf_model()
liberal_model, liberal_model_corpus = liberal.lsi(10)
liberal_model.print_topics()

[(0,
  '0.094*"republicans" + 0.093*"senate" + 0.089*"house" + 0.087*"tax" + 0.086*"health" + 0.080*"democrats" + 0.078*"campaign" + 0.078*"fbi" + 0.077*"care" + 0.073*"investigation"'),
 (1,
  '-0.252*"tax" + -0.237*"health" + -0.226*"obamacare" + -0.199*"care" + -0.186*"repeal" + -0.157*"insurance" + 0.151*"fbi" + -0.150*"republicans" + -0.146*"medicaid" + 0.137*"investigation"'),
 (2,
  '0.230*"fbi" + 0.223*"investigation" + -0.220*"nuclear" + -0.170*"korean" + -0.132*"missile" + 0.123*"intelligence" + -0.123*"iran" + -0.112*"china" + 0.107*"memo" + 0.104*"committee"'),
 (3,
  '0.244*"nuclear" + 0.189*"korean" + -0.182*"voters" + -0.172*"poll" + 0.146*"missile" + 0.139*"iran" + -0.138*"convention" + 0.109*"sanctions" + -0.108*"party" + 0.108*"china"'),
 (4,
  '-0.306*"daca" + -0.249*"court" + -0.197*"immigration" + -0.185*"ban" + 0.160*"tax" + -0.147*"dreamers" + -0.144*"immigrants" + 0.140*"nuclear" + -0.130*"order" + 0.129*"korean"'),
 (5,
  '0.522*"tax" + 0.191*"rate" + 0.155*"ta

In [186]:
# Topic Model for Conservative Media
# The steps must run in this order: tokenize -> dictionary and bag-of-words
# corpus -> TF-IDF -> 10-topic LSI model.
conservative = TopicModel('Right')
conservative.edit()
conservative.create('Right')
conservative.tfidf_model()
conservative_model, conservative_model_corpus = conservative.lsi(10)
conservative_model.print_topics()

[(0,
  '0.114*"tax" + 0.098*"senate" + 0.092*"obamacare" + 0.086*"republicans" + 0.085*"house" + 0.080*"fbi" + 0.080*"democrats" + 0.077*"u" + 0.076*"campaign" + 0.073*"health"'),
 (1,
  '0.301*"tax" + 0.274*"obamacare" + 0.175*"repeal" + 0.169*"health" + -0.168*"fbi" + 0.153*"plan" + 0.147*"republicans" + 0.138*"care" + 0.134*"senate" + 0.127*"insurance"'),
 (2,
  '0.253*"fbi" + -0.215*"nuclear" + -0.174*"korean" + 0.173*"investigation" + -0.154*"china" + -0.139*"u" + -0.135*"missile" + 0.123*"intelligence" + -0.109*"military" + 0.107*"memo"'),
 (3,
  '-0.267*"tax" + -0.176*"china" + -0.176*"nuclear" + 0.169*"court" + -0.166*"korean" + 0.144*"police" + 0.121*"gun" + -0.120*"trade" + 0.118*"immigration" + -0.116*"missile"'),
 (4,
  '0.356*"tax" + -0.177*"daca" + 0.152*"police" + -0.136*"border" + -0.126*"court" + -0.125*"dreamers" + -0.125*"immigration" + -0.112*"schumer" + 0.106*"taxes" + -0.106*"obamacare"'),
 (5,
  '-0.365*"tax" + 0.247*"obamacare" + -0.169*"court" + -0.158*"daca" +