In [1]:
import pandas as pd
import os
from string import punctuation
import numpy as np
import warnings
rand_state = 42
np.random.seed(rand_state)

warnings.simplefilter('ignore')

In [3]:
path = 'sotu'
dirs = os.listdir(path)

df = pd.DataFrame(columns=['year', 'president', 'party', 'text'])

for i in range(len(dirs)):
    components = dirs[i].split('_')
    name = components[0]
    year = components[1].split('.')[0]
    df.loc[i,'year'] = year
    df.loc[i,'president'] = name   
    
    filename = os.path.join(path, dirs[i])
    text_file = open(filename, "r")
    
    lines = text_file.read()
    lines = lines.replace('\n', ' ')
    df.loc[i, 'text'] = lines.lower()
    
df.year = df.year.astype(int) 
df.president = df.president.astype(str)
df.text = df.text.astype(str)
print('Shape: ', df.shape)

Shape:  (229, 4)


## Annotate

There are a few presidents that have the same last name (Roosevelt, Bush, Johnson and Adams), so let's clean that up now.

In [4]:
# need to distinuish between Theodore Roosevelt and Franklin D. Roosevelt
indices = df.query("president =='Roosevelt' & year <= 1909").index
df.loc[indices,'president'] = 'Theodore Roosevelt'

indices = df.query("president == 'Roosevelt'").index
df.loc[indices,'president'] = 'Franklin D. Roosevelt'

indices = df.query("president =='Bush' & year <= 1992").index
df.loc[indices,'president'] = 'George H. W. Bush'

indices = df.query("president == 'Bush'").index
df.loc[indices,'president'] = 'George W. Bush'

indices = df.query("president =='Johnson' & year <= 1869").index
df.loc[indices,'president'] = 'Andrew Johnson'

indices = df.query("president == 'Johnson'").index
df.loc[indices,'president'] = 'Lyndon B. Johnson'

indices = df.query("president =='Adams' & year <= 1801").index
df.loc[indices,'president'] = 'John Adams'

indices = df.query("president == 'Adams'").index
df.loc[indices,'president'] = 'John Quincy Adams'


indices = df.query("president =='Harrison' & year <= 1841").index
df.loc[indices,'president'] = 'William Henry Harrison'

indices = df.query("president == 'Harrison'").index
df.loc[indices,'president'] = 'Benjamin Harrison'

Now let's attach the political party to the president (note some presidents changed parties throughout their political career, I am simply listing the party they belonged to while serving as President).

In [5]:
def pres_to_party(name):
    republican = ['Lincoln', 'Grant', 'Hayes', 'Garfield', 'Arthur', 
                  'Benjamin Harrison', 'McKinley', 'Theodore Roosevelt', 
                  'Taft', 'Harding', 'Coolidge', 'Hoover', 'Eisenhower', 
                  'Nixon', 'Ford', 'Reagan', 'George H. W. Bush', 
                  'George W. Bush', 'Trump']
    if name in republican:
        return 'Republican'
    
    democratic = ['Jackson', 'Buren', 'Polk', 'Pierce', 
                  'Buchanan', 'Cleveland', 'Wilson', 'Franklin D. Roosevelt', 
                  'Truman', 'Kennedy', 'Lyndon B. Johnson', 'Carter', 'Clinton', 'Obama']
    if name in democratic:
        return 'Democratic'
    
    whig = ['William Henry Harrison', 'Taylor', 'Fillmore']
    if name in whig:
        return 'Whig'
    
    national_union = ['Andrew Johnson']
    if name in national_union:
        return 'National Union'
    
    
    unaffiliated = ['Washington', 'Tyler']
    if name in unaffiliated:
        return 'Unaffiliated'
    
    federalist = ['John Adams']
    if name in federalist:
        return 'Federalist'
    
    democratic_republican = ['Jefferson', 'Madison', 'Monroe', 'John Quincy Adams']
    if name in democratic_republican:
        return 'Democratic-Republican'
    
df.party = df.president.apply(pres_to_party)

df.set_index('year', inplace=True)
df.sort_index(inplace=True)

# need to drop George Washington's 1790 address as the file is empty
df = df.iloc[1:,:]
df.head()

Unnamed: 0_level_0,president,party,text
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1791,Washington,Unaffiliated,fellow-citizens of the senate and house of rep...
1792,Washington,Unaffiliated,fellow-citizens of the senate and house of rep...
1793,Washington,Unaffiliated,fellow-citizens of the senate and house of rep...
1794,Washington,Unaffiliated,fellow-citizens of the senate and house of rep...
1795,Washington,Unaffiliated,fellow-citizens of the senate and house of rep...


Note that the political parties are not well balanced, the Democrats and Republicans accounnt for 177 out of 227 addresses, while the other three parties made up only 22%. Therefore, it might be more interesting to predict the probability that a given text belongs to a particular political party, eg. 62% Democrat and 38% Republican.

In [6]:
df = df[df.party.isin(['Republican', 'Democratic'])]

## Tokenize  

We will begin by splitting our text (single SOTU transcript) into it's constituent sentences, and then split each sentence into words, only keeping alphanumeric characters, tag each word with it's part of speech, and then lemmatize each word (Word net lemmatizer).  

In [10]:
from nltk import sent_tokenize
sentences = [sent_tokenize(text) for text in df.text]

# remove the first and last sentences (meaningless intro/closing statements)
for i in range(len(sentences)):
    del sentences[i][0]
    del sentences[i][-1]  
    
sentence_lengths = [len(sent) for sent in sentences]
df['sentences'] = sentences
df['sentence_length'] = [len(sent) for sent in sentences]

In [11]:
from nltk import word_tokenize, sent_tokenize
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = stopwords.words('english')
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mitchellmurphy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
def tokenize(sentence):
    words = [word for word in sentence.split(' ')] # if word not in STOPWORDS]
    _s = ' '.join(words)
    _s = re.sub('[\.,\?\!]','',_s)
    _s = re.sub('\d+', '_NUMBER',_s)
    return _s

In [13]:
porter = PorterStemmer()
wnl = WordNetLemmatizer()

def penn2morphy(penntag):
    """ Converts Penn Treebank tags to WordNet. """
    morphy_tag = {'NN':'n', 'JJ':'a',
                  'VB':'v', 'RB':'r'}
    try:
        return morphy_tag[penntag[:2]]
    except:
        return 'n' 

def lemmatize_sent(text): 
    # Text input is string, returns lowercased strings.
    return [wnl.lemmatize(word.lower(), pos=penn2morphy(tag)) 
            for word, tag in pos_tag(word_tokenize(text))]

def clean_text(text):
    #text = text.lower().replace(',','')
    tokenized_text = word_tokenize(text)
    cleaned_text = [t for t in tokenized_text if t not in stop_words] # and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)
    _l = lemmatize_sent(' '.join(cleaned_text))
    # replace numbers with _NUMBER
    #_l = re.sub('\d+', '_NUMBER',' '.join(_l))
    # strip punctuation
    _l = re.sub('[\.,\?\!]','',' '.join(_l))
    _l = re.sub('[^a-z0-9]', ' ', _l)
    return _l.replace('  ',' ').strip()

In [16]:
sentences_all = []
for sentence in sentences:
    for _s in sentence:
        sentences_all.append(clean_text(_s))

In [20]:
_t = 0
for _s in sentences_all:
    if len(_s.split()) <= 1:
        _t += 1
_t/len(sentences_all)

0.012824297892755656

So 1.2\% of all sentences only contain 1 word. Not great but pretty small fraction so for now we will keep them.

Now that we have preprocessed our text, lets create target labels, merge them into a dataframe and save it for downstream analysis.

In [21]:
# we will use Democratic as the positive class
df['party_num'] = (df.party == 'Democratic').astype(int)

target = []

for i in range(df.shape[0]):
    target.append((np.ones((df.iloc[i,4],)) * int(df.iloc[i,1] == 'Republican')))
    
target = np.concatenate(target, axis=0)

df_processed = pd.DataFrame({'label': target, 'text': sentences_all})
df_processed.label = df_processed.label.astype(int)
df_processed.head()

Unnamed: 0,label,text
0,0,task devolves provision constitution present f...
1,0,communicate first time source unfeigned satisf...
2,0,turn eye nation great desire see brethren huma...
3,0,foreign relation although general character pa...
4,0,effect adjustment shall continue object earnes...


In [22]:
df_processed.to_csv('sotu_lemmatized.csv', index=False)

**Trump 2019**

In [None]:
filename = "sotu/Trump_2019.txt"
text_file = open(filename, "r")

lines = text_file.read()
lines = lines.replace('\n', ' ')
text = lines.lower()

In [None]:
from nltk import sent_tokenize
#sentences = [sent_tokenize(_t) for _t in text]
sentences = sent_tokenize(text)

In [None]:
text.count('applause') / len(sentences)

In [None]:
num_words = [len(sentence.split()) for sentence in sentences]

In [None]:
# remove applause
text = text.replace('(applause.)','')
# remove commas
text = text.replace(',','')

In [None]:
del sentences[0]
del sentences[-1]

sentences_all = []
for sentence in sentences:
    sentences_all.append(clean_text(sentence))

In [None]:
sentences_all = [_s for _s in sentences_all if len(_s.split()) > 1]

In [None]:
num_sentences = len(sentences_all)

target = np.zeros((num_sentences))

df_processed = pd.DataFrame({'label': target, 'text': sentences_all})
df_processed.label = df_processed.label.astype(int)
df_processed.head()

In [None]:
df_processed.to_csv('trump_2019_lemmatized.csv', index=False)