## Senate NLP Project - Topic Modeling
### By: Mitch Brinkman

In [1]:
from sklearn.decomposition import NMF
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import re
import pickle
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer 
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.probability import FreqDist, ConditionalFreqDist
import nltk
# Make sure the NLTK resources requested by this notebook are available
# locally (nltk.download reports packages that are already up to date).
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Allow up to 500 rows when pandas displays a table.
pd.set_option('display.max_rows', 500)
from sklearn.feature_extraction import text 

[nltk_data] Downloading package punkt to /Users/user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import pandas as pd
import re

In [3]:
from senate_func import build_edu_era_tables
from senate_func import build_hc_era_tables
from senate_func import build_fin_era_tables
from senate_func import clean_senate_speech
from senate_func import prez
from senate_func import display_topics
from senate_func import drop_columns
from senate_func import NLPProcessor
# from senate_func import find_topics

### DF Segmentation

#### DF Types & Names:
    Topic
        Education: edu_df
        Healthcare: hc_df
        Banking: fin_df
    Era
        1980-1988: (edu/hc/fin)_reagan_df
        1989-1992: (edu/hc/fin)_bush_df
        1993-2000: (edu/hc/fin)_clinton_df
        2001-2008: (edu/hc/fin)_w_bush_df
        2009-2016: (edu/hc/fin)_obama_df
    Gender
        female_df
        male_df
        edu_female_df
        hc_female_df
        fin_female_df
        edu_male_df
        hc_male_df
        fin_male_df 
    Party
        edu_dem_df
        hc_dem_df
        fin_dem_df
        edu_rep_df
        hc_rep_df
        fin_rep_df
    

In [None]:
# Build the era-specific tables for each theme and pickle them into the
# appropriate folder so later cells can load them at any time.
# Builders run in the order: financial, education, healthcare.
for build_era_tables in (build_fin_era_tables, build_edu_era_tables, build_hc_era_tables):
    build_era_tables(prez)

#### By Era - split into Themes (Edu, HC or Fin)

In [None]:
def _load_era_tables(era):
    """Load the education, healthcare and financial tables pickled for one era.

    `era` is the era suffix used in the pickle file names (e.g. 'reagan').
    Returns the three loaded tables as a tuple in (edu, hc, fin) order.
    """
    return tuple(
        pd.read_pickle('./data/pickles/era/' + theme + '_' + era + '_df.pkl')
        for theme in ('edu', 'hc', 'fin')
    )


# One (edu, hc, fin) triple per presidential era.
edu_reagan_df, hc_reagan_df, fin_reagan_df = _load_era_tables('reagan')
edu_bush_df, hc_bush_df, fin_bush_df = _load_era_tables('bush')
edu_clinton_df, hc_clinton_df, fin_clinton_df = _load_era_tables('clinton')
edu_w_bush_df, hc_w_bush_df, fin_w_bush_df = _load_era_tables('w_bush')
edu_obama_df, hc_obama_df, fin_obama_df = _load_era_tables('obama')

In [4]:
# Topic-level tables. Only the financial table is loaded here; the education
# and healthcare loads are disabled, so any cell that uses `edu_df` or `hc_df`
# needs the corresponding line below re-enabled first.
# edu_df = pd.read_pickle('./data/pickles/topic/edu_df.pkl')
# hc_df = pd.read_pickle('./data/pickles/topic/hc_df.pkl')
fin_topic_pickle = './data/pickles/topic/fin_df.pkl'
fin_df = pd.read_pickle(fin_topic_pickle)


#### Democrats by decade: 1980s, 1990s, and 2000 onward

In [None]:
# Democratic speeches for each theme, bucketed by date:
#   *_80_df: before 1990-01-01
#   *_90_df: 1990-01-01 up to (not including) 2000-01-01
#   *_00_df: 2000-01-01 onward (no upper bound)
# Lower bounds are inclusive (>=) so a speech dated exactly 1990-01-01 or
# 2000-01-01 lands in a bucket; with strict comparisons on both sides of each
# cut-off those rows were silently dropped from every table.
# NOTE(review): hc_df and edu_df must be loaded before this cell runs; their
# read_pickle lines are commented out in the topic-loading cell.
_fin_dem = fin_df[fin_df['party'] == 'D']
_hc_dem = hc_df[hc_df['party'] == 'D']
_edu_dem = edu_df[edu_df['party'] == 'D']

fin_dem_00_df = _fin_dem[_fin_dem['date'] >= '2000-01-01']
fin_dem_90_df = _fin_dem[(_fin_dem['date'] >= '1990-01-01') & (_fin_dem['date'] < '2000-01-01')]
fin_dem_80_df = _fin_dem[_fin_dem['date'] < '1990-01-01']

hc_dem_00_df = _hc_dem[_hc_dem['date'] >= '2000-01-01']
hc_dem_90_df = _hc_dem[(_hc_dem['date'] >= '1990-01-01') & (_hc_dem['date'] < '2000-01-01')]
hc_dem_80_df = _hc_dem[_hc_dem['date'] < '1990-01-01']

edu_dem_00_df = _edu_dem[_edu_dem['date'] >= '2000-01-01']
edu_dem_90_df = _edu_dem[(_edu_dem['date'] >= '1990-01-01') & (_edu_dem['date'] < '2000-01-01')]
edu_dem_80_df = _edu_dem[_edu_dem['date'] < '1990-01-01']

#### Republicans by decade: 1980s, 1990s, and 2000 onward

In [None]:
# Republican speeches for each theme, bucketed by date:
#   *_80_df: before 1990-01-01
#   *_90_df: 1990-01-01 up to (not including) 2000-01-01
#   *_00_df: 2000-01-01 onward (no upper bound)
# Fix: the party filter previously tested for 'D' (copied from the Democratic
# cell), so every *_rep_* table actually held Democratic speeches.
# NOTE(review): assumes Republicans are coded 'R' in the 'party' column,
# mirroring the 'D' code used for Democrats — confirm against the data.
# Lower bounds are inclusive (>=) so a speech dated exactly 1990-01-01 or
# 2000-01-01 is not dropped from every table.
# NOTE(review): hc_df and edu_df must be loaded before this cell runs; their
# read_pickle lines are commented out in the topic-loading cell.
_fin_rep = fin_df[fin_df['party'] == 'R']
_hc_rep = hc_df[hc_df['party'] == 'R']
_edu_rep = edu_df[edu_df['party'] == 'R']

fin_rep_00_df = _fin_rep[_fin_rep['date'] >= '2000-01-01']
fin_rep_90_df = _fin_rep[(_fin_rep['date'] >= '1990-01-01') & (_fin_rep['date'] < '2000-01-01')]
fin_rep_80_df = _fin_rep[_fin_rep['date'] < '1990-01-01']

hc_rep_00_df = _hc_rep[_hc_rep['date'] >= '2000-01-01']
hc_rep_90_df = _hc_rep[(_hc_rep['date'] >= '1990-01-01') & (_hc_rep['date'] < '2000-01-01')]
hc_rep_80_df = _hc_rep[_hc_rep['date'] < '1990-01-01']

edu_rep_00_df = _edu_rep[_edu_rep['date'] >= '2000-01-01']
edu_rep_90_df = _edu_rep[(_edu_rep['date'] >= '1990-01-01') & (_edu_rep['date'] < '2000-01-01')]
edu_rep_80_df = _edu_rep[_edu_rep['date'] < '1990-01-01']

In [None]:
# Split the Clinton-era healthcare speeches into two halves at 1995-01-01.
# The second half uses >= so a speech dated exactly 1995-01-01 is kept; with
# strict < and > it fell into neither half.
first_hc_clinton_df = hc_clinton_df[hc_clinton_df['date'] < '1995-01-01']
second_hc_clinton_df = hc_clinton_df[hc_clinton_df['date'] >= '1995-01-01']

In [5]:
# Democratic financial speeches dated before 1990-01-01; this is the corpus
# fed to the topic model in the next section.
is_democrat = fin_df['party'] == 'D'
before_1990 = fin_df['date'] < '1990-01-01'
fin_dem_80_df = fin_df[before_1990 & is_democrat]

### DTM & Topic Modeling Production

##### Each set of topics is then copied into a Word document, where it is easier to read and interpret

In [7]:
# Text pipeline: clean -> tokenize -> lemmatize -> count-vectorize.
# The vectorizer counts unigrams and bigrams, removes English stop words, and
# applies document-frequency cut-offs of max_df=.97 and min_df=.05.
count_vectorizer = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=.97,
    min_df=.05,
)
nlp = NLPProcessor(
    count_vectorizer,
    TreebankWordTokenizer().tokenize,
    clean_senate_speech,
    WordNetLemmatizer().lemmatize,
)

# Fit the vocabulary on the 1980s Democratic financial speeches, then build
# the document-term matrix from the same speeches.
fin_dem_80_speeches = fin_dem_80_df['speech']
nlp.fit(fin_dem_80_speeches)
fin_dem_80_dtm = nlp.transform(fin_dem_80_speeches)
fin_dem_80_cv = nlp.vectorizer

# Factor the matrix into 12 NMF topics and print the top 8 terms of each.
nmf_model = NMF(12)
doc_topic = nmf_model.fit_transform(fin_dem_80_dtm)
display_topics(nmf_model, fin_dem_80_cv.get_feature_names(), 8)


Topic  1
bank, wa, year, capital, loss, mr, failure, insurance

Topic  2
bank, farmer, agricultural, farm, loan, debt, problem, agricultural bank

Topic  3
bank, federal, agency, regulatory, banking, federal reserve, reserve, ha

Topic  4
loan, student, student loan, program, bond, cost, state, act

Topic  5
security, bank, commercial, market, firm, financial, glasssteagall, underwriting

Topic  6
loan, farmer, market, ha, program, year, export, capital

Topic  7
bank, title, insurance, legislation, think, moratorium, ii, problem

Topic  8
bank, company, holding, holding company, banking, bank holding, state, act

Topic  9
business, bank, country, american, ha, local, state, banking

Topic  10
banking, committee, state, new, financial, nonbank, ha, nonbank bank

Topic  11
reserve, bank, federal, money, federal reserve, tax, deposit, billion

Topic  12
bank, international, financial, banking, country, world, currency, foreign


### Code Archive - DISREGARD

In [None]:
# def build_edu_era_tables (prez_list):
#     for i in prez_list:
#             i_df = pd.read_pickle('./data/pickles/era/'+i+'_df.pkl')
#             edu_i_df = i_df.loc[(i_df['labels']=='Education')]
#             pickle.dump(edu_i_df, open("./data/pickles/era/edu_"+i+"_df.pkl", "wb"))

In [None]:
# def build_hc_era_tables (prez_list):
#     for i in prez_list:
#             i_df = pd.read_pickle('./data/pickles/era/'+i+'_df.pkl')
#             hc_i_df = i_df.loc[(i_df['labels']=='Healthcare')]
#             pickle.dump(hc_i_df, open("./data/pickles/era/hc_"+i+"_df.pkl", "wb"))

In [None]:
# def build_fin_era_tables (prez_list):
#     for i in prez_list:
#             i_df = pd.read_pickle('./data/pickles/era/'+i+'_df.pkl')
#             fin_i_df = i_df.loc[(i_df['labels']=='Financial')]
#             pickle.dump(fin_i_df, open("./data/pickles/era/fin_"+i+"_df.pkl", "wb"))

In [None]:
# def clean_senate_speech(text):
#     '''Make text lowercase, remove text in square brackets, 
#     remove punctuation and remove words containing numbers.
#     '''
#     text = re.sub('\w*\d\w*', '', text)
#     text = text.lower()
#     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
#     return text

# big_wash = lambda x: clean_senate_speech(x)

In [None]:
# def display_topics(model, feature_names, no_top_words, topic_names=None):
#     """
#     Displays the top n terms in each topic
#     """
#     for ix, topic in enumerate(model.components_):
#         if not topic_names or not topic_names[ix]:
#             print("\nTopic ", ix + 1)
#         else:
#             print("\nTopic: '",topic_names[ix],"'")
#         print(", ".join([feature_names[i]
#                         for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [None]:
# class NLPProcessor:
    
#     def __init__(self, vectorizer_class, tokenizer_function, cleaning_function,lemmer_function):
#         self.vectorizer = vectorizer_class
#         self.tokenizer = tokenizer_function
#         self.cleaning_function = cleaning_function
#         self.lemmer = lemmer_function
    
#     def fit(self, corpus_list_to_fit):
#         cleaned_corpus = list(map(self.cleaning_function, corpus_list_to_fit))
# #         print(cleaned_corpus)
#         tokenized_list = list(map(self.tokenizer, cleaned_corpus))
# #         print(tokenized_list)
#         lemmed_list = [' '.join(list(map(self.lemmer, item))) for item in tokenized_list]
# #         print(lemmed_list)
#         return self.vectorizer.fit(lemmed_list)
    
#     def transform(self, corpus_list_to_clean):
#         cleaned_corpus = list(map(self.cleaning_function, corpus_list_to_clean))
#         tokenized_list = list(map(self.tokenizer, cleaned_corpus))
#         lemmed_list = [' '.join(list(map(self.lemmer, item))) for item in tokenized_list]
#         return pd.DataFrame(self.vectorizer.transform(lemmed_list).toarray(), 
#                             columns=self.vectorizer.get_feature_names())