# Descriptive Analysis (R v. D)

### Notebook goals: 
- Offer preliminary description of the dimensions of the data (democrats v. republicans)
- Provide basic insight into the contents of the data

## Set up

In [1]:
import os

from collections import Counter

%matplotlib inline

import os
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
## Parameters
to_strip = ',.\xa0:-()\';$"/?][!`Ą@Ś§¨’–“”…ï‘>&\\%˝˘*'

## Functions

In [3]:
 %run functions.ipynb

## Data Files

#### Data organization: 
* I have ordered my data in the POOLE final data folder by party, and then additionally ordered the files by date (in ascending order).
* The dimensions of the analysis span from 1916 - 2016, and they were all collected from one source, UCBS's American Presidency Project website. (https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/nomination-acceptance-speeches)
* The size of units within my analysis are roughly 4 years (although there are larger gaps in the early 1900s). All of the articles of analysis were made at a national level, and they include both conservative and liberal units.

In [4]:
repub_all = os.listdir('data/republicans/')

dem_all = os.listdir('data/democrats/')

In [5]:
sorted(repub_all)

['1916_Hughes_R.txt',
 '1920_Harding_R.txt',
 '1932_Hoover_R.txt',
 '1940_Willkie_R.txt',
 '1944_Dewey_R.txt',
 '1948_Dewey_R.txt',
 '1952_Eisenhower_R.txt',
 '1956_Eisenhower_R.txt',
 '1960_Nixon_R.txt',
 '1964_Goldwater_R.txt',
 '1968_Nixon_R.txt',
 '1972_Nixon_R.txt',
 '1976_Ford_R.txt',
 '1980_Reagan_R.txt',
 '1984_Reagan_R.txt',
 '1988_Bush_R.txt',
 '1992_Bush_R.txt',
 '1996_Dole_R.txt',
 '2000_W_Bush_R.txt',
 '2004_W_Bush_R.txt',
 '2008_Mccain_R.txt',
 '2012_Romney_R.txt',
 '2016_Trump_R.txt']

In [6]:
sorted(dem_all)

['1916_Wilson_D.txt',
 '1928_Smith_D.txt',
 '1932_Roosevelt_D.txt',
 '1936_Roosevelt_D.txt',
 '1944_Roosevelt_D.txt',
 '1948_Truman_D.txt',
 '1952_Stevenson_D.txt',
 '1956_Stevenson_D.txt',
 '1960_Kennedy_D.txt',
 '1964_Johnson_D.txt',
 '1968_Humphrey_D.txt',
 '1972_Mcgovern_D.txt',
 '1976_Carter_D.txt',
 '1980_Carter_D.txt',
 '1984_Mondale_D.txt',
 '1988_Dukakis_D.txt',
 '1992_B_Clinton_D.txt',
 '1996_B_Clinton_D.txt',
 '2000_Gore_D.txt',
 '2004_Kerry_D.txt',
 '2008_Obama_D.txt',
 '2012_Obama_D.txt',
 '2016_H_Clinton_D.txt']

## Data Descriptives

In [7]:
## open speech files 
all_speeches_r = open('data/republican_all.txt').read()
all_speeches_d = open('data/democrat_all.txt').read()

In [8]:
## how many speeches are made by each party?
print('{}\t{}\t{}'.format("repub_all", len(repub_all), "speeches"))
print('{}\t{}\t{}'.format("dem_all  ", len(dem_all), "speeches"))

repub_all	23	speeches
dem_all  	23	speeches


In [9]:
## how many words in all Republican and Democratic speeches
print('{}\t{}\t{}'.format("all_speeches_r", len(all_speeches_r), "words"))
print('{}\t{}\t{}'.format("all_speeches_d", len(all_speeches_d), "words"))

all_speeches_r	560637	words
all_speeches_d	512144	words
