# n-gram Frequency Distribution Analysis (Republican vs. Democratic)

## Setup

In [2]:
import os

from collections import Counter

%matplotlib inline

import os
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
## Parameters
# Going by the name, the set of characters to strip from tokens. No cell
# visible in this notebook reads `to_strip`; presumably the helpers loaded
# from functions.ipynb use it -- TODO confirm.
# NOTE(review): entries such as Ą, Ś, ˝ and ˘ look like mis-decoded bytes
# rather than intended punctuation -- confirm the encoding of the input files.
to_strip = ',.\xa0:-()\';$"/?][!`Ą@Ś§¨’–“”…ï‘>&\\%˝˘*'

In [8]:
## Open speeches
# Load each party's combined speech file into a single string. The context
# managers close the file handles as soon as the text has been read; the
# previous bare open(...).read() calls left both handles open.
# NOTE(review): the platform default encoding is kept on purpose so the
# decoded text is unchanged -- confirm the files' real encoding before
# pinning one.
with open('data/republican_all.txt') as republican_file:
    all_speeches_r = republican_file.read()
with open('data/democrat_all.txt') as democrat_file:
    all_speeches_d = democrat_file.read()

## Functions

In [24]:
 # Run the companion notebook functions.ipynb. The cells below call tokenize,
 # get_bigram_tokens and get_ngram_tokens, none of which are defined in this
 # notebook, so they presumably come from there -- TODO confirm.
 # NOTE(review): `tokenize` is also imported from nltk in the setup cell, where
 # it is a module rather than a callable; the calls below only work if this
 # %run rebinds the name to a function.
 %run functions.ipynb

## Speech Tokens

In [25]:
# Tokenize the full text of each party's speeches with the `tokenize` helper.
republicantokens = tokenize(all_speeches_r)
democrattokens = tokenize(all_speeches_d)

# Report how many tokens each corpus produced.
republican_count = len(republicantokens)
democrat_count = len(democrattokens)
print('{} tokens in the Republican convention nominee speeches'.format(republican_count))
print('{} tokens in the Democratic convention nominee speeches'.format(democrat_count))

98055 tokens in the Republican convention nominee speeches
91071 tokens in the Democratic convention nominee speeches


## n-gram Frequencies - Republicans

In [32]:
## Split the Republican text into chunks at blank lines
# The separator '\n\n' is an empty line, not arbitrary whitespace. Presumably
# the input file puts one blank line between speeches, so each chunk is one
# speech -- TODO confirm against the data file.
r_split = all_speeches_r.split('\n\n')

In [45]:
# Frequency tables for the Republican corpus: single tokens, bigrams, trigrams.
word_freq_r, bigram_freq_r, trigram_freq_r = Counter(), Counter(), Counter()

# n-grams are built one chunk of r_split at a time, so none spans two chunks.
for speech in r_split:
    # Normalise the chunk: ': ' becomes a space, commas and periods are
    # removed, and the text is lower-cased before tokenizing.
    tokens_r = tokenize(
        speech.replace(': ', ' ').replace(',', '').replace('.', '').lower()
    )
    word_freq_r.update(tokens_r)
    bigram_freq_r.update(get_bigram_tokens(tokens_r))
    trigram_freq_r.update(get_ngram_tokens(tokens_r, n=3))

# Last expression of the cell: shows the 30 most common tokens with counts.
word_freq_r.most_common(30)

[('the', 5671),
 ('and', 3705),
 ('of', 3595),
 ('to', 3145),
 ('in', 2077),
 ('a', 1787),
 ('we', 1535),
 ('that', 1530),
 ('i', 1472),
 ('our', 1315),
 ('is', 1157),
 ('for', 1128),
 ('it', 899),
 ('have', 846),
 ('this', 809),
 ('will', 802),
 ('be', 640),
 ('not', 638),
 ('are', 595),
 ('you', 545),
 ('with', 538),
 ('but', 513),
 ('on', 502),
 ('my', 488),
 ('as', 465),
 ('america', 457),
 ('their', 424),
 ('all', 418),
 ('by', 416),
 ('people', 396)]

In [46]:
# Show the 30 most frequent Republican bigrams with their counts.
bigram_freq_r.most_common(30)

[('of the', 728),
 ('in the', 507),
 ('to the', 346),
 ('and the', 239),
 ('of our', 218),
 ('for the', 217),
 ('the world', 216),
 ('it is', 215),
 ('we have', 192),
 ('and i', 183),
 ('we will', 172),
 ('we are', 163),
 ('on the', 148),
 ('the american', 147),
 ('i have', 146),
 ('we must', 146),
 ('is the', 140),
 ('will be', 136),
 ('the united', 135),
 ('united states', 133),
 ('to be', 130),
 ('in this', 128),
 ('in our', 119),
 ('that we', 116),
 ('i am', 115),
 ('and we', 112),
 ('i will', 111),
 ('that the', 109),
 ('in a', 107),
 ('is a', 102)]

In [47]:
# Show the 30 most frequent Republican trigrams with their counts.
trigram_freq_r.most_common(30)

[('the united states', 131),
 ('in the world', 77),
 ('the american people', 76),
 ('of the united', 70),
 ('of the world', 43),
 ('president of the', 39),
 ('the republican party', 37),
 ('of the american', 34),
 ('my fellow americans', 32),
 ('are going to', 32),
 ('men and women', 31),
 ('it is the', 28),
 ('we are going', 27),
 ('i believe in', 26),
 ('there is no', 26),
 ('one of the', 26),
 ('united states of', 23),
 ('states of america', 23),
 ('i want to', 23),
 ('it is a', 22),
 ('in this country', 21),
 ('and i am', 21),
 ('and we will', 21),
 ('the federal government', 21),
 ('that we have', 21),
 ('the party of', 21),
 ('not going to', 21),
 ('and in the', 20),
 ('i believe that', 20),
 ("we're going to", 20)]

## n-gram Frequencies - Democrats

In [33]:
## Split the Democratic text into chunks at blank lines
# The separator '\n\n' is an empty line, not arbitrary whitespace. Presumably
# the input file puts one blank line between speeches, so each chunk is one
# speech -- TODO confirm against the data file.
d_split = all_speeches_d.split('\n\n')

In [40]:
# Frequency tables for the Democratic corpus: single tokens, bigrams, trigrams.
word_freq_d, bigram_freq_d, trigram_freq_d = Counter(), Counter(), Counter()

# n-grams are built one chunk of d_split at a time, so none spans two chunks.
for speech in d_split:
    # Normalise the chunk: ': ' becomes a space, commas and periods are
    # removed, and the text is lower-cased before tokenizing.
    tokens_d = tokenize(
        speech.replace(': ', ' ').replace(',', '').replace('.', '').lower()
    )
    word_freq_d.update(tokens_d)
    bigram_freq_d.update(get_bigram_tokens(tokens_d))
    trigram_freq_d.update(get_ngram_tokens(tokens_d, n=3))

# Last expression of the cell: shows the 30 most common tokens with counts.
word_freq_d.most_common(30)

[('the', 5284),
 ('and', 3622),
 ('of', 3144),
 ('to', 2881),
 ('a', 1721),
 ('in', 1703),
 ('that', 1497),
 ('we', 1365),
 ('i', 1311),
 ('for', 1128),
 ('our', 1128),
 ('is', 972),
 ('it', 770),
 ('have', 764),
 ('this', 651),
 ('you', 634),
 ('will', 591),
 ('not', 575),
 ('are', 569),
 ('be', 545),
 ('with', 509),
 ('as', 501),
 ('but', 488),
 ('on', 474),
 ('all', 466),
 ('they', 453),
 ('who', 423),
 ('my', 418),
 ('by', 418),
 ('people', 390)]

In [41]:
# Show the 30 most frequent Democratic bigrams with their counts.
bigram_freq_d.most_common(30)

[('of the', 660),
 ('in the', 457),
 ('to the', 286),
 ('and the', 242),
 ('for the', 220),
 ('we have', 206),
 ('of our', 205),
 ('and i', 181),
 ('it is', 155),
 ('on the', 154),
 ('is the', 136),
 ('to be', 134),
 ('we are', 130),
 ('we will', 129),
 ('the people', 126),
 ('the world', 119),
 ('we must', 117),
 ('that the', 115),
 ('we can', 113),
 ('by the', 112),
 ('i will', 112),
 ('of a', 111),
 ('in this', 109),
 ('the american', 102),
 ('i am', 101),
 ('that we', 100),
 ('and we', 97),
 ('the united', 96),
 ('i have', 93),
 ('with the', 93)]

In [43]:
# Show the 30 most frequent Democratic trigrams with their counts.
trigram_freq_d.most_common(30)

[('the united states', 90),
 ('of the united', 64),
 ('the american people', 63),
 ('the democratic party', 47),
 ('i want to', 46),
 ("we're going to", 41),
 ('the people of', 40),
 ('in the world', 31),
 ('my fellow americans', 31),
 ('of the world', 30),
 ('men and women', 30),
 ('president of the', 29),
 ('be able to', 28),
 ('the 21st century', 27),
 ('and we will', 24),
 ('we can do', 24),
 ('the party of', 23),
 ('this is the', 23),
 ('and i will', 23),
 ('the republican party', 21),
 ('of this country', 21),
 ('all of us', 21),
 ('bridge to the', 21),
 ('and in the', 20),
 ('in the last', 20),
 ('one of the', 20),
 ('of the democratic', 20),
 ('that is the', 20),
 ('it is time', 20),
 ('to the 21st', 20)]