# Import Dependencies

In [1]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Load and Pre-Process Corpus

In [2]:
# Load the pickled speech for each state (AZ, NE, WI, MI, PA)
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you produced yourself.
# TODO: point SPEECH_DATA_DIR at a project-relative location instead of a personal home directory
SPEECH_DATA_DIR = "/Users/dominguez/Documents/Trump-Speech-NLP/speech_data"


def _load_pickle(file_name):
    """Unpickle ``file_name`` from SPEECH_DATA_DIR, closing the file handle afterwards."""
    # The context manager guarantees the handle is released even if unpickling fails
    with open(SPEECH_DATA_DIR + "/" + file_name, "rb") as pickle_file:
        return pickle.load(pickle_file)


clean_az = _load_pickle("clean_az.p")
clean_ne = _load_pickle("clean_ne.p")
clean_wi = _load_pickle("clean_wi.p")
clean_mi = _load_pickle("clean_mi.p")
clean_pa = _load_pickle("clean_pa.p")

In [3]:
# Load the pickled clean_corpus
# NOTE: pickle.load can execute arbitrary code -- only unpickle files you produced yourself.
# The context manager closes the file handle as soon as the object has been read.
with open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/clean_corpus.p", "rb") as corpus_file:
    clean_corpus = pickle.load(corpus_file)

In [16]:
# Create tokenized corpus and preprocess text

# Corpus-specific filler words (plus stray fragments such as 'c', 'o', 'n') to filter out
_WEIRD_WORDS = frozenset([
    'abc', 'c', 'o', 'n', '', 'know', 'like', 'lot', 'want', 'rand', 'look', 'thing', 'day', 'come', 'leave',
    'today', 'way', 'let', 'think', 'probably', 'long', 'try', 'hear',
])

# Loaded spaCy pipelines keyed by model name, so repeated preprocess() calls reuse one instance
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def preprocess(clean_corpus):
    """
    Split a text into sentences and reduce each sentence to its filtered, lowercased lemmas.

    For every sentence found by the spaCy pipeline, each token is replaced by its
    lowercased lemma (tokens whose lemma is the "-PRON-" placeholder keep their
    lowercased surface form). Stop words, the corpus-specific filler words and every
    token that is not purely alphabetic (punctuation, numbers, empty strings) are dropped.

    Parameters
    ----------
    clean_corpus : str
        Text handed to the spaCy pipeline (a single speech or the whole corpus).

    Returns
    -------
    list of str
        One string per sentence that still has words left after filtering, in
        document order. The words of a sentence are joined with commas
        (e.g. ``'wall,big'``).
    """
    stop_words = spacy.lang.en.stop_words.STOP_WORDS

    # Reuse the cached pipeline instead of reloading the model on every call
    nlp = _get_spacy_pipeline()

    sentence_list = []
    for sentence in nlp(clean_corpus).sents:
        # Lemmatize and lowercase each token
        lemmas = (
            token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
            for token in sentence
        )

        # isalpha() rejects punctuation, numbers and empty strings in one check,
        # so no separate punctuation list is required
        kept_words = [
            word for word in lemmas
            if word.isalpha() and word not in stop_words and word not in _WEIRD_WORDS
        ]

        # Skip sentences that are empty after filtering
        if kept_words:
            sentence_list.append(','.join(kept_words))

    return sentence_list

In [10]:
# Run the full corpus through preprocess() to get one comma-joined lemma string per sentence
processed_corpus = preprocess(clean_corpus=clean_corpus)

In [11]:
# Preprocess each state's speech (AZ, NE, WI, MI, PA) in the same order as they were loaded
processed_az, processed_ne, processed_wi, processed_mi, processed_pa = (
    preprocess(state_speech)
    for state_speech in (clean_az, clean_ne, clean_wi, clean_mi, clean_pa)
)

# Doc-Term Matrix

In [12]:
# First document-term matrix: unigram counts from CountVectorizer using its built-in
# English stop-word list (every other setting is left at its default)
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer(stop_words='english')

# Rows are the preprocessed PA sentences, columns are vocabulary terms
X = cv1.fit_transform(processed_pa)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old method on older installations
if hasattr(cv1, 'get_feature_names_out'):
    feature_names = cv1.get_feature_names_out()
else:
    feature_names = cv1.get_feature_names()

df = pd.DataFrame(X.toarray(), columns=feature_names)

In [13]:
# Display the document-term matrix (the rendered output below is truncated to the first and last rows/columns)
df

Unnamed: 0,able,abolish,abraham,absentee,absolutely,accept,access,accord,accountability,accurate,...,yes,yesterday,york,young,yu,zero,zippo,zone,zoning,zucker
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3782,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3784,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3785,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
