# Import Dependencies

In [80]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from spacy import displacy

from sklearn.feature_extraction.text import CountVectorizer

from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full

# Load and Clean the Data

In [2]:
# Load the speeches pickle files and clean text
goodyear = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/goodyear.p","rb"))
bullhead = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/bullhead.p","rb"))
omaha = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/omaha.p","rb"))
wsalem = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/wsalem.p","rb"))
lansing = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/lansing.p","rb"))
martinsburg = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/martinsburg.p","rb"))
lititz = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/lititz.p","rb"))
allentown = pickle.load(open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/allentown.p","rb"))

def clean(speech):
    """
    Takes a speech as an argument and cleans it so it's ready to be processed into NLP Pipeline
    """
    speech = speech[5:] # Removes meaningless intro
    
    for i in range(len(speech)):
        speech[i] = speech[i][speech[i].find('\n') + 1:] # Removes 'meaningless text hear (min:sec)\n' at the beginning of each paragraph
        speech[i] = speech[i].replace('[', '(') # Replaces brackets with paranthesis
        speech[i] = speech[i].replace(']', ')')
        speech[i] = re.sub(r'\([^)]*\)', '', speech[i]) # Removes meaningless text in parantheses
    
    speech = ','.join(speech) # Join all of the paragraphs into on huge string
    
    return speech

goodyear = clean(goodyear)
bullhead = clean(bullhead)
omaha = clean(omaha)
wsalem = clean(wsalem)
lansing = clean(lansing)
martinsburg = clean(martinsburg)
lititz = clean(lititz)
allentown = clean(allentown)

# Create Doc-Term Matrix

In [66]:
# Create tokenizer

def tokenizer(speech):
    """
    Takes in a speech string, tokenizes, lemmatizes, removes stop words and punctuation
    """
    # Create our list of punctuation marks
    punctuations = '!"#$%&\'()’*+,-./:;<=>?@[\\]^_`{|}~'

    # Create our list of stopwords
    stop_words = spacy.lang.en.stop_words.STOP_WORDS

    # Tokenize speech
    nlp = spacy.load('en_core_web_sm')
    speech_tokens = nlp(speech)
    
   # Lemmatizing each token and converting each token into lowercase
    speech_tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in speech_tokens]
    
    # Removing stop words and punctuations
    speech_tokens = [word for word in doc if word not in stop_words and word not in punctuations]
    
    return speech_tokens

In [68]:
tokenizer(goodyear)

3741

In [56]:
# Tokenize our document
nlp = spacy.load('en_core_web_sm')
doc = nlp(goodyear)

In [17]:
# How many total tokens do we have?
len(doc)

11548

In [57]:
# Create our list of punctuation marks
punctuations = '!"#$%&\'()’*+,-./:;<=>?@[\\]^_`{|}~'
# Create our list of stopwords
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [58]:
# Lemmatizing each token and converting each token into lowercase
doc = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in doc]

In [59]:
# Removing stop words
doc = [word for word in doc if word not in stop_words and word not in punctuations ]

In [78]:
doc

['deliver',
 'record',
 'prosperity',
 'epic',
 'job',
 'growth',
 'safe',
 'vaccine',
 'eradicate',
 'virus',
 'china',
 'plague',
 'quickly',
 'end',
 'pandemic',
 'normal',
 'life',
 'want',
 'want',
 'normal',
 'life',
 'like',
 'seven',
 'month',
 'ago',
 'fully',
 'resume',
 'year',
 'great',
 'economic',
 'power',
 'strong',
 'happen',
 'year',
 'good',
 'economic',
 'year',
 'country',
 'history',
 'year',
 'interrupt',
 'interrupt',
 'america',
 'great',
 'vote',
 'joe',
 'biden',
 'sleepy',
 'joe',
 'vote',
 'big',
 'tax',
 'hike',
 'history',
 'man',
 'campaign',
 'fact',
 'raise',
 'taxis',
 ',lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'lock',
 'up!,they’re',
 'crush',
 'regulation',
 'crush',
 'idea',
 'want',
 'regulation',
 'administration',
 'history',
 'country',
 'regulation',
 'cause',
 'waste',
 'lot',
 'problem',
 'slash',
 'medicare',
 'social',
 'security',
 'want',
 'abolish',
 'american',
 'energy',
 'know',
 'fracke',
 'fra