# Import Dependencies

In [1]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Load and Pre-Process Corpus

In [2]:
# Load pickled clean_corpus
# NOTE(review): the path below is absolute and machine-specific; consider a
# configurable data directory so the notebook runs on other machines.
# NOTE(review): pickle.load can execute arbitrary code - only unpickle files
# that come from a trusted source.
# The context manager closes the file handle once the corpus is loaded.
with open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/clean_goodyear.p", "rb") as corpus_file:
    clean_goodyear = pickle.load(corpus_file)

In [6]:
# Create tokenized corpus and preprocess text
def preprocess(speech, nlp=None, stop_words=None):
    """
    Tokenize, lemmatize and filter every speech in a corpus.

    Each speech is passed through ``nlp``; every token is replaced by its
    lowercased, stripped lemma (tokens whose lemma is "-PRON-" keep their
    lowercased surface form instead). Stop words, the corpus-specific noise
    tokens and every token that is not purely alphabetic (punctuation,
    numbers, whitespace) are discarded.

    Parameters
    ----------
    speech : iterable of str, or str
        The corpus, one string per speech. A single string is treated as a
        one-speech corpus rather than being iterated character by character.
    nlp : callable, optional
        Maps a string to an iterable of tokens exposing ``lemma_`` and
        ``lower_``. Defaults to ``spacy.load('en_core_web_sm')``; pass an
        already loaded pipeline to avoid reloading the model on every call.
    stop_words : collection of str, optional
        Words to discard. Defaults to spaCy's English ``STOP_WORDS``.

    Returns
    -------
    list of str
        One string per input speech, holding the surviving tokens joined
        by commas (an empty string when no token survives).
    """
    # A bare string would otherwise be looped over one character at a time
    if isinstance(speech, str):
        speech = [speech]

    # Instantiate spacy only when the caller did not supply a pipeline
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Create our list of stopwords
    if stop_words is None:
        stop_words = spacy.lang.en.stop_words.STOP_WORDS

    # Establish weird words in our corpus to be filtered out
    weird_words = {'abc', 'c', 'o', 'n'}

    # Establish new_corpus
    new_speech = []

    # Loop through each speech in the corpus and preprocess
    for sentence in speech:
        kept_tokens = []

        for token in nlp(sentence):
            # Lemmatize and lowercase; "-PRON-" is a placeholder lemma, so the
            # lowercased surface form is used for those tokens instead
            if token.lemma_ != "-PRON-":
                word = token.lemma_.lower().strip()
            else:
                word = token.lower_

            # isalpha() rejects punctuation, numbers and empty strings, so no
            # separate punctuation list is needed
            if word.isalpha() and word not in stop_words and word not in weird_words:
                kept_tokens.append(word)

        # Join it all into one string and append to new_corpus
        new_speech.append(','.join(kept_tokens))

    return new_speech

In [7]:
# Preprocess our corpus: lemmatize and filter every entry of clean_goodyear,
# yielding one comma-joined token string per entry
processed_goodyear = preprocess(clean_goodyear)

# Doc-Term Matrix

In [10]:
# The first document-term matrix has default Count Vectorizer values - counts of unigrams
# (CountVectorizer is also imported in the dependencies cell at the top)
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer(stop_words='english')

# Sparse matrix of term counts: one row per processed speech
X = cv1.fit_transform(processed_goodyear)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older installations.
if hasattr(cv1, "get_feature_names_out"):
    feature_names = cv1.get_feature_names_out()
else:
    feature_names = cv1.get_feature_names()

# Dense DataFrame view with one column per vocabulary term
df = pd.DataFrame(X.toarray(), columns=feature_names)

In [11]:
# Display the document-term matrix (one row per document, one column per vocabulary term)
df

Unnamed: 0,able,abolish,abortion,abraham,abuse,access,accomplishment,accountability,achieve,acknowledge,...,wyatt,xenophobic,year,yee,yes,yesterday,york,young,yuma,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,4,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
117,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
118,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,1,0
119,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
