# Import Dependencies

In [None]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

# Load and Pre-Process Corpus

In [None]:
# Load pickled clean_corpus
# NOTE: unpickling can execute arbitrary code; only load pickle files that
# come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/clean_goodyear.p", "rb") as corpus_file:
    clean_goodyear = pickle.load(corpus_file)

In [None]:
# Create tokenized corpus and preprocess text
def preprocess(clean_speech, nlp=None):
    """
    Split a speech into sentences and reduce each sentence to its content words.

    The speech is run through a spaCy pipeline; for every sentence in the
    resulting ``.sents``, each token is lemmatized and lowercased, and then
    stop words, corpus-specific noise tokens and every token that is not
    purely alphabetic (punctuation, numbers, whitespace, empty strings) are
    dropped.

    Parameters
    ----------
    clean_speech : str
        Text of the speech to process.
    nlp : callable, optional
        An already-loaded spaCy pipeline to reuse. When omitted,
        ``'en_core_web_sm'`` is loaded on every call (the original behaviour),
        which is slow if many speeches are processed.

    Returns
    -------
    list of str
        One entry per sentence, in order, with the surviving lemmas joined by
        ``','``. A sentence in which nothing survives yields ``''``.
    """
    # spaCy's English stop words (a set, so membership tests are O(1))
    stop_words = spacy.lang.en.stop_words.STOP_WORDS

    # Weird words in our corpus to be filtered out
    weird_words = {'abc', 'c', 'o', 'n'}

    # Tokenizer: load the model only if the caller did not supply one
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
    speech_tokens = nlp(clean_speech)

    # Establish new sentence list
    sentence_list = []

    for sentence in speech_tokens.sents:

        # Lemmatize and lowercase each token; when the lemma is the "-PRON-"
        # placeholder, keep the token's own lowercased text instead
        lemmas = [
            word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
            for word in sentence
        ]

        # Keep only alphabetic tokens that are neither stop words nor weird
        # words. isalpha() already rejects punctuation, numbers and empty
        # strings, so no separate punctuation list is needed.
        kept = [
            lemma for lemma in lemmas
            if lemma.isalpha()
            and lemma not in stop_words
            and lemma not in weird_words
        ]

        # Join all words in sentence and append to new list
        sentence_list.append(','.join(kept))

    return sentence_list

In [None]:
# Preprocess our corpus: preprocess() returns a list with one comma-joined
# string of filtered lemmas per sentence of the loaded speech
processed_goodyear = preprocess(clean_goodyear)

In [None]:
# Inspect the processed sentences (bare expression, shown as the cell output)
processed_goodyear

# Doc-Term Matrix

In [None]:
# The first document-term matrix has default Count Vectorizer values - counts of unigrams
from sklearn.feature_extraction.text import CountVectorizer

cv1 = CountVectorizer(stop_words='english')

# Each processed sentence is treated as one document (one row of the matrix)
X = cv1.fit_transform(processed_goodyear)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older installations.
if hasattr(cv1, "get_feature_names_out"):
    feature_names = cv1.get_feature_names_out()
else:
    feature_names = cv1.get_feature_names()

# Dense DataFrame: rows are sentences, columns are vocabulary terms
df = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# Inspect the document-term matrix (bare expression, shown as the cell output)
df