# Import Dependencies

In [67]:
import re
import pickle
import string
import numpy as np
import pandas as pd

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.attrs import IS_ALPHA
from spacy.lang.en import English
from spacy import displacy, lemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Load and Pre-Process Corpus

In [7]:
# Load pickled clean_corpus
# NOTE: this is an absolute, machine-specific path; point it at your local copy
# of the pickled corpus before running the notebook on another machine.
CLEAN_CORPUS_PATH = "/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/clean_corpus.p"

# The context manager closes the file handle as soon as unpickling finishes
# (the bare open(...) form left it open until garbage collection).
# SECURITY: pickle.load can execute arbitrary code -- only load files you trust.
with open(CLEAN_CORPUS_PATH, "rb") as corpus_file:
    clean_corpus = pickle.load(corpus_file)

In [82]:
# Create tokenized corpus and preprocess text
def preprocess(corpus, nlp=None, stop_words=None):
    """
    Tokenize, lemmatize and filter every speech in a corpus.

    Each speech is run through ``nlp``. Every token is replaced by its
    lowercased, whitespace-stripped lemma; tokens whose lemma is the
    "-PRON-" placeholder fall back to their lowercased surface form.
    Only purely alphabetic tokens that are neither stop words nor one of the
    known junk tokens ('abc', 'c', 'o', 'n') are kept.

    Parameters
    ----------
    corpus : iterable of str
        The speeches to process.
    nlp : callable, optional
        Maps a string to an iterable of tokens exposing ``lemma_`` and
        ``lower_``. Defaults to ``spacy.load('en_core_web_sm')``. Pass an
        already-loaded pipeline to avoid reloading the model on every call.
    stop_words : collection of str, optional
        Lowercase words to drop. Defaults to spaCy's English ``STOP_WORDS``.

    Returns
    -------
    list of str
        One string per speech, in input order, with the surviving tokens
        joined by commas. A speech with no surviving tokens yields ''.
    """
    # Only load the spaCy model when the caller did not supply a pipeline
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')

    # Fall back to spaCy's built-in English stop word list
    if stop_words is None:
        stop_words = spacy.lang.en.stop_words.STOP_WORDS

    # Weird words in our corpus to be filtered out (set for O(1) membership)
    weird_words = {'abc', 'c', 'o', 'n'}

    new_corpus = []

    # Loop through each speech in the corpus and preprocess
    for speech in corpus:

        # Lemmatize and lowercase each token; keep the surface form where the
        # lemma is only the "-PRON-" placeholder
        lemmas = (
            token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_
            for token in nlp(speech)
        )

        # Single filtering pass. isalpha() rejects punctuation, numbers and
        # empty strings, so no separate punctuation list is needed (the old
        # `word not in punctuations` was a substring test against a string and
        # removed nothing that isalpha() does not already remove).
        kept = [
            word for word in lemmas
            if word.isalpha() and word not in stop_words and word not in weird_words
        ]

        # Join it all into one comma-separated string
        new_corpus.append(','.join(kept))

    return new_corpus

In [83]:
# Run every speech in clean_corpus through the tokenize / lemmatize / filter
# pipeline; the result holds one comma-joined token string per speech.
processed_corpus = preprocess(corpus=clean_corpus)

# Doc-Term Matrix

In [84]:
# The first document-term matrix holds unigram counts, using CountVectorizer's
# defaults plus scikit-learn's built-in English stop word list.
# (CountVectorizer is already imported in the dependencies cell at the top.)
cv1 = CountVectorizer(stop_words='english')

X = cv1.fit_transform(processed_corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(cv1, "get_feature_names_out"):
    feature_names = cv1.get_feature_names_out()
else:
    feature_names = cv1.get_feature_names()

# One row per speech, one column per vocabulary term
df = pd.DataFrame(X.toarray(), columns=feature_names)

In [85]:
# Display the document-term matrix: one row per processed speech, one column
# per vocabulary term, cell values are term counts.
df

Unnamed: 0,able,abolish,abortion,abraham,absentee,absolute,absolutely,abuse,accept,acceptance,...,york,young,youtube,yu,yuma,zero,zippo,zone,zoning,zucker
0,4,1,1,3,0,0,0,1,0,0,...,5,5,0,0,1,1,0,0,0,0
1,0,1,1,3,0,0,2,0,1,0,...,2,3,0,0,1,0,0,1,0,0
2,0,2,0,1,0,0,1,0,0,0,...,0,3,0,0,0,0,0,3,0,0
3,1,0,0,1,0,1,1,1,0,0,...,1,5,0,0,0,0,0,0,0,0
4,1,2,0,2,0,1,1,2,0,1,...,2,8,1,0,0,0,0,0,0,0
5,2,1,0,3,0,0,2,0,0,0,...,4,4,0,0,0,1,0,1,2,0
6,3,1,0,2,0,0,4,0,0,0,...,3,6,0,1,0,1,0,0,0,0
7,3,1,0,0,1,0,2,0,1,0,...,6,1,0,0,0,3,1,0,0,1
