# Cleaning and preparing data for NLP

In [9]:
import re
import pickle

In [10]:
# Load the pickle files
# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
SPEECH_DATA_DIR = "/Users/dominguez/Documents/Trump-Speech-NLP/speech_data"


def _load_pickle(filename):
    """Return the object unpickled from `filename` inside SPEECH_DATA_DIR.

    The file is opened in a context manager so the handle is closed as soon
    as the load finishes, even if unpickling raises.
    """
    with open(SPEECH_DATA_DIR + "/" + filename, "rb") as pickle_file:
        return pickle.load(pickle_file)


goodyear_az = _load_pickle("goodyear_az.p")
bullhead_az = _load_pickle("bullhead_az.p")
omaha_ne = _load_pickle("omaha_ne.p")
wsalem_wi = _load_pickle("wsalem_wi.p")
lansing_mi = _load_pickle("lansing_mi.p")
martinsburg_pa = _load_pickle("martinsburg_pa.p")
lititz_pa = _load_pickle("lititz_pa.p")
allentown_pa = _load_pickle("allentown_pa.p")

In [16]:
# Bucket the loaded speeches by state; each bucket is a list of speeches
az = [
    goodyear_az,
    bullhead_az,
]
ne = [
    omaha_ne,
]
wi = [
    wsalem_wi,
]
mi = [
    lansing_mi,
]
pa = [
    martinsburg_pa,
    lititz_pa,
    allentown_pa,
]

In [61]:
# Build the full corpus by chaining the per-state groups; this yields the
# speeches in the order AZ, NE, WI, MI, PA
corpus = az + ne + wi + mi + pa

In [65]:
def clean(corpus):
    """
    Clean every speech in a corpus and merge the results into one document.

    Each speech is a list of paragraph strings. For every speech the first
    five entries are dropped, each remaining paragraph loses everything up to
    and including its first newline, and any text enclosed in square brackets
    or parentheses is removed. The paragraphs of one speech are joined with
    ',' and the cleaned speeches are then concatenated, in order, with no
    separator.

    Args:
        corpus: iterable of speeches, where each speech is a list of
            paragraph strings. The input is not modified.

    Returns:
        One string holding all cleaned speeches, whatever the number of
        speeches in the corpus. An empty corpus yields ''.
    """
    cleaned_speeches = []

    for speech in corpus:
        paragraphs = []

        # The first five entries are the transcript intro, not speech text
        for paragraph in speech[5:]:
            # Drop the header line (e.g. 'speaker (min:sec)\n') that opens
            # each paragraph; find() returns -1 when there is no newline, so
            # such a paragraph is kept whole.
            body = paragraph[paragraph.find('\n') + 1:]
            # Turn brackets into parentheses so one pattern handles both
            body = body.replace('[', '(').replace(']', ')')
            # Remove parenthesised annotations
            body = re.sub(r'\([^)]*\)', '', body)
            paragraphs.append(body)

        # Join all of the paragraphs into one speech
        cleaned_speeches.append(','.join(paragraphs))

    # Combine all of the speeches into one document, for any corpus size
    return ''.join(cleaned_speeches)

In [59]:
# Run clean() over each state's group of speeches, in AZ, NE, WI, MI, PA order
clean_az, clean_ne, clean_wi, clean_mi, clean_pa = (
    clean(state_speeches) for state_speeches in (az, ne, wi, mi, pa)
)

In [67]:
# Clean the full corpus (all loaded speeches) with clean()
clean_corpus = clean(corpus)

In [68]:
# Pickle clean_corpus; the context manager flushes and closes the file
# as soon as the dump completes
with open("/Users/dominguez/Documents/Trump-Speech-NLP/speech_data/clean_corpus.p", "wb") as corpus_file:
    pickle.dump(clean_corpus, corpus_file)

In [58]:
# Pickle the cleaned per-state corpora
SPEECH_DATA_DIR = "/Users/dominguez/Documents/Trump-Speech-NLP/speech_data"


def _dump_pickle(obj, filename):
    """Pickle `obj` to `filename` inside SPEECH_DATA_DIR.

    The file is opened in a context manager so the data is flushed and the
    handle closed as soon as the dump finishes.
    """
    with open(SPEECH_DATA_DIR + "/" + filename, "wb") as pickle_file:
        pickle.dump(obj, pickle_file)


_dump_pickle(clean_az, "clean_az.p")
_dump_pickle(clean_ne, "clean_ne.p")
_dump_pickle(clean_wi, "clean_wi.p")
_dump_pickle(clean_mi, "clean_mi.p")
_dump_pickle(clean_pa, "clean_pa.p")