In [9]:
#Import the packages
import nltk
import nltk.corpus
import pickle
import re
import pandas as pd
import numpy as np

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
nltk.download("stopwords") #Import stopwords and punctuation from NLTK
from string import punctuation
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/angelanazarian/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data and form corpora

In [10]:
trump_corpus = nltk.corpus.PlaintextCorpusReader('/Users/angelanazarian/NLTK_Final_Project/Clinton-Trump-Corpus/Trump/','Trump_.*.txt')

In [11]:
clinton_corpus = nltk.corpus.PlaintextCorpusReader('/Users/angelanazarian/NLTK_Final_Project/Clinton-Trump-Corpus/Clinton/','Clinton_.*.txt')

In [24]:
trump_raw = trump_corpus.raw()
trump_words = trump_corpus.words()
trump_text = nltk.Text(trump_words)

clinton_raw = clinton_corpus.raw()
clinton_words = clinton_corpus.words()
clinton_text = nltk.Text(clinton_words)

## Clean up the data
Get rid of applause, stop words, anything between < >, punctuation " -- . , '
Write a for loop to clean up all the data 



In [13]:
#this is a function that filters out the stopwords, punctuation, and audience directions in a given corpus

def clean_up_data(x):
    filtered_for_punctuation = x
    filtered_for_punctuation = re.sub("[\(\<].*[\)\>]", "", filtered_for_punctuation)
    for punc in punctuation:
        filtered_for_punctuation = filtered_for_punctuation.replace(punc, "") #remove punctuation
    tokens = nltk.wordpunct_tokenize(filtered_for_punctuation) #tokenize text
    #filtered_for_punctuation = [word for word in tokens if word.lower() not in stopwords.words('english')] #remove stopwords
        #note that stopwords include words like very and against 
        #remove audience reactions like applause and laughter located between carrots and 
        #parentheses using regex
    filtered_for_punctuation = str(filtered_for_punctuation)
    return filtered_for_punctuation

In [14]:
#load ANEW sentiment dictionary
anew_df = pd.read_csv('https://github.com/peachypunk/NLTK_Final_Project/raw/master/ANEW2010_CSV.csv')
anew_df.head()

Unnamed: 0,Word,Wdnum,ValMn,ValSD,AroMn,AroSD,DomMn,DomSD
0,abduction,621,2.76,2.06,5.53,2.43,3.49,2.38
1,able,1041,6.74,2.0,4.3,2.17,6.83,2.04
2,abortion,622,3.5,2.3,5.39,2.8,4.59,2.54
3,absent,1042,3.69,1.72,4.73,1.76,4.35,1.87
4,absurd,623,4.26,1.82,4.36,2.2,4.73,1.72


In [25]:
#clean up both corpora using the custom function from above
clean_trump = clean_up_data(trump_raw)
clean_clinton = clean_up_data(clinton_raw)

#join the corpora together while keeping them as separate entities as part of one 
#larger corpus umbrella
collected_corpora_df = {'clean_clinton' : clean_clinton, 'clean_trump' : clean_trump}
pd.Series(collected_corpora_df)

#take the "Word" column from the anew_df and convert it into a list called "wordlist"
wordlist = anew_df["Word"].tolist()

#initialize an empty array for the summed word counts for each w in wordlist
#these will be summed across all corpora in the larger corpus as well
wordfreq_corpus = [] 

#make an empty matrix that's the size of the ANEX words and two corpora 
matrix = np.zeros((len(collected_corpora_df), len(wordlist)))
for i, cid in enumerate(collected_corpora_df): #for each corpus in the list of corpora
    this_corpus_words = collected_corpora_df[(cid)]
    for j, w in enumerate(wordlist): #for each word in the ANEW wordlist...
        count = this_corpus_words.count(w) #count how many times each word (w) occurs in the wordlist for each corpus
        matrix[i,j] = count
        
df = pd.DataFrame(matrix)
df.columns = wordlist
df.index = collected_corpora_df.keys()
print(df)

#optional: print the output to a csv file. just change the "path_or_buf" part to be where you
#want to save the file
#df.to_csv(path_or_buf='/Users/angelanazarian/nltk_output.csv', sep=',', header=True, index=True, line_terminator='\n')



               abduction  able  abortion  absent  absurd  abundance  abuse  \
clean_clinton        0.0  21.0       0.0     0.0     1.0        0.0    1.0   
clean_trump          0.0  57.0       0.0     0.0     0.0        0.0    0.0   

               accept  acceptance  access  ...   yellow  yelp  yolk  young  \
clean_clinton     2.0         1.0     2.0  ...      0.0   0.0   0.0    7.0   
clean_trump       4.0         1.0     7.0  ...      1.0   0.0   0.0    7.0   

               youth  zeal  zealous  zest  zipper  zoom  
clean_clinton    0.0   0.0      0.0   0.0     0.0   0.0  
clean_trump     10.0   0.0      0.0   0.0     0.0   0.0  

[2 rows x 2476 columns]
