In [1]:
#Import the packages
import nltk
import nltk.corpus
import pickle
import re
import pandas as pd
import numpy as np

from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
nltk.download("stopwords") #Import stopwords and punctuation from NLTK
from string import punctuation
from nltk.corpus import stopwords



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data and form corpora

In [5]:
trump_corpus = nltk.corpus.PlaintextCorpusReader('C:/Users/User/Box Sync/2017 - Spring/PSC290 Python/Hw5 - Final project plan/Cloned from Github/Clinton-Trump-Corpus/Trump/','Trump_.*.txt')

In [6]:
clinton_corpus = nltk.corpus.PlaintextCorpusReader('C:/Users/User/Box Sync/2017 - Spring/PSC290 Python/Hw5 - Final project plan/Cloned from Github/Clinton-Trump-Corpus/Clinton/','Clinton_.*.txt')

In [7]:
trump_raw = trump_corpus.raw()
trump_words = trump_corpus.words()
trump_text = nltk.Text(trump_words)

clinton_raw = clinton_corpus.raw()
clinton_words = clinton_corpus.words()
clinton_text = nltk.Text(clinton_words)

## Clean up the data
Get rid of applause, stop words, anything between < >, punctuation " -- . , '
Write a for loop to clean up all the data 



In [8]:
#this is a function that filters out the stopwords, punctuation, and audience directions in a given corpus

def clean_up_data(x):
    filtered_for_punctuation = x
    filtered_for_punctuation = re.sub("[\(\<].*[\)\>]", "", filtered_for_punctuation)
    for punc in punctuation:
        filtered_for_punctuation = filtered_for_punctuation.replace(punc, "") #remove punctuation
    tokens = nltk.wordpunct_tokenize(filtered_for_punctuation) #tokenize text
    #filtered_for_punctuation = [word for word in tokens if word.lower() not in stopwords.words('english')] #remove stopwords
        #note that stopwords include words like very and against 
        #remove audience reactions like applause and laughter located between carrots and 
        #parentheses using regex
    filtered_for_punctuation = str(filtered_for_punctuation)
    return filtered_for_punctuation

In [9]:
#load ANEW sentiment dictionary
anew_df = pd.read_csv('https://github.com/peachypunk/NLTK_Final_Project/raw/master/ANEW2010_CSV.csv')
anew_df.head()

Unnamed: 0,Word,Wdnum,ValMn,ValSD,AroMn,AroSD,DomMn,DomSD
0,abduction,621,2.76,2.06,5.53,2.43,3.49,2.38
1,able,1041,6.74,2.0,4.3,2.17,6.83,2.04
2,abortion,622,3.5,2.3,5.39,2.8,4.59,2.54
3,absent,1042,3.69,1.72,4.73,1.76,4.35,1.87
4,absurd,623,4.26,1.82,4.36,2.2,4.73,1.72


In [10]:
#clean up both corpora using the custom function from above
clean_trump = clean_up_data(trump_raw)
clean_clinton = clean_up_data(clinton_raw)

#join the corpora together while keeping them as separate entities as part of one 
#larger corpus umbrella
collected_corpora_df = {'clean_clinton' : clean_clinton, 'clean_trump' : clean_trump}
pd.Series(collected_corpora_df)

#take the "Word" column from the anew_df and convert it into a list called "wordlist"
wordlist = anew_df["Word"].tolist()

#initialize an empty array for the summed word counts for each w in wordlist
#these will be summed across all corpora in the larger corpus as well
wordfreq_corpus = [] 

#make an empty matrix that's the size of the ANEX words and two corpora 
matrix = np.zeros((len(collected_corpora_df), len(wordlist)))
for i, cid in enumerate(collected_corpora_df): #for each corpus in the list of corpora
    this_corpus_words = collected_corpora_df[(cid)]
    for j, w in enumerate(wordlist): #for each word in the ANEW wordlist...
        count = this_corpus_words.count(w) #count how many times each word (w) occurs in the wordlist for each corpus
        matrix[i,j] = count
        
df = pd.DataFrame(matrix)
df.columns = wordlist
df.index = collected_corpora_df.keys()
print(df)

#optional: print the output to a csv file. just change the "path_or_buf" part to be where you
#want to save the file
#df.to_csv(path_or_buf='C:/Users/User/Box Sync/2017 - Spring/PSC290 Python/Hw5 - Final project plan/Cloned from Github/nltk_output.csv', sep=',', header=True, index=True, line_terminator='\n')



               abduction  able  abortion  absent  absurd  abundance  abuse  \
clean_clinton        0.0  21.0       0.0     0.0     1.0        0.0    1.0   
clean_trump          0.0  57.0       0.0     0.0     0.0        0.0    0.0   

               accept  acceptance  access  ...   yellow  yelp  yolk  young  \
clean_clinton     2.0         1.0     2.0  ...      0.0   0.0   0.0    7.0   
clean_trump       4.0         1.0     7.0  ...      1.0   0.0   0.0    7.0   

               youth  zeal  zealous  zest  zipper  zoom  
clean_clinton    0.0   0.0      0.0   0.0     0.0   0.0  
clean_trump     10.0   0.0      0.0   0.0     0.0   0.0  

[2 rows x 2476 columns]


In [31]:
df_wide = df.copy() #make a copy of df and calling it "df_wide"
df_long = df_wide.transpose() #transpose "df_wide" into "df_long" format
df_long.reset_index(level=0, inplace=True) #converting the ANEW word indices into numeric indices
df_long.columns = ['Word', 'clinton_WC', 'trump_WC'] #renaming columns (WC = word count)
df_long

Unnamed: 0,Word,clinton_WC,trump_WC
0,abduction,0.0,0.0
1,able,21.0,57.0
2,abortion,0.0,0.0
3,absent,0.0,0.0
4,absurd,1.0,0.0
5,abundance,0.0,0.0
6,abuse,1.0,0.0
7,accept,2.0,4.0
8,acceptance,1.0,1.0
9,access,2.0,7.0


In [25]:
anew_sliced = anew_df[['Word','ValMn', 'AroMn', 'DomMn']] 
anew_sliced #subsetting the mean ratings from ANEW df

Unnamed: 0,Word,ValMn,AroMn,DomMn
0,abduction,2.76,5.53,3.49
1,able,6.74,4.30,6.83
2,abortion,3.50,5.39,4.59
3,absent,3.69,4.73,4.35
4,absurd,4.26,4.36,4.73
5,abundance,6.59,5.51,5.80
6,abuse,1.80,6.83,3.69
7,accept,6.80,5.53,5.41
8,acceptance,7.98,5.40,6.64
9,access,6.14,5.07,6.25


In [32]:
df_combined = pd.merge(df_long, anew_sliced)
df_combined #combining df_long with anew_sliced into one dataframe

Unnamed: 0,Word,clinton_WC,trump_WC,ValMn,AroMn,DomMn
0,abduction,0.0,0.0,2.76,5.53,3.49
1,able,21.0,57.0,6.74,4.30,6.83
2,abortion,0.0,0.0,3.50,5.39,4.59
3,absent,0.0,0.0,3.69,4.73,4.35
4,absurd,1.0,0.0,4.26,4.36,4.73
5,abundance,0.0,0.0,6.59,5.51,5.80
6,abuse,1.0,0.0,1.80,6.83,3.69
7,accept,2.0,4.0,6.80,5.53,5.41
8,acceptance,1.0,1.0,7.98,5.40,6.64
9,access,2.0,7.0,6.14,5.07,6.25


In [35]:
# WORD COUNTS x VALENCE MEANS:

# "clinton_Val" = clinton_WC x ValMn
df_combined['clinton_Val'] = df_combined.apply(lambda row: (row['clinton_WC']*row['ValMn']), axis=1)

# "trump_Val" = trump_WC x ValMn
df_combined['trump_Val'] = df_combined.apply(lambda row: (row['trump_WC']*row['ValMn']), axis=1)



# WORD COUNTS x AROUSAL MEANS:

# "clinton_Aro" = clinton_WC x AroMn
df_combined['clinton_Aro'] = df_combined.apply(lambda row: (row['clinton_WC']*row['AroMn']), axis=1)

# "trump_Aro" = trump_WC x AroMn
df_combined['trump_Aro'] = df_combined.apply(lambda row: (row['trump_WC']*row['AroMn']), axis=1)



# WORD COUNTS x DOMINANCE MEANS:

# "clinton_Dom" = clinton_WC x DomMn
df_combined['clinton_Dom'] = df_combined.apply(lambda row: (row['clinton_WC']*row['DomMn']), axis=1)

# "trump_Val" = trump_WC x ValMn
df_combined['trump_Dom'] = df_combined.apply(lambda row: (row['trump_WC']*row['DomMn']), axis=1)



df_combined

Unnamed: 0,Word,clinton_WC,trump_WC,ValMn,AroMn,DomMn,clinton_Val,trump_Val,clinton_Aro,trump_Aro,clinton_Dom,trump_Dom
0,abduction,0.0,0.0,2.76,5.53,3.49,0.00,0.00,0.00,0.00,0.00,0.00
1,able,21.0,57.0,6.74,4.30,6.83,141.54,384.18,90.30,245.10,143.43,389.31
2,abortion,0.0,0.0,3.50,5.39,4.59,0.00,0.00,0.00,0.00,0.00,0.00
3,absent,0.0,0.0,3.69,4.73,4.35,0.00,0.00,0.00,0.00,0.00,0.00
4,absurd,1.0,0.0,4.26,4.36,4.73,4.26,0.00,4.36,0.00,4.73,0.00
5,abundance,0.0,0.0,6.59,5.51,5.80,0.00,0.00,0.00,0.00,0.00,0.00
6,abuse,1.0,0.0,1.80,6.83,3.69,1.80,0.00,6.83,0.00,3.69,0.00
7,accept,2.0,4.0,6.80,5.53,5.41,13.60,27.20,11.06,22.12,10.82,21.64
8,acceptance,1.0,1.0,7.98,5.40,6.64,7.98,7.98,5.40,5.40,6.64,6.64
9,access,2.0,7.0,6.14,5.07,6.25,12.28,42.98,10.14,35.49,12.50,43.75


## Mean Valence in Clinton Corpus vs. Trump Corpus 
### (weighted mean: ValMn x Word Count) ###

In [40]:
#Sum of all values in "clinton_Val" column (summing across all words)
SUM_clinton_Val = df_combined['clinton_Val'].values.sum() 

SUM_clinton_Val 

31152.200000000001

In [42]:
#Sum of all values in "trump_Val" column (summing across all words)
SUM_trump_Val = df_combined['trump_Val'].values.sum()

SUM_trump_Val

89329.540000000008

## Mean Arousal in Clinton Corpus vs. Trump Corpus 
### (weighted mean: AroMn x Word Count) ###

In [45]:
#Sum of all values in "clinton_Aro" column (summing across all words)
SUM_clinton_Aro = df_combined['clinton_Aro'].values.sum() 

SUM_clinton_Aro 

27309.540000000001

In [46]:
#Sum of all values in "trump_Aro" column (summing across all words)
SUM_trump_Aro = df_combined['trump_Aro'].values.sum() 

SUM_trump_Aro 

79172.510000000009

## Mean Dominance in Clinton Corpus vs. Trump Corpus 
### (weighted mean: DomMn x Word Count) ###

In [43]:
#Sum of all values in "clinton_Dom" column (summing across all words)
SUM_clinton_Dom = df_combined['clinton_Dom'].values.sum() 

SUM_clinton_Dom 

29476.309999999998

In [44]:
#Sum of all values in "clinton_Dom" column (summing across all words)
SUM_trump_Dom = df_combined['trump_Dom'].values.sum() 

SUM_trump_Dom 

84429.429999999993