# NLP SENTIMENT ANALYSIS

In [1]:
import nltk
# nltk.download('punkt')
# nltk.download('words')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('brown')
# nltk.download('stopwords')
# nltk.download('vader_lexicon')
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import words, brown
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer, LancasterStemmer
import pandas as pd
from nltk.stem import WordNetLemmatizer
from pathlib import Path
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
import math

In [2]:
# Load the dataset; the path is relative to the current working directory.
df = pd.read_csv('Research/cleaneddata.csv')

# Columns of `df` that are treated as free-text responses by the functions below.
textqs = ['Q19','Q20', 'Q39', 'Q41', 'Q42', 'Q53', 'Q54']
# Values of the 'targetgroup' column used to split the rows into groups.
groups = ['White Conservative', 'White Liberal', 'Black Conservative', 'Black Non-Conservative']
# NLTK's English stopword list (needs the 'stopwords' corpus to be downloaded).
stopwords = nltk.corpus.stopwords.words("english")

# Shared analyzer instance used by compound_score (needs the 'vader_lexicon' resource).
sia = SentimentIntensityAnalyzer()

def text_groups(data, text_questions, groups):
    """Concatenate each group's free-text answers, one string per question.

    Args:
        data: DataFrame with a 'targetgroup' column and one column per entry
            in ``text_questions``.
        text_questions: Column names holding the text responses.
        groups: Values of ``data['targetgroup']`` to collect text for.

    Returns:
        A list with one ``[group, texts]`` pair per group, in the order given,
        where ``texts`` holds one space-joined string per question (same order
        as ``text_questions``). Missing answers are skipped; a question with
        no answers for a group yields ``''``.
    """
    grouped_text = []

    for group in groups:
        # Build the row mask once per group instead of once per question.
        in_group = data['targetgroup'] == group

        textblock = []
        for question in text_questions:
            answers = data.loc[in_group, question].dropna()
            # Join in plain Python rather than through the ``.str`` accessor,
            # which raises on non-string columns (e.g. an all-empty column
            # that was read as float64).
            textblock.append(' '.join(answers.astype(str)))

        grouped_text.append([group, textblock])

    return grouped_text

# print(text_groups(df, textqs, groups))

def sentiment_analysis(textblocks, question_index=0, keywords=("slavery", "racism")):
    """Print word-frequency, bigram and concordance summaries for each group.

    Args:
        textblocks: Iterable of ``[group, texts]`` pairs as produced by
            ``text_groups``; ``texts`` is a list of strings.
        question_index: Position within ``texts`` of the string to analyse.
            Defaults to 0, i.e. only the first question's text is used.
        keywords: Words whose concordance lines are printed for each group.

    Returns:
        None. All results are printed.
    """
    # Set membership is O(1); the module-level stopword list would be scanned
    # linearly for every token.
    stopword_set = set(stopwords)

    for entry in textblocks:
        group = entry[0]
        raw_text = entry[1][question_index]

        # Tokenise once: alphabetic tokens only, lower-cased. Both the
        # stopword-filtered and the unfiltered views reuse this list.
        all_tokens = [
            word.lower()
            for sent in sent_tokenize(raw_text)
            for word in word_tokenize(sent)
            if word.isalpha()
        ]
        content_tokens = [w for w in all_tokens if w not in stopword_set]

        ##Frequency Distribution
        print('\nFrequency Distribution for ' + group + 's')
        nltk.FreqDist(content_tokens).tabulate(10)

        ##Bigrams
        print('\nBigrams for ' + group + 's')
        finder = nltk.collocations.BigramCollocationFinder.from_words(content_tokens)
        finder.ngram_fd.tabulate(5)

        # Concordance uses the unfiltered tokens so the printed context
        # windows still contain stopwords.
        corpus = nltk.Text(all_tokens)
        for keyword in keywords:
            print('\nExample sentences where ' + group + 's mention ' + keyword)
            corpus.concordance(keyword, lines=5)

    return None

# Build the per-group text blocks from the loaded data and print their summaries.
sentiment_analysis(text_groups(df, textqs, groups))



Frequency Distribution for White Conservatives
  slavery    slaves abolished     still    people     civil       war     since     ended    rights 
       90        46        44        38        37        33        31        19        19        19 

Bigrams for White Conservatives
        ('civil', 'war') ('slavery', 'abolished')      ('slaves', 'freed')       ('since', 'civil')     ('slavery', 'still') 
                      25                       13                       10                        9                        8 

Example sentences where White Conservatives mention slavery
Displaying 5 of 90 matches:
there is no longer slavery of african americans in the usa it 
sure it didn t completely eliminate slavery immediately by beating the person i
here i do like it there hasn t been slavery since then we ve come a long way bu
slaves were set free so there is no slavery now slavery was outlawed by a const
set free so there is no slavery now slavery was outlawed by a constitutio

In [3]:
#Classify Sentiment for each text response

def compound_score(row):
    """Return the compound sentiment score for one text response.

    Args:
        row: A single cell value (despite the name, not a whole DataFrame row).

    Returns:
        The ``"compound"`` entry of ``sia.polarity_scores`` for ``str(row)``,
        or the string ``'NaN'`` when the value is missing.
    """
    # pd.isna covers float NaN, None and pd.NA (a plain ``row != row`` test
    # would miss None and raise on pd.NA).
    if pd.isna(row):
        # Deliberately the literal string, not float nan, so callers that
        # already rely on this marker keep seeing the same value.
        return 'NaN'
    return sia.polarity_scores(str(row))["compound"]

def SentimentScoring(data, text_questions):
    """Add a ``<question>_score`` column to ``data`` for each text question.

    Args:
        data: DataFrame containing the columns named in ``text_questions``.
            It is modified in place.
        text_questions: Column names whose values are scored with
            ``compound_score``.

    Returns:
        The same ``data`` object, with the score columns added.
    """
    for question in text_questions:
        # Read from and return the ``data`` argument (not the module-level
        # ``df``) so the function works on whichever frame is passed in.
        data[question + '_score'] = data[question].apply(compound_score)
    return data

# Score every text question; the '<question>_score' columns are added to the frame.
new_df = SentimentScoring(df, textqs)

# NOTE(review): absolute, user-specific output path, while the input CSV is
# read from a relative path — confirm this is intended.
filepath = Path('/Users/philipsurendran/Documents/Research/scored_data.csv')
# Create the output directory (and any missing parents) if it does not exist yet.
filepath.parent.mkdir(parents=True, exist_ok=True) 
# index=False keeps the DataFrame index out of the written file.
new_df.to_csv(filepath, index=False)