# Sentiment Analysis by hand and with NLTK 

In [1]:
import nltk
import csv

In [2]:
#our two texts that we know are positive and negative
sentences = [
    'Victims of terror attacks by Isis, the far right, the IRA and other groups have formed a network that aims to defeat terrorism and support those affected by atrocities'
    ,'Most parents would go to the ends of the Earth to save their child and Rene Nel is no exception. Her daughter Linke was just 14 months old when she came down with a case of pneumonia that antibiotics could not treat. At the time of her illness, their family had been living on a rural farm in South Africa. But when the test results came back, doctors immediately told them to drive to the nearest hospital, which was about 4 hours away.'
    ,'Hamas military wing blamed Israel on Saturday for killing six of its men in an explosion in the central Gaza Strip. The Israeli military denied any involvement in the blast. Six people were killed and three others were wounded in the explosion, the Hamas-run Health Ministry said. The blast took place in Deir al-Balah, a city located in the central Gaza Strip. The cause of the blast was not immediately clear.'
    ,'These numbers show that progress is being made in Afghanistan. Destruction is easy, construction is difficult, so we have a lot of work left to do, but I am hopeful of a peaceful, bright future for Afghanistan'
    ,'New Zealand Pledges To Get All Homeless People Off The Streets In Under 4 Weeks'
]

In [3]:
#loading the AFINN mapping
lol = list(csv.reader(open('data/AFINN-111.txt', 'r'), delimiter='\t')) #load afinn into list of lists
afinn = {d[0]: int(d[1]) for d in lol} #create afinn dictionary

In [4]:
afinn

{'abandon': -2,
 'abandoned': -2,
 'abandons': -2,
 'abducted': -2,
 'abduction': -2,
 'abductions': -2,
 'abhor': -3,
 'abhorred': -3,
 'abhorrent': -3,
 'abhors': -3,
 'abilities': 2,
 'ability': 2,
 'aboard': 1,
 'absentee': -1,
 'absentees': -1,
 'absolve': 2,
 'absolved': 2,
 'absolves': 2,
 'absolving': 2,
 'absorbed': 1,
 'abuse': -3,
 'abused': -3,
 'abuses': -3,
 'abusive': -3,
 'accept': 1,
 'accepted': 1,
 'accepting': 1,
 'accepts': 1,
 'accident': -2,
 'accidental': -2,
 'accidentally': -2,
 'accidents': -2,
 'accomplish': 2,
 'accomplished': 2,
 'accomplishes': 2,
 'accusation': -2,
 'accusations': -2,
 'accuse': -2,
 'accused': -2,
 'accuses': -2,
 'accusing': -2,
 'ache': -2,
 'achievable': 1,
 'aching': -2,
 'acquit': 2,
 'acquits': 2,
 'acquitted': 2,
 'acquitting': 2,
 'acrimonious': -3,
 'active': 1,
 'adequate': 1,
 'admire': 3,
 'admired': 3,
 'admires': 3,
 'admiring': 3,
 'admit': -1,
 'admits': -1,
 'admitted': -1,
 'admonish': -2,
 'admonished': -2,
 'adopt': 

## Remove punctuation 

In [5]:
import string

def removePunctuation(sentence):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in sentence if ch not in exclude)

sentences = [removePunctuation(sentence) for sentence in sentences]
sentences[0]

'Victims of terror attacks by Isis the far right the IRA and other groups have formed a network that aims to defeat terrorism and support those affected by atrocities'

## Getting the sentiment score for the sentence

In [6]:
def afinnScore(word):
    return afinn[word.lower()] if word.lower() in afinn else 0

In [7]:
#get the afinn scores
wordList = sentences[0].split(' ')
wordList_scores = [afinnScore(word) for word in wordList]

In [8]:
#which words got scored?
#get all scores in a dictionary
scoredWords = dict(zip(wordList,wordList_scores))
#get only the ones with value != 0
scoredWords = {key: val for key, val in scoredWords.items() if val != 0}
print(scoredWords)

{'Victims': -3, 'terror': -3, 'attacks': -1, 'support': 2, 'affected': -1}


In [9]:
#get score for a sentence

def getAfinnScores(wordList):    
    scores = [afinnScore(word) for word in wordList] #repeating words are respected
    sentenceScore = sum(scores)
    
    scoredWords = dict(zip(wordList,scores))
    scoredWords = {key: val for key, val in scoredWords.items() if val != 0} #only get the scored words that matter
    return sentenceScore,scoredWords

for sentence in sentences:
    sentenceScore,scoredWords = getAfinnScores(removePunctuation(sentence).split(' '))
    print(sentenceScore)
    print(scoredWords)
    print(sentence,end='\n----------\n')

-6
{'Victims': -3, 'terror': -3, 'attacks': -1, 'support': 2, 'affected': -1}
Victims of terror attacks by Isis the far right the IRA and other groups have formed a network that aims to defeat terrorism and support those affected by atrocities
----------
-1
{'save': 2, 'no': -1, 'illness': -2}
Most parents would go to the ends of the Earth to save their child and Rene Nel is no exception Her daughter Linke was just 14 months old when she came down with a case of pneumonia that antibiotics could not treat At the time of her illness their family had been living on a rural farm in South Africa But when the test results came back doctors immediately told them to drive to the nearest hospital which was about 4 hours away
----------
-9
{'blamed': -2, 'killing': -3, 'denied': -2, 'killed': -3, 'clear': 1}
Hamas military wing blamed Israel on Saturday for killing six of its men in an explosion in the central Gaza Strip The Israeli military denied any involvement in the blast Six people were ki

## Removal of Stopwords using NLTK

In [10]:
from nltk.corpus import stopwords
nltk.download('stopwords')

def removeStopWords(sentence):
    stopwordList = stopwords.words("english")
    wordList = [word for word in sentence.split(' ') if removePunctuation(word.lower()) not in stopwordList]
    return ' '.join(wordList)

print(removePunctuation(sentences[0]), end='\n-----------\n')
print(removeStopWords(removePunctuation(sentences[0])))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tiger\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
Victims of terror attacks by Isis the far right the IRA and other groups have formed a network that aims to defeat terrorism and support those affected by atrocities
-----------
Victims terror attacks Isis far right IRA groups formed network aims defeat terrorism support affected atrocities


## Word stemming using NLTK

In [11]:
from nltk.stem import PorterStemmer

def stemWords(wordList):
    ps = PorterStemmer()
    return [ps.stem(word) for word in wordList]

for sentence in sentences:
    sentenceScore,scoredWords = getAfinnScores(stemWords(removeStopWords(removePunctuation(sentence)).split(' ')))
    print('AFINN sentiment score: {}'.format(sentenceScore))
    print(scoredWords,end='\n----------\n')

AFINN sentiment score: -8
{'victim': -3, 'terror': -3, 'attack': -1, 'support': 2}
----------
AFINN sentiment score: 0
{'save': 2, 'ill': -2}
----------
AFINN sentiment score: -7
{'blame': -2, 'kill': -3, 'clear': 1}
----------
AFINN sentiment score: 4
{'progress': 2, 'difficult': -1, 'hope': 2, 'bright': 1}
----------
AFINN sentiment score: 0
{}
----------


In [12]:
#why did the score for the negative text change change form -9 to -7? lets check the outcome of the porter stemmer
ps = PorterStemmer()
wordList = removeStopWords(removePunctuation(sentences[0])).split(' ')
[ps.stem(word) for word in wordList]

#answer: the word AFFECTED was stemmed to AFFECT which is not in the AFINN List

['victim',
 'terror',
 'attack',
 'isi',
 'far',
 'right',
 'ira',
 'group',
 'form',
 'network',
 'aim',
 'defeat',
 'terror',
 'support',
 'affect',
 'atroc']

## Full black box sentiment analysis with NLTK 

In [13]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon') #pre-trained

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Tiger\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [14]:
sid = SentimentIntensityAnalyzer()

for sentence in sentences:
    print(sid.polarity_scores(sentence))
    print(sentence, end='\n------------\n')

#source: https://opensourceforu.com/2016/12/analysing-sentiments-nltk/

{'neg': 0.429, 'neu': 0.506, 'pos': 0.065, 'compound': -0.9337}
Victims of terror attacks by Isis the far right the IRA and other groups have formed a network that aims to defeat terrorism and support those affected by atrocities
------------
{'neg': 0.083, 'neu': 0.88, 'pos': 0.037, 'compound': -0.4512}
Most parents would go to the ends of the Earth to save their child and Rene Nel is no exception Her daughter Linke was just 14 months old when she came down with a case of pneumonia that antibiotics could not treat At the time of her illness their family had been living on a rural farm in South Africa But when the test results came back doctors immediately told them to drive to the nearest hospital which was about 4 hours away
------------
{'neg': 0.208, 'neu': 0.792, 'pos': 0.0, 'compound': -0.9523}
Hamas military wing blamed Israel on Saturday for killing six of its men in an explosion in the central Gaza Strip The Israeli military denied any involvement in the blast Six people were 

## Training own classifier 

In [15]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')
  
# Step 1 – Training data
labels = ['neg','pos','neg','pos','pos']
dataset = list(zip(sentences,labels))
  
# Step 2
dictionary = set(word.lower() for passage in dataset for word in word_tokenize(passage[0]))
  
# Step 3
t = [({word: (word in word_tokenize(x[0])) for word in dictionary}, x[1]) for x in dataset]
  
# Step 4 – the classifier is trained with sample data
classifier = nltk.NaiveBayesClassifier.train(t)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tiger\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
classifier.show_most_informative_features()

Most Informative Features
                     the = True              neg : pos    =      2.2 : 1.0
                      is = False             neg : pos    =      2.2 : 1.0
                       4 = False             neg : pos    =      2.2 : 1.0
                     and = True              neg : pos    =      2.2 : 1.0
                     far = False             pos : neg    =      1.8 : 1.0
                   clear = False             pos : neg    =      1.8 : 1.0
                  people = False             pos : neg    =      1.8 : 1.0
                 support = False             pos : neg    =      1.8 : 1.0
                 located = False             pos : neg    =      1.8 : 1.0
                 wounded = False             pos : neg    =      1.8 : 1.0


In [17]:
test_data = 'Her daughter on the bright homeless future'
test_data = 'Most parents would go to the ends of Rene Nel is no exception Her daughter'
test_data_features = {word.lower(): (word in word_tokenize(test_data.lower())) for word in dictionary}

distribution = classifier.prob_classify(test_data_features)
for label in distribution.samples():
    print("%s: %f" % (label, distribution.prob(label)))

neg: 0.000000
pos: 1.000000


In [18]:
test_data_features

{'14': False,
 '4': False,
 'a': False,
 'about': False,
 'affected': False,
 'afghanistan': False,
 'africa': False,
 'aims': False,
 'albalah': False,
 'all': False,
 'am': False,
 'an': False,
 'and': False,
 'antibiotics': False,
 'any': False,
 'at': False,
 'atrocities': False,
 'attacks': False,
 'away': False,
 'back': False,
 'been': False,
 'being': False,
 'blamed': False,
 'blast': False,
 'bright': False,
 'but': False,
 'by': False,
 'came': False,
 'case': False,
 'cause': False,
 'central': False,
 'child': False,
 'city': False,
 'clear': False,
 'construction': False,
 'could': False,
 'daughter': True,
 'defeat': False,
 'deir': False,
 'denied': False,
 'destruction': False,
 'difficult': False,
 'do': False,
 'doctors': False,
 'down': False,
 'drive': False,
 'earth': False,
 'easy': False,
 'ends': True,
 'exception': True,
 'explosion': False,
 'family': False,
 'far': False,
 'farm': False,
 'for': False,
 'formed': False,
 'future': False,
 'gaza': False,
 'ge