In [39]:
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import pandas as pd

In [41]:
data = pd.read_csv('../../tales_emotion.tsv', sep='\t', header=0)
print("Instances before removing classes: ", len(data))
emotions = ["fear", "anger","disgust","sadness","joy"]
data = data[data.emotion_label != "surprise"]
data = data[data.emotion_label != "noemo"]
print("Instanecs after removing classes: ",len(data))

Instances before removing classes:  13693
Instanecs after removing classes:  4263


In [42]:
label_data = data["emotion_label"]
text_data = data["text"]
assert(len(label_data) == len(text_data))

In [43]:
tokenized_text_data = [word_tokenize(x) for x in text_data]

In [44]:
print(text_data[1])
print(label_data[1])
print(tokenized_text_data[1])
labels = set(label_data)

Listen to the story of Jemima Puddle-duck, who was annoyed because the farmer's wife would not let her hatch her own eggs.
anger
['Her', 'sister-in-law', ',', 'Mrs.', 'Rebeccah', 'Puddle-duck', ',', 'was', 'perfectly', 'willing', 'to', 'leave', 'the', 'hatching', 'to', 'someone', 'else', '--', '``', 'I', 'have', 'not', 'the', 'patience', 'to', 'sit', 'on', 'a', 'nest', 'for', 'twenty-eight', 'days', ';', 'and', 'no', 'more', 'have', 'you', ',', 'Jemima', '.']


In [45]:
NRC = pd.read_csv('../../NRCLexicon.csv', sep='\t', header=0)
newNRC = NRC.groupby(['word']).agg(lambda x: tuple(x)).applymap(list).reset_index() #because words have multiple emotions, want list of emotions per word
wordAssociations = dict(zip(newNRC['word'], newNRC['emotion'])) # this is the database that you want to use for feature extraction

## Features
Which features do we want to extract?

    - count of emotions associated with words in text 
    
        [fear, anger, trust, sadness, surprise, joy, anticipation, disgust] --> 8
    
    - count of valence 
    
        [positive, negative]  --> 2
        
    - length of sentence (in words)  --> 1
    
    - length of sentence (in chars)  --> 1
    
    - count of punctuation
    
        [',', '.', '?', '!', '#', '-', ';', ':']  --> 8
    
    - tense
    
        [future, present, past]  --> 3
        
    - negation count  --> 1

    - count of capital letters --> 1
    
    - rate of repitition of words  --> 1

In [46]:
def determine_tense(sentence): #sentence should be a list of tokens
    tagged = pos_tag(sentence)

    tense = [0,0,0]
    tense[0] = len([word for word in tagged if word[1] in ["MD", "VBC", "VBF"]])  #future
    tense[1] = len([word for word in tagged if word[1] in ["VBP", "VBZ","VBG"]])  #present
    tense[2] = len([word for word in tagged if word[1] in ["VBD", "VBN"]])        #past
    return tense

print(determine_tense(["This", "is", "a", "fine","day","."]))

[0, 1, 0]


In [47]:
def count_emotions(text): #text should be a list of tokens
    #TODO: check if each word in text (tokenized sentence) is in NRC, if so, add 1 to count for that emotion
    #should also check for valence :) so this vector will be length 10 (8 emotions, 2 valence)
    emotion_count = {"positive":0,"negative":0,"fear":0, "anger":0, "trust":0, "sadness":0, "surprise":0, "joy":0, "anticipation":0, "disgust":0
}
    for w in text:
        if w.lower() in wordAssociations.keys():
            for e in wordAssociations[w.lower()]:
                emotion_count[e] += 1
    return list(emotion_count.values())

print(count_emotions(["I","love","the","sun","on","my","smooth","face"]))

[2, 0, 0, 0, 1, 0, 1, 2, 1, 0]


In [48]:
def count_punctuation(text): #text must be list of tokens
    #TODO: count all punctuation in the text
    punctuation = [',', '.', '?', '!', '#', '-', ';', ':']
    punctuation_count = {',':0, '.':0, '?':0, '!':0, '#':0, '-':0, ';':0, ':':0}
    for w in text:
        if w in punctuation:
            punctuation_count[w] += 1
    return list(punctuation_count.values())

print(count_punctuation(["This","!", "is", "a", "fine","day","."]))

[0, 1, 0, 1, 0, 0, 0, 0]


In [49]:
def count_negation(text): #text must be list of tokens
    count = 0
    negs = ["not","never","no","none","neither", "nor"]
    for w in text:
        if w.lower() in negs:
            count += 1
    return count #returns single int

print(count_negation(["This", "is","not", "a", "fine","day","."]))

1


In [50]:
def count_capitals(text): #takes a list of tokens
    text = ' '.join(text)
    return(sum(1 for c in text if c.isupper()))  #returns a single int

print(count_capitals(["This", "is","NOT", "a", "FINE","day","."]))

8


In [51]:
def calc_rep_rate(text):
    uniq = set(text)
    return(int(round(len(text)/len(uniq), 0)))

print(calc_rep_rate(["This", "is","not", "a", "fine","day",",","its","a", "fine","day",".",".","."]))

2


In [52]:
def create_feature_vector(text): #text must be a list of tokens
    #TODO: call all the methods that will process the text
    #count_emotions (also valence)
    #len(sentence)
    #len(sentenceChars)
    #count_punctuation
    #determine_tense
    #count_negation
    #count_capitals
    #calculate_repetition
    feat_vec = []
    feat_vec += count_emotions(text)
    #feat_vec.append(len(text))
    #feat_vec.append(len(' '.join(text)))
    feat_vec += count_punctuation(text)
    feat_vec += determine_tense(text)
    feat_vec.append(count_negation(text))
    feat_vec.append(count_capitals(text))
    feat_vec.append(calc_rep_rate(text))
    assert len(feat_vec) == 24
    return feat_vec

print(create_feature_vector(["This", "is","not", "a", "fine","day",",","its","a", "fine","day",".",".","."]))
print(create_feature_vector(["I","love","the","sun","on","my","smooth","face"]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 2]
[2, 0, 0, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1]


## Training and testing on Tales

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

dataset = []
for instance in tokenized_text_data:
    dataset.append(create_feature_vector(instance))

X = dataset
y = label_data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)  #split dataset into 80% train and 20% test

scaler = StandardScaler()
scaler.fit(X_train) #feature scaling

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

classifier = KNeighborsClassifier(n_neighbors=20)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

#print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

       anger       0.31      0.30      0.31       134
     disgust       0.20      0.03      0.05        69
        fear       0.35      0.22      0.27       135
         joy       0.55      0.82      0.66       349
     sadness       0.30      0.20      0.24       166

    accuracy                           0.46       853
   macro avg       0.34      0.31      0.31       853
weighted avg       0.40      0.46      0.41       853



## Testing on ISEAR

In [54]:
data = pd.read_csv('../../isear_emotion.tsv', sep='\t', header=0)
print("Instances before removing classes: ", len(data))
data = data[data.emotion_label != "shame"]
data = data[data.emotion_label != "guilt"]
print("Instanecs after removing classes: ",len(data))

Instances before removing classes:  7666
Instanecs after removing classes:  5477


In [55]:
label_data = data["emotion_label"]
text_data = data["text"]
assert(len(label_data) == len(text_data))
tokenized_text_data = [word_tokenize(x) for x in text_data]

In [56]:
print(text_data[1])
print(label_data[1])
print(tokenized_text_data[1])

When I was involved in a traffic accident.
fear
['When', 'I', 'was', 'involved', 'in', 'a', 'traffic', 'accident', '.']


In [57]:
dataset = []
for instance in tokenized_text_data:
    dataset.append(create_feature_vector(instance))

X = dataset
y = label_data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)  #split dataset into 80% train and 20% test

y_pred = classifier.predict(X_test)

#print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.40      0.06      0.10       208
     disgust       0.00      0.00      0.00       217
        fear       0.30      0.11      0.16       208
         joy       0.24      0.96      0.39       233
     sadness       0.29      0.09      0.13       230

    accuracy                           0.25      1096
   macro avg       0.25      0.24      0.16      1096
weighted avg       0.24      0.25      0.16      1096



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
