In [9]:
# Read the raw corpus: one list entry per line of the file, with the trailing
# newline characters left in place.
documents = []
with open('emotion_data.txt', 'rt', encoding='latin') as doc_file:
    documents.extend(doc_file)

In [19]:
import numpy as np
import re
import string
import pickle

def get_emotion_cause_pairs(documents):
    """Split tagged documents into clauses and build labelled clause pairs.

    Each document line is expected to begin with an emotion tag such as
    ``<happy>``; lines without a recognised tag are recorded under the empty
    string. A line is split into clauses on punctuation, every clause is
    cleaned with ``remove_punctuation_from_clause`` and tokenised on
    whitespace, and clauses that end up empty are dropped.

    Args:
        documents: iterable of raw text lines.

    Returns:
        A 6-tuple:
        - document_clauses: list of clauses (each a list of words), all lines.
        - emotion labels: numpy array, 1 where the clause contains a word
          whose lower-cased form is found in the pickled emotion seeds.
        - cause labels: numpy array, 1 where the raw clause contains the
          literal ``<cause>`` tag.
        - emotion_cause_pairs: list of (clause, clause) tuples built from
          every ordered pair of clauses within one line.
        - emotion_cause_pairs_label: for each pair, the line's emotion when
          the first clause is an emotion clause and the second a cause
          clause, otherwise the string "None".
        - emotion_count_dict: number of lines seen per emotion.
    """
    # NOTE: pickle.load can execute arbitrary code; only load a trusted file.
    with open('emotion_seeds.pickle', 'rb') as seed_file:
        emotion_seeds = pickle.load(seed_file)

    # Opening tag -> emotion name. The tags are mutually exclusive prefixes,
    # so the lookup order does not affect the result.
    emotion_tags = (
        ("<happy>", "happy"),
        ("<sad>", "sad"),
        ("<surprise>", "surprise"),
        ("<disgust>", "disgust"),
        ("<anger>", "anger"),
        ("<shame>", "shame"),
        ("<fear>", "fear"),
    )

    document_clauses = []
    emotion_labels = []
    cause_labels = []
    emotion_count_dict = {}
    emotion_cause_pairs = []
    emotion_cause_pairs_label = []

    for line in documents:
        emotion_of_line = ""
        for tag, emotion_name in emotion_tags:
            if line.startswith(tag):
                emotion_of_line = emotion_name
                break

        emotion_count_dict[emotion_of_line] = emotion_count_dict.get(emotion_of_line, 0) + 1

        emotion_clauses = []
        cause_clauses = []
        clauses_in_line = []
        for raw_clause in re.split(r"[.,!;:\"]+", line):
            clause_words = remove_punctuation_from_clause(raw_clause).split()
            if not clause_words:
                continue

            document_clauses.append(clause_words)
            clauses_in_line.append(clause_words)

            # The cause tag is looked up in the raw clause because cleaning
            # strips all tags.
            is_cause = "<cause>" in raw_clause
            cause_labels.append(1 if is_cause else 0)
            if is_cause:
                cause_clauses.append(clause_words)

            has_seed = any(word.lower() in emotion_seeds for word in clause_words)
            emotion_labels.append(1 if has_seed else 0)
            if has_seed:
                emotion_clauses.append(clause_words)

        # Every ordered pair of clauses in the line becomes a candidate.
        # Membership is by value (word-list equality), not by position.
        # A clause paired with itself is kept only when it is a positive pair.
        for m, first_clause in enumerate(clauses_in_line):
            first_is_emotion = first_clause in emotion_clauses
            for n, second_clause in enumerate(clauses_in_line):
                if first_is_emotion and second_clause in cause_clauses:
                    pair_label = emotion_of_line
                elif m != n:
                    pair_label = "None"
                else:
                    continue
                emotion_cause_pairs.append((first_clause, second_clause))
                emotion_cause_pairs_label.append(pair_label)

    # Clause and pair containers stay plain lists (ragged word lists);
    # only the flat 0/1 label vectors are converted to numpy arrays.
    return (
        document_clauses,
        np.array(emotion_labels),
        np.array(cause_labels),
        emotion_cause_pairs,
        emotion_cause_pairs_label,
        emotion_count_dict
    )

def remove_punctuation_from_clause(clause):
    """Strip markup tags, punctuation and digits from a clause.

    Args:
        clause: raw clause text, possibly containing tags such as
            ``<happy>`` or ``<cause>``.

    Returns:
        The clause with every ``<...>`` tag, every character in
        ``string.punctuation`` and every ASCII digit removed. Whitespace is
        left untouched, so removed tokens may leave double spaces behind.
    """
    # A tag body may contain neither '<' nor '>'. Excluding only '<' let the
    # greedy match run to a later stray '>' and swallow the text in between.
    clause = re.sub(r'<[^<>]+>', "", clause)
    # Deleting punctuation and digits are independent, so one table does both.
    return clause.translate(str.maketrans('', '', string.punctuation + string.digits))


In [23]:
import pandas as pd

# Run the extraction once and unpack its six results into notebook globals.
(
    document_clauses,
    emotion_labels,
    cause_labels,
    emotion_cause_pairs,
    emotion_cause_pairs_labels,
    emotion_count_dict,
) = get_emotion_cause_pairs(documents)

print("Total number of documents in the dataset: ",len(documents))
print("Total pairs of clauses",len(emotion_cause_pairs))
# A pair is an emotion-cause pair when its label is anything other than "None".
print("Total pairs of emotion-cause clauses: ",sum(1 for pair_label in emotion_cause_pairs_labels if pair_label != "None"))
print("\nEmotion wise count of sentences in the dataset \n")
# One-row table: a column per emotion, holding the number of lines seen.
pd.DataFrame(emotion_count_dict, index=[0])

Total number of documents in the dataset:  820
Total pairs of clauses 5422
Total pairs of emotion-cause clauses:  865

Emotion wise count of sentences in the dataset 



Unnamed: 0,happy,sad,surprise,disgust,anger,fear,shame
0,211,107,53,38,199,144,68


In [7]:
# print(emotion_cause_pairs)

In [25]:
from sklearn.model_selection import train_test_split
import random


def _count_labels(labels):
    """Return a dict mapping each label to its number of occurrences."""
    counts = {}
    for label in labels:
        counts[label] = counts.get(label, 0) + 1
    return counts


# Shuffle pairs and labels with two identically seeded generators so both
# lists receive the same permutation and stay aligned.
# NOTE: the shuffle is in place, so re-running this cell reshuffles lists that
# are already shuffled and produces a different split.
for _aligned_list in (emotion_cause_pairs, emotion_cause_pairs_labels):
    random.Random(56).shuffle(_aligned_list)

# Split the emotion cause pairs into Train Data(72%), Validation Data(8%) and Test Data(20%).
# Step 1: hold out 20% as test data, stratified on the pair label.
(
    emotion_cause_pairs_inter,
    emotion_cause_pairs_test,
    emotion_cause_pairs_inter_labels,
    emotion_cause_pairs_test_labels,
) = train_test_split(
    emotion_cause_pairs,
    emotion_cause_pairs_labels,
    test_size=0.20,
    stratify=emotion_cause_pairs_labels,
    random_state=10,
)

# Step 2: take 10% of the remaining 80% as validation data (8% overall).
(
    emotion_cause_pairs_train,
    emotion_cause_pairs_cv,
    emotion_cause_pairs_train_labels,
    emotion_cause_pairs_cv_labels,
) = train_test_split(
    emotion_cause_pairs_inter,
    emotion_cause_pairs_inter_labels,
    test_size=0.10,
    stratify=emotion_cause_pairs_inter_labels,
    random_state=10,
)

# Report the label distribution of each split while the labels are still strings.
emotion_count_dict_train = _count_labels(emotion_cause_pairs_train_labels)
print("Emotion wise count in train data",emotion_count_dict_train)

emotion_count_dict_cv = _count_labels(emotion_cause_pairs_cv_labels)
print("Emotion wise count in validation data",emotion_count_dict_cv)

emotion_count_dict_test = _count_labels(emotion_cause_pairs_test_labels)
print("Emotion wise count in test data",emotion_count_dict_test)

# Collapse the labels to binary: 0 for "None", 1 for any emotion.
emotion_cause_pairs_train_labels = [int(label != "None") for label in emotion_cause_pairs_train_labels]
emotion_cause_pairs_cv_labels = [int(label != "None") for label in emotion_cause_pairs_cv_labels]
emotion_cause_pairs_test_labels = [int(label != "None") for label in emotion_cause_pairs_test_labels]

Emotion wise count in train data {'None': 3280, 'happy': 161, 'anger': 151, 'fear': 113, 'disgust': 28, 'sad': 80, 'surprise': 40, 'shame': 50}
Emotion wise count in validation data {'None': 365, 'happy': 18, 'fear': 12, 'anger': 17, 'shame': 6, 'disgust': 3, 'sad': 9, 'surprise': 4}
Emotion wise count in test data {'None': 912, 'shame': 14, 'sad': 22, 'happy': 45, 'anger': 42, 'fear': 31, 'disgust': 8, 'surprise': 11}


In [9]:
# Number of labelled pairs in the train, test and validation splits (in that order).
tuple(
    len(split_labels)
    for split_labels in (
        emotion_cause_pairs_train_labels,
        emotion_cause_pairs_test_labels,
        emotion_cause_pairs_cv_labels,
    )
)

(3903, 1085, 434)

In [13]:
def _clauses_from_positive_pairs(pairs, pair_labels):
    """Collect the distinct clauses that occur in positive (label 1) pairs.

    The first clause of a positive pair is recorded as an emotion clause
    (emotion=1, cause=0) and the second as a cause clause (emotion=0,
    cause=1). A clause is added only the first time its word list is seen,
    so every clause gets exactly one of the two labels and the cause labels
    are the complement of the emotion labels.

    Returns:
        (clauses, emotion_labels, cause_labels) as three parallel lists.
    """
    clauses = []
    emotion_flags = []
    cause_flags = []
    for (emotion_clause, cause_clause), pair_label in zip(pairs, pair_labels):
        if pair_label != 1:
            continue
        if emotion_clause not in clauses:
            clauses.append(emotion_clause)
            emotion_flags.append(1)
            cause_flags.append(0)
        if cause_clause not in clauses:
            clauses.append(cause_clause)
            emotion_flags.append(0)
            cause_flags.append(1)
    return clauses, emotion_flags, cause_flags


###################################### TRAIN DATA ######################################
clauses_train, emotion_train_labels, cause_train_labels = _clauses_from_positive_pairs(
    emotion_cause_pairs_train, emotion_cause_pairs_train_labels
)

###################################### VALIDATION DATA #################################
clauses_cv, emotion_cv_labels, cause_cv_labels = _clauses_from_positive_pairs(
    emotion_cause_pairs_cv, emotion_cause_pairs_cv_labels
)

###################################### TEST DATA #######################################
clauses_test, emotion_test_labels, cause_test_labels = _clauses_from_positive_pairs(
    emotion_cause_pairs_test, emotion_cause_pairs_test_labels
)

In [14]:
# For train, test and validation (in that order): total pairs, then positive pairs.
tuple(
    figure
    for split_pairs, split_labels in (
        (emotion_cause_pairs_train, emotion_cause_pairs_train_labels),
        (emotion_cause_pairs_test, emotion_cause_pairs_test_labels),
        (emotion_cause_pairs_cv, emotion_cause_pairs_cv_labels),
    )
    for figure in (len(split_pairs), split_labels.count(1))
)

(3903, 623, 1085, 173, 434, 69)

In [15]:
# Number of distinct clauses collected for train, validation and test.
tuple(map(len, (clauses_train, clauses_cv, clauses_test)))

(1166, 136, 333)

In [17]:
def calculateF1Score(emotion_cause_pairs_test,potential_emotion_cause_pairs_test,emotion_cause_pairs_test_labels,potential_emotion_cause_pairs_test_pred):
  """Print precision, recall and F1 for predicted emotion-cause pairs.

  Args:
    emotion_cause_pairs_test: annotated pairs.
    potential_emotion_cause_pairs_test: candidate pairs that were classified.
    emotion_cause_pairs_test_labels: gold 0/1 label per annotated pair.
    potential_emotion_cause_pairs_test_pred: predicted 0/1 label per
      candidate pair (list or numpy array).

  A correct pair is a candidate predicted as 1 that equals an annotated pair
  whose gold label is 1. Each matching (annotated, candidate) combination is
  counted, so duplicated pairs are counted more than once.

  Returns:
    None; the counts and scores are printed.
  """
  # list(...) lets both plain lists and numpy arrays be counted.
  proposed_pairs = list(potential_emotion_cause_pairs_test_pred).count(1)
  annotated_pairs = list(emotion_cause_pairs_test_labels).count(1)

  correct_pairs = 0
  for gold_pair, gold_label in zip(emotion_cause_pairs_test, emotion_cause_pairs_test_labels):
    # Only positively annotated pairs can contribute a correct match.
    if gold_label != 1:
      continue
    for candidate_pair, predicted_label in zip(potential_emotion_cause_pairs_test, potential_emotion_cause_pairs_test_pred):
      if predicted_label == 1 and gold_pair == candidate_pair:
        correct_pairs+=1

  # Guard both ratios: with no proposed or no annotated pairs the score is 0
  # instead of a ZeroDivisionError.
  precision = correct_pairs/proposed_pairs if proposed_pairs > 0  else 0
  recall = correct_pairs/annotated_pairs if annotated_pairs > 0 else 0

  F1_score = 0
  if precision+recall !=0:
    F1_score = 2 * precision * recall/(precision+recall)

  print("Correct pairs found: {} Proposed pairs: {} Annotated Pairs: {}  ".format(correct_pairs,proposed_pairs,annotated_pairs))

  print("Precision:{}  Recall:{}   F1-Score:{}".format(precision,recall,F1_score))

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fold the validation clauses into the training set; model selection later
# uses cross-validation on this merged set.
# NOTE: these extends mutate the lists in place, so re-running this cell
# without rebuilding the lists first appends the validation data again.
clauses_train.extend(clauses_cv)
emotion_train_labels.extend(emotion_cv_labels)
cause_train_labels.extend(cause_cv_labels)

# Each clause is a list of words; join them back into one string per clause.
X_train_text = [" ".join(words) for words in clauses_train]
X_test_text =  [" ".join(words) for words in clauses_test]
print(len(X_train_text),len(X_test_text))
# min_df=1 keeps every term, exactly as the former integer 0 did, but is
# accepted by recent scikit-learn releases, which reject an integer min_df
# below 1.
tfidfVectorizer = TfidfVectorizer(min_df = 1)
# Fit the vocabulary and IDF weights on training text only, then reuse them
# for the test text.
X_train_vectors = tfidfVectorizer.fit_transform(X_train_text)
X_test_vectors =  tfidfVectorizer.transform(X_test_text)

1302 333


In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
import pandas as pd

# Emotion-clause classifier: logistic regression trained with SGD.
# NOTE(review): newer scikit-learn releases call this loss 'log_loss'.
logistic_classifier = SGDClassifier(loss='log', random_state=0, n_jobs=-1)

# Regularisation strengths from 1e-6 to 100, one per power of ten.
param_grid = {'alpha': [10**exponent for exponent in range(-6, 3)]}

# 7-fold cross-validated search scored by weighted F1; train scores are kept
# so over-fitting is visible in the results table.
gridSearch = GridSearchCV(
    logistic_classifier,
    param_grid,
    scoring='f1_weighted',
    cv=7,
    return_train_score=True,
)
gridSearch.fit(X_train_vectors, emotion_train_labels)

dataframe = pd.DataFrame(gridSearch.cv_results_)
dataframe.loc[:, ['param_alpha','mean_test_score','mean_train_score']]

  'precision', 'predicted', average, warn_for)


Unnamed: 0,param_alpha,mean_test_score,mean_train_score
0,1e-06,0.909326,0.99936
1,1e-05,0.921653,0.99936
2,0.0001,0.9301,0.998848
3,0.001,0.91703,0.966592
4,0.01,0.88861,0.924473
5,0.1,0.842111,0.87643
6,1.0,0.65489,0.666304
7,10.0,0.323144,0.323143
8,100.0,0.323144,0.323143


In [23]:
from sklearn.metrics import classification_report

# Score the best emotion model from the grid search on the test clauses.
emotion_pred_labels = gridSearch.best_estimator_.predict(X_test_vectors)
class_report = classification_report(
    emotion_test_labels,
    emotion_pred_labels,
    target_names=[0,1],
    output_dict=True,
)
# One column per class/average, one row per metric.
report = pd.DataFrame(class_report)
report

Unnamed: 0,0,1,micro avg,macro avg,weighted avg
precision,0.895954,0.9625,0.927928,0.929227,0.930326
recall,0.962733,0.895349,0.927928,0.929041,0.927928
f1-score,0.928144,0.927711,0.927928,0.927927,0.92792
support,161.0,172.0,333.0,333.0,333.0


In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
import pandas as pd

# Cause-clause classifier: logistic regression trained with SGD.
# NOTE(review): newer scikit-learn releases call this loss 'log_loss'.
logistic_classifier = SGDClassifier(loss='log', random_state=0, n_jobs=2)

# Regularisation strengths from 1e-6 to 100, one per power of ten.
param_grid = {'alpha': [10**exponent for exponent in range(-6, 3)]}

# 5-fold cross-validated search scored by weighted F1, keeping train scores.
# NOTE: this rebinds `gridSearch`, replacing the search object that was
# fitted on the emotion labels.
gridSearch = GridSearchCV(
    logistic_classifier,
    param_grid,
    scoring='f1_weighted',
    cv=5,
    return_train_score=True,
)
gridSearch.fit(X_train_vectors, cause_train_labels)

dataframe = pd.DataFrame(gridSearch.cv_results_)
dataframe.loc[:, ['param_alpha','mean_test_score','mean_train_score']]

  'precision', 'predicted', average, warn_for)


Unnamed: 0,param_alpha,mean_test_score,mean_train_score
0,1e-06,0.917787,0.999424
1,1e-05,0.925496,0.999424
2,0.0001,0.929334,0.998847
3,0.001,0.917811,0.967551
4,0.01,0.888519,0.925482
5,0.1,0.830669,0.874115
6,1.0,0.798126,0.838977
7,10.0,0.339358,0.339576
8,100.0,0.339358,0.339576


In [26]:
from sklearn.metrics import classification_report

# Score the best cause model from the grid search on the test clauses.
cause_pred_labels = gridSearch.best_estimator_.predict(X_test_vectors)
class_report = classification_report(
    cause_test_labels,
    cause_pred_labels,
    target_names=["non-emotion-cause","emotion-cause"],
    output_dict=True,
)
# One column per class/average, one row per metric.
report = pd.DataFrame(class_report)
report

Unnamed: 0,non-emotion-cause,emotion-cause,micro avg,macro avg,weighted avg
precision,0.9625,0.895954,0.927928,0.929227,0.930326
recall,0.895349,0.962733,0.927928,0.929041,0.927928
f1-score,0.927711,0.928144,0.927928,0.927927,0.92792
support,172.0,161.0,333.0,333.0,333.0
