Import libraries

In [1]:
import pandas as pd
import nltk, csv, collections
nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.svm import LinearSVC
from sklearn.metrics import precision_recall_fscore_support,accuracy_score
from scipy.sparse import hstack

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\psheth5\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Load data

In [2]:
# # load data from Google drive
# from google.colab import drive
# drive.mount('/content/drive') # mount drive

# path to the datasets on Google drive (training and test subsets of the LGBT and migrants datasets)
lgbt_test = "C:\\Users\\psheth5\\Downloads\\Stylometric-emotion-approach (1)\\Stylometric-emotion-approach\\lgbt-test.csv"
lgbt_train = "C:\\Users\\psheth5\\Downloads\\Stylometric-emotion-approach (1)\\Stylometric-emotion-approach\\lgbt-train.csv"
migrants_test = "C:\\Users\\psheth5\\Downloads\\Stylometric-emotion-approach (1)\\Stylometric-emotion-approach\\migrants-train.csv"
migrants_train = "C:\\Users\\psheth5\\Downloads\\Stylometric-emotion-approach (1)\\Stylometric-emotion-approach\\migrants-test.csv"

train = pd.read_csv(migrants_train)
test = pd.read_csv(lgbt_test)
# concatenate the training and test subsets of the LGBT and migrants datasets
# train = pd.concat([pd.read_csv(lgbt_train, sep=','), pd.read_csv(migrants_train, sep=',')]).reset_index(drop=True)
# test = pd.concat([pd.read_csv(lgbt_test, sep=','), pd.read_csv(migrants_test, sep=',')]).reset_index(drop=True)

In [3]:
# display the data (note that in the preprocessing step, the texts were tokenized, lemmatized, and universal POS tags were extracted into the corresponding columns)
train.head(2)

Unnamed: 0,id,text,exact_label,label,tokens,lemmas,upos
0,10153273111682217,Why should we? It's the biggest humanitarian c...,Acceptable speech,0,Why should we ? It 's the biggest humanitarian...,why should we ? it be the biggest humanitarian...,ADV AUX PRON PUNCT PRON AUX DET ADJ ADJ NOUN A...
1,10153273119732217,these refugees adult males are cowards for not...,Background offensive,1,these refugees adult males are cowards for not...,these refuge adult male be cowards for not def...,DET NOUN ADJ NOUN AUX NOUN SCONJ ADV VERB PRON...


In [4]:
test.head(2)

Unnamed: 0,id,text,exact_label,label,tokens,lemmas,upos
0,10155108649247217,This is the problem with having a referendum o...,Acceptable speech,0,This is the problem with having a referendum o...,this be the problem with have a referendum on ...,PRON AUX DET NOUN SCONJ VERB DET NOUN ADP PRON...
1,10155108659877217,"I live in the UK, so luckily our government ha...",Acceptable speech,0,"I live in the UK , so luckily our government h...","I live in the UK , so luckily we government ha...",PRON VERB ADP DET PROPN PUNCT ADV ADV PRON NOU...


Load NRC emotion lexicon

In [5]:
# load the NRC emotion lexicon into a dictionary with emotion words and corresponding associations
# lexicon = '/content/drive/My Drive/Stylometry-approach-HS/nrc-lexicon-en.txt' # path to the NRC emotion lexicon on Google drive
lexicon = "C:\\Users\\psheth5\\Downloads\\Stylometric-emotion-approach (1)\\Stylometric-emotion-approach\\nrc-lexicon-en.txt"
emotions = {}
for line in open(lexicon).read().split('\n'):	
	emotion_word = line.split('\t')[0]
	emotion = line.split('\t')[1]
	association = line.split('\t')[2]
	if association == "1":
		if emotion_word in emotions and emotion_word in ['positive','negative','neutral']:
			emotions[emotion_word].append(emotion)
		else:
			emotions[emotion_word] = [emotion] 

list(emotions.items())[:3] # print first 3 entries

[('smut', ['negative']),
 ('expletive', ['negative']),
 ('greeting', ['surprise'])]

Features

In [6]:
# extract features as described in the paper:
# - pos_fw_emo = representation of the text through POS tags, function words, and emotion words (from this representation n-grams (n=1-3) are built, see vectorize below)
# - count = number of emotion words in a text
# - emotion_associations = emotion associations from the NRC emotion lexicon

fw_list = ['ADP', 'AUX', 'CCONJ', 'DET', 'NUM', 'PART', 'PRON', 'SCONJ'] # POS tags that correspond to function words

def get_feats_en(upos, lemmas):	
  pos_fw_emo = []
  count = 0
  emotion_associations = []
  for i, lemma in enumerate(lemmas.split()):		
    if lemma.lower() in emotions:
      pos_fw_emo.append(lemma)
      count += 1
      temp = emotions[lemma.lower()]
      temp1 = []
      if("positive" in temp):
        temp1.append("positive")
      if("negattive" in temp):
        temp1.append("negative")
      if("neutral" in temp):
        temp1.append("neutral")
      emotion_associations.append(temp1)     
    else:
      if upos.split()[i] in fw_list:
        pos_fw_emo.append(lemma)
      else:
        pos_fw_emo.append(upos.split()[i])
  emotion_associations = [emo for sublist in emotion_associations for emo in sublist]
  return pd.Series([' '.join(pos_fw_emo), count, ' '.join(emotion_associations)])

train[['pos_fw_emo', 'count', 'emotion_associations']] = train.apply(lambda x: get_feats_en(x['upos'], x['lemmas']), axis=1) 
test[['pos_fw_emo', 'count', 'emotion_associations']] = test.apply(lambda x: get_feats_en(x['upos'], x['lemmas']), axis=1) 

In [7]:
train.head(2)

Unnamed: 0,id,text,exact_label,label,tokens,lemmas,upos,pos_fw_emo,count,emotion_associations
0,10153273111682217,Why should we? It's the biggest humanitarian c...,Acceptable speech,0,Why should we ? It 's the biggest humanitarian...,why should we ? it be the biggest humanitarian...,ADV AUX PRON PUNCT PRON AUX DET ADJ ADJ NOUN A...,ADV should we PUNCT it be the ADJ humanitarian...,2,
1,10153273119732217,these refugees adult males are cowards for not...,Background offensive,1,these refugees adult males are cowards for not...,these refuge adult male be cowards for not def...,DET NOUN ADJ NOUN AUX NOUN SCONJ ADV VERB PRON...,these NOUN ADJ NOUN be NOUN for ADV defend the...,1,positive


Vectorize

In [8]:
vectorizer1 = CountVectorizer(tokenizer=lambda x: x.split(), analyzer='word', ngram_range=(1, 3)) # to build n-grams (n=1-3) from the pos_fw_emo representation
vectorizer2 = CountVectorizer(tokenizer=lambda x: x.split(), analyzer='word', ngram_range=(1, 1)) # unigrams of emotion associations

# combine the features
X_train = vectorizer2.fit_transform(train.emotion_associations)
X_test =  vectorizer2.transform(test.emotion_associations)

Y_train = train.label.values
Y_test = test.label.values

Classify

In [9]:
clf_svc = LinearSVC(random_state=0, C=0.1) # parameter C was selected based on grid search
clf_svc.fit(X_train, Y_train)
Y_pred = clf_svc.predict(X_test)



In [10]:
# from sklearn.linear_model import LogisticRegression

# model = LogisticRegression().fit(X_train, Y_train)
# Y_pred = model.predict(X_test)

Results

In [11]:
results = pd.DataFrame(
    [list(precision_recall_fscore_support(test.label, Y_pred, average='macro')[:3])],
    columns=['precision', 'recall', 'F1'])
results['Accuracy'] = accuracy_score(test.label, Y_pred)
results

Unnamed: 0,precision,recall,F1,Accuracy
0,0.509105,0.50623,0.500805,0.699115


In [12]:
# LGBT to MIGRANT
precision = 0.67
recall = 0.62
F1 = 0.60

# MIGRANT to LGBT
precision = 0.58
recall = 0.58
F1 = 0.58