In [1]:
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tensorflow.keras.utils import to_categorical
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
# Load the sentence-level dataset; the path is relative, so it is resolved
# against the notebook's working directory.
filename = 'sentence_db_candidate.csv'
df = pd.read_csv(filename)

In [3]:
df.shape

(29621, 18)

In [4]:
def preproc(sentence):
    """Lower-case a sentence and remove every ASCII punctuation character from it."""
    # Translation table that deletes each character listed in string.punctuation.
    strip_punct = str.maketrans('', '', string.punctuation)
    return sentence.lower().translate(strip_punct)

In [5]:
# Overwrite the Speech column with its normalised form (lower-cased, punctuation removed).
df['Speech'] = df['Speech'].apply(preproc)

In [6]:
# Keep only rows whose Component label is one of the three listed values.
# `.copy()` detaches the filtered frame from the one it was sliced from, so the
# column assignments in later cells do not raise pandas' SettingWithCopyWarning.
valid = ['Claim', 'Premise', 'O']
df = df.loc[df['Component'].isin(valid)].copy()

In [7]:
df.shape

(29532, 18)

In [8]:
# Binarise the component label: 'O' becomes 0.0, every other label becomes 1.0.
classes = [0.0 if component == 'O' else 1.0 for component in df.Component]

In [9]:
# Attach the binary labels; `classes` was built by iterating df.Component in row order.
df['Annotation'] = classes

In [10]:
# Keep only the text, the binary label and the split marker.
# `.copy()` makes this an independent frame, so the feature helpers below can
# add columns to it in place without triggering SettingWithCopyWarning.
df = df[['Speech', 'Annotation', 'Set']].copy()

### Add Feature: Claim Connective 

In [11]:
def add_connectives (df, speech_sents):

    """
    Add a binary ``Claim_Connective`` column to ``df`` (the frame is modified in place).

    A sentence gets 1 when it contains at least one connective from the fixed
    list below as a whole word or phrase, otherwise 0. Matching is anchored on
    word boundaries, so 'thus' does not fire on 'enthusiasm' and 'so that' does
    not fire on 'also that'. Text is matched as given (no case folding), so
    callers are expected to pass lower-cased sentences.

    :input: df: entire DataFrame
            speech_sents: iterable of sentence strings, one per row of df
    :return: df: the same DataFrame object, now carrying Claim_Connective
    """

    connectives = ['so that', 'as a result', 'therefore', 'thus', 'thereby', 'in the end', 'hence', 'accordingly', 'in this way']

    # One alternation, compiled once; the \b anchors restrict hits to whole words.
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(c) for c in connectives) + r')\b')

    df['Claim_Connective'] = [1 if pattern.search(sent) else 0 for sent in speech_sents]

    return df

In [12]:
# add_connectives assigns the new column on df itself; the returned frame is only displayed here.
add_connectives(df, df['Speech'])

Unnamed: 0,Speech,Annotation,Set,Claim_Connective
0,gwen i want to thank you and i want to thank ...,0.0,TRAIN,0
1,its a very important event and theyve done a s...,0.0,TRAIN,0
2,its important to look at all of our developmen...,0.0,TRAIN,0
3,and after 911 it became clear that we had to d...,1.0,TRAIN,0
4,and we also then finally had to stand up democ...,1.0,TRAIN,0
...,...,...,...,...
29616,and well continue to promote freedom around th...,1.0,VALIDATION,0
29617,freedom is on the march,1.0,VALIDATION,0
29618,tomorrow afghanistan will be voting for a pres...,1.0,VALIDATION,0
29619,in iraq well be having free elections and a fr...,1.0,VALIDATION,0


### Add Feature: Sentiment of a sentence 

In [13]:
def add_sentiment (df, speech_sents, analyzer=None):
    """
    Add a ``Sentiment`` column to ``df`` (modified in place) with one VADER score per sentence.

    :input: df: entire DataFrame
            speech_sents: iterable of sentence strings, one per row of df
            analyzer: optional object exposing ``polarity_scores(text) -> dict``;
                a fresh SentimentIntensityAnalyzer is created when omitted, so
                existing two-argument calls behave as before
    :return: df: the same DataFrame object, now carrying Sentiment
    """

    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    # Read the score by its key instead of by position in the result dict, and
    # keep the values as a flat list so the column is one-dimensional.
    df['Sentiment'] = [analyzer.polarity_scores(sent)['compound'] for sent in speech_sents]

    return df

In [14]:
# add_sentiment writes the Sentiment column onto df itself; the returned frame is only displayed here.
add_sentiment(df, df['Speech'])

Unnamed: 0,Speech,Annotation,Set,Claim_Connective,Sentiment
0,gwen i want to thank you and i want to thank ...,0.0,TRAIN,0,0.6808
1,its a very important event and theyve done a s...,0.0,TRAIN,0,0.7346
2,its important to look at all of our developmen...,0.0,TRAIN,0,-0.7579
3,and after 911 it became clear that we had to d...,1.0,TRAIN,0,-0.7269
4,and we also then finally had to stand up democ...,1.0,TRAIN,0,-0.7721
...,...,...,...,...,...
29616,and well continue to promote freedom around th...,1.0,VALIDATION,0,0.8360
29617,freedom is on the march,1.0,VALIDATION,0,0.6369
29618,tomorrow afghanistan will be voting for a pres...,1.0,VALIDATION,0,0.0000
29619,in iraq well be having free elections and a fr...,1.0,VALIDATION,0,0.9041


### Add Feature: NER set

In [15]:
import spacy
import collections

In [16]:
# Load the spaCy pipeline used below to extract named entities from each sentence.
ner = spacy.load("en_core_web_sm")

In [17]:
# tag text and extract tags into a list

def _entity_pairs(text):
    """Run the NER pipeline on one sentence and return its (text, label) entity pairs."""
    return [(ent.text, ent.label_) for ent in ner(text).ents]

df['ner'] = df['Speech'].apply(_entity_pairs)

In [18]:
import collections

# utils function to count the element of a list

def utils_lst_count(lst):
    """Return one single-entry ``{item: count}`` dict per distinct item, most frequent first."""
    tally = collections.Counter(lst)
    # most_common() sorts by descending count and keeps first-seen order among ties.
    return [{item: count} for item, count in tally.most_common()]

In [19]:
# collapse each sentence's entity list into per-entity counts
df['ner'] = df['ner'].apply(utils_lst_count)

In [20]:
# utils function create new column for each tag category

def utils_ner_features(lst_dics_tuples, tag):
    """
    Total the counts of all items in one sentence that carry the label ``tag``.

    :input: lst_dics_tuples: list of ``{(text, label): count}`` dicts, the shape
                produced by utils_lst_count
            tag: label to total up (e.g. an entity type or a POS tag)
    :return: sum of the counts whose key has ``tag`` as its second element;
             0 when the list is empty, holds only empty dicts, or the label
             never occurs
    """
    # Single pass over the entries: no per-entry Counter rebuild, and no
    # shadowing of the built-in names `tuple` and `type`.
    return sum(
        count
        for dic_tuples in lst_dics_tuples
        for key, count in dic_tuples.items()
        if key[1] == tag
    )

In [21]:
# extract features

# Collect every entity label seen anywhere in the corpus. Sorting fixes the
# order in which the ner_* columns are created; iterating a bare set of strings
# can yield a different order from one kernel session to the next.
tags_set = sorted({
    key[1]
    for lst in df['ner'].tolist()
    for dic in lst
    for key in dic
})

# One count column per entity label.
for feature in tags_set:
    df['ner_' + feature] = df['ner'].apply(lambda x: utils_ner_features(x, feature))

In [22]:
# The intermediate list-valued column is no longer needed once the ner_* counts exist.
df = df.drop(columns=['ner'])

### Add Feature: POS for adverbs and adjectives 

In [29]:
# Same model name as the `ner` pipeline loaded earlier; this creates a second pipeline object, used below for POS tagging.
pos = spacy.load('en_core_web_sm')

In [30]:
def _token_pos_pairs(text):
    """Run the POS pipeline on one sentence and return (token text, coarse POS tag) pairs."""
    return [(token.text, token.pos_) for token in pos(text)]

df['pos'] = df['Speech'].apply(_token_pos_pairs)

In [31]:
# collapse each sentence's (token, tag) list into per-pair counts
df['pos'] = df['pos'].apply(utils_lst_count)

In [32]:
# extract pos 
# Collect every POS tag seen anywhere in the corpus. Sorting fixes the order in
# which the pos_* columns are created; iterating a bare set of strings can
# yield a different order from one kernel session to the next.
pos_set = sorted({
    key[1]
    for lst in df['pos'].tolist()
    for dic in lst
    for key in dic
})

# One count column per POS tag.
for feature in pos_set:
    df['pos_' + feature] = df['pos'].apply(lambda x: utils_ner_features(x, feature))

In [33]:
# Keep pos_ADV and pos_ADJ only (as the authors did); every other column whose
# name contains 'pos' is removed, including the intermediate 'pos' column.
unwanted_pos = [
    col for col in df.columns
    if 'pos' in col and col not in ('pos_ADV', 'pos_ADJ')
]
df = df.drop(unwanted_pos, axis=1)

In [35]:
# Partition by the dataset's own Set marker, i.e. the same split the authors used.
split = df['Set']
df_train = df.loc[split == 'TRAIN']
df_val = df.loc[split == 'VALIDATION']
df_test = df.loc[split == 'TEST']

In [36]:
# The split marker is not a feature; remove it from both partitions.
df_train = df_train.drop(columns=['Set'])
df_test = df_test.drop(columns=['Set'])

In [37]:
# Separate the target column from the predictors for both partitions.
y_train = df_train['Annotation']
X_train = df_train.drop(columns=['Annotation'])

y_test = df_test['Annotation']
X_test = df_test.drop(columns=['Annotation'])

In [38]:
# TF-IDF over 1- to 3-grams, with the vocabulary capped at 10,000 entries (max_features).
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,3))

#tf-idf
# Fit on the training sentences only; the test sentences are transformed with that fitted vocabulary.
train_vecs =  vectorizer.fit_transform(X_train['Speech'])
test_vecs = vectorizer.transform(X_test['Speech'])

In [39]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    names = vectorizer.get_feature_names_out()
else:
    names = vectorizer.get_feature_names()

# Densify straight to a NumPy array. The previous .todense().tolist() detour
# built a nested Python list of every cell before pandas converted it back.
dense = train_vecs.toarray()
fe = pd.DataFrame(dense, columns=names)

In [40]:
# The raw text is now represented by the TF-IDF columns; keep only the numeric features.
X_train = X_train.drop(columns=['Speech'])

In [41]:
# Column-wise concatenation: the remaining X_train columns first, then the TF-IDF columns.
train_features = np.hstack([X_train, fe])

In [42]:
train_features

array([[ 0.    ,  0.6808,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.7346,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    , -0.7579,  0.    , ...,  0.    ,  0.    ,  0.    ],
       ...,
       [ 0.    , -0.5994,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    , -0.296 ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.5106,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [43]:
from sklearn.linear_model import SGDClassifier

# Linear model with hinge loss, trained by SGD. The fixed seed makes the fit,
# and therefore the report below, repeatable across runs.
clf = SGDClassifier(loss='hinge', random_state=42)
clf.fit(train_features, y_train)

SGDClassifier()

### Testing preparations

In [44]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    names = vectorizer.get_feature_names_out()
else:
    names = vectorizer.get_feature_names()

# Densify straight to a NumPy array. The previous .todense().tolist() detour
# built a nested Python list of every cell before pandas converted it back.
dense = test_vecs.toarray()
fe = pd.DataFrame(dense, columns=names)

In [45]:
# Mirror the training preparation: drop the raw text, keep only the numeric features.
X_test = X_test.drop(columns=['Speech'])

In [46]:
# Same layout as train_features: the remaining X_test columns first, then the TF-IDF columns.
test_features = np.hstack([X_test, fe])

In [47]:
#predicting 
# Labels predicted by the SGD classifier for the test partition.
y_pred_test_sgd = clf.predict(test_features)

In [48]:
#classification report on test set SGD
# Labels are the 0.0 / 1.0 values assigned earlier: 0.0 for component 'O', 1.0 otherwise.
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred_test_sgd, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.72      0.38      0.49      1880
     class 1       0.84      0.96      0.90      6575

    accuracy                           0.83      8455
   macro avg       0.78      0.67      0.69      8455
weighted avg       0.82      0.83      0.81      8455



In [55]:
# Support vector classifier with a linear kernel, fitted on the same feature matrix as the SGD model.
svm = SVC(kernel='linear', C=1, random_state=600)
svm.fit(train_features, y_train)

SVC(C=1, kernel='linear', random_state=600)

In [56]:
# Labels predicted by the linear-kernel SVC for the test partition.
y_pred_test_svm = svm.predict(test_features)

In [57]:
#classification report on test set SVM
# Same report layout as for the SGD model, so the two tables can be compared row by row.
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred_test_svm, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.71      0.38      0.49      1880
     class 1       0.84      0.96      0.90      6575

    accuracy                           0.83      8455
   macro avg       0.77      0.67      0.69      8455
weighted avg       0.81      0.83      0.81      8455



In [52]:
# Support vector classifier with an RBF kernel and C=10, fitted on the same feature matrix.
rbf = SVC(kernel='rbf', C=10, random_state=42)
rbf.fit(train_features, y_train)

SVC(C=10, random_state=42)

In [53]:
# Labels predicted by the RBF-kernel SVC for the test partition.
y_pred_test_rbf = rbf.predict(test_features)

In [54]:
#classification report on test set SVM rbf
# Same report layout as for the two linear models above.
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred_test_rbf, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.58      0.45      0.51      1880
     class 1       0.85      0.91      0.88      6575

    accuracy                           0.81      8455
   macro avg       0.72      0.68      0.69      8455
weighted avg       0.79      0.81      0.80      8455

