## Experimental Setting 4: Features + Feed Forward Neural Network 
### Task 2: Classification Claims vs Premises

In [1]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import spacy
import collections

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

import keras
import tensorflow_addons as tfa
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Activation
from tensorflow.keras.utils import to_categorical

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

1. Load SpaCy model for pos and ner tagging 

In [2]:
import spacy
import collections

nlp = spacy.load('en_core_web_sm')

2. Import the raw dataset as a dataframe

In [3]:
filename = '../../../data/sentence_db_candidate.csv'
df = pd.read_csv(filename)

In [4]:
def preproc(sentence):
    sentence = sentence.lower()
    sentence = ''.join([i for i in sentence if i not in string.punctuation])
    return sentence

In [5]:
df['Speech'] = df['Speech'].apply(preproc)
valid = ['Claim', 'Premise']
df = df.loc[(df['Component'].isin(valid))]

classes = []

for s in df.Component:
    if s == 'Claim':
        classes.append(1.0)
    else:
        classes.append(0.0)
        
df['Annotation'] = classes
df.Annotation.value_counts()
df = df[['Speech', 'Annotation', 'Set']]

3. We extract different kind of features from the dataset

In [6]:
def list_count(lst):
    
    """
    :function: count the elements of a list -- the number of words with a respective POS or NER labels in a sentence. 
    :input: lst: list of tuples, where tuple has two elements -- a word and its POS or NER label
    :return: lst_count: list of dictionaries, where
    the dictionary consists of keys -- the elements are words and their POS or NER labels
    and values -- how many times each word and its POS or NER label occurs
    If a sentence has no POS or NER labels, return an empty list 

    """
    
    dic_counter = collections.Counter()
    
    for x in lst:
        dic_counter[x] += 1
    
    dic_counter = collections.OrderedDict( 
                     sorted(dic_counter.items(), 
                     key=lambda x: x[1], reverse=True))
    
    lst_count = [{key:value} for key,value in dic_counter.items()]
    
    return lst_count


def column_tag(lst_dics_tuples, tag):
    
    """
    :function: new column for each POS or NER tag category 
    :input: lst_dics_tuples: list of dictionaries with tuples 
            tag: POS or NER label from a list
    :return: tag: new column for each POS or NER label with their counts

    """
    
    if len(lst_dics_tuples) > 0:
        tag_type = []
        
        for dic_tuples in lst_dics_tuples:
            for tuple in dic_tuples:
                type, n = tuple[1], dic_tuples[tuple]
                tag_type = tag_type + [type]*n
                dic_counter = collections.Counter()
                for x in tag_type:
                    dic_counter[x] += 1
        return dic_counter[tag]
    
    else:
        return 0

    
def add_connectives (df, speech_sents):

    """ 
    :function: add a boolean feature based on presence/absence of a claim connective from the pre-defined list 
    :input: df: entire DataFrame
            speech_sents: Series of sentences in DataFrame
    :return: df: DataFrame with a new feature Claim_Connective
    
    """
    
    connectives = ['so that', 'as a result', 'therefore', 'thus', 'thereby', 'in the end', 'hence', 'accordingly', 'in this way', 'because', 'now that', 'insofar as', 'given that', 'in response to', 'consequently', 'as a consequence']
    lst = []
    
    for sent in speech_sents:
        if any(w in sent for w in connectives):
            lst.append(1)
        else:
            lst.append(0)
    df['Claim_Connective'] = lst
    
    return df


def pos_features (df, speech_sents):
    
    """
    :function: add two new columns with two POS: adjectives and adverbs, and their counts per sentence.
    Two helper functions -- list_count, column_tag -- are needed 
    :input: df: entire DataFrame
            speech_sents: Series of sentences in DataFrame
    :return: df: new DataFrame with two new features

    """
    
    
    df['pos'] = speech_sents.apply(lambda x: [(tag.text, tag.pos_) 
                                for tag in nlp(x)])
    
    df['pos'] = df['pos'].apply(lambda x: list_count(x))
    
    #extract features
    tags_set = []

    for lst in df['pos'].tolist():
        for dic in lst:
            for k in dic.keys():
                tags_set.append(k[1])
            
    tags_set = list(set(tags_set))

    for feature in tags_set:
        df['pos_' + feature] = df['pos'].apply(lambda x: column_tag(x, feature))
        
    # keeping only adverbs and adjectives and dropping other pos
    for feature in df.columns:
        if feature != 'pos_ADV' and feature != 'pos_ADJ' and 'pos' in feature:
            df = df.drop(feature, axis=1)
    
    return df


def ner_features(df, speech_sents):
    
    """
    :function: add several new columns with NER labels, and their counts per sentence.
    Two helper functions -- list_count, column_tag -- are needed 
    :input: df: entire DataFrame
            speech_sents: Series of sentences in DataFrame
    :return: df: new DataFrame with new features for each NER label

    """
    
    df['ner'] = speech_sents.apply(lambda x: [(tag.text, tag.label_) 
                                for tag in nlp(x).ents])
    # count tags
    df['ner'] = df['ner'].apply(lambda x: list_count(x))
    
    # extract features
    tags_set = []

    for lst in df['ner'].tolist():
        for dic in lst:
            for k in dic.keys():
                tags_set.append(k[1])
            
    tags_set = list(set(tags_set))

    for feature in tags_set:
        df['ner_' + feature] = df['ner'].apply(lambda x: column_tag(x, feature))
        
    df = df.drop(['ner'], axis=1)
    
    return df 


def verbs_features (df, speech_sents):
    
    """
    :function: add several new columns with features for verb tenses and the presence of modal verbs, 
    and their counts per sentence.
    Two helper functions -- list_count, column_tag -- are needed 
    :input: df: entire DataFrame
            speech_sents: Series of sentences in DataFrame
    :return: df: new DataFrame with features for each verb tense and for modal verbs

    """
    
    df['verb_tag'] = speech_sents.apply(lambda x: [(tag.text, tag.tag_) 
                                for tag in nlp(x)])
    
    df['verb_tag'] = df['verb_tag'].apply(lambda x: list_count(x))
    
    #extract features
    verbs_set = []

    for lst in df['verb_tag'].tolist():
        for dic in lst:
            for k in dic.keys():
                verbs_set.append(k[1])
            
    verbs_set = list(set(verbs_set))

    for feature in verbs_set:
        df['verb_tag_' + feature] = df['verb_tag'].apply(lambda x: column_tag(x, feature))
    
    #out of all detailed POS tags, keeping only verbs-related ones 
    for f in df.columns:
        if f != 'verb_tag_VB' and f != 'verb_tag_VBZ' and f != 'verb_tag_VBP' and f != 'verb_tag_VBD' and f != 'verb_tag_VBN' and f != 'verb_tag_VBG' and f != 'verb_tag_MD' and 'verb_tag' in f:
            df = df.drop(f, axis=1)
    
    
    return df


def add_personal(df, speech_sents):
    """
    :function: add two boolean features based on the presence/absence of any pronoun from two given lists.
    :input: df: entire DataFrame
            speech_sents: Series of sentences in DataFrame
    :return: df: DataFrame with two new features Pronoun_Singular and Pronoun_Plural

    """

    singular = [' i ', ' me ', ' my ', ' myself ', ' mine ']
    plural = [' we ', ' our ', ' ours ', ' ourselves ']
    lst_sing = []
    lst_plur = []

    for sent in speech_sents:
        if any(w in sent for w in singular):
            lst_sing.append(1)
        else:
            lst_sing.append(0)
    df['Pronoun_Singular'] = lst_sing

    for sent in speech_sents:
        if any(w in sent for w in plural):
            lst_plur.append(1)
        else:
            lst_plur.append(0)
    df['Pronoun_Plural'] = lst_plur

    return df


def add_sentiment (df, speech_sents): 
    
    
    """ 
    :function: add a feature with a sentiment score for each sentence 
    :input: df: entire DataFrame
            speech_sents: Series of sentences in DataFrame
    :return: df: DataFrame with a new feature Sentiment
    
    """
    
    analyzer = SentimentIntensityAnalyzer()

    senti = []
    
    for sent in speech_sents:
        vs = analyzer.polarity_scores(sent)
        senti.append([list(vs.values())[3]])
    
    senti_arr = np.array(senti)
    df['Sentiment'] = senti_arr
    
    return df 

In [7]:
df = pos_features(df, df['Speech'])
df = ner_features(df, df['Speech'])
df = verbs_features(df, df['Speech'])
df = add_connectives(df, df['Speech'])
df = add_personal(df, df['Speech'])
df = add_sentiment(df, df['Speech'])

In [8]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize, RegexpParser

[nltk_data] Downloading package punkt to C:\Users\Peng
[nltk_data]     Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Peng Chen\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
def syntactic_features(df, speech_sents):
        
    """
    :function: add syntactic features -- 1) the number of productions, 2) the number of VP groups per sentence, 
    and 3) the depth of a sentence tree 
    :input: df: entire DataFrame
            speech_sents: Series of sentences in DataFrame
    :return: df: new DataFrame with three syntactic features 

    """
  
    a, b , c, d, e = [], [], [], [], []
    for x, y in enumerate(speech_sents):
        tagged = pos_tag(word_tokenize(y))
        chunker = RegexpParser(r"""
            NBAR:
            {<NN.*|JJ>*<NN.*>}  
            VP:
            {<V.*>}  
            NP:
            {<NBAR>}
            {<NBAR><IN><NBAR>}  
        """)
    
        a.append(chunker.parse(tagged))
        b.append(len(chunker.parse(tagged).productions()))
        e.append(chunker.parse(tagged).productions())
        c.append(chunker.parse(tagged).height())

    df.loc[:, 'Speech_parsed'] = a
    df.loc[:, 'Productions_count'] = b
    df.loc[:, 'Tree_depth'] = c
  

    for i in e:
        vp = []
        for u in i:
            if str(u).startswith('VP'):
                vp.append(u)
        d.append(len(vp))
  
    df.loc[:, 'VP_count'] = d
    
    df = df.drop(['Speech_parsed'], axis=1)
    
    return df

In [None]:
df = syntactic_features(df, df['Speech'])

4. Prepare the dataframe for training and testing. Also, include bag-of-words and ngram features.

In [11]:
#splitting as the authors did 
df_train = df[df['Set'] == 'TRAIN']
df_val = df[df['Set'] == 'VALIDATION']
df_test = df[df['Set'] == 'TEST']

df_train = df_train.drop(['Set'], axis=1)
df_test = df_test.drop(['Set'], axis=1)

X_train = df_train.drop(['Annotation'], axis=1)
y_train = df_train.Annotation
X_test = df_test.drop(['Annotation'], axis=1)
y_test = df_test.Annotation

In [12]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
train_vecs = vectorizer.fit_transform(X_train['Speech'])
test_vecs = vectorizer.transform(X_test['Speech'])

names = vectorizer.get_feature_names()
dense = train_vecs.todense()
denselist = dense.tolist()
fe = pd.DataFrame(denselist, columns = names)

X_train = X_train.drop(['Speech'], axis=1)
train_features = np.hstack([X_train, fe])

in_dim = train_features.shape[1]
print(in_dim)
print(train_features.shape)
print(train_features)

10034
(10464, 10034)
[[2. 6. 0. ... 0. 0. 0.]
 [5. 3. 0. ... 0. 0. 0.]
 [1. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 2. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [13]:
names = vectorizer.get_feature_names()
dense = test_vecs.todense()
denselist = dense.tolist()
fe = pd.DataFrame(denselist, columns = names)

X_test = X_test.drop(['Speech'], axis=1)
test_features = np.hstack([X_test, fe])

5. The feed forward neural network is created with 2 hidden layers of 64/32 neurons and relu/sigmoid activation functions respectively. The output layer has the sigmoid activation function. The model is compiled with binary_crossentropy loss function and adam optimizer.

In [14]:
#keras NN model initialization

model = keras.Sequential([
    #input layer
    keras.layers.Dense(in_dim,input_shape=(in_dim,)),
    #hidden layer
    keras.layers.Dense(64, activation='relu'),
    #hidden layer
    keras.layers.Dense(32, activation='sigmoid'),
    #output layer
    keras.layers.Dense(1, activation='sigmoid'),   
])

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['AUC'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 10034)             100691190 
                                                                 
 dense_1 (Dense)             (None, 64)                642240    
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 1)                 33        
                                                                 
Total params: 101,335,543
Trainable params: 101,335,543
Non-trainable params: 0
_________________________________________________________________


In [15]:
train_features = np.asarray(train_features)
y_train = np.asarray(y_train)    
model.fit(train_features, y_train, epochs=1, batch_size=8)
model.fit(train_features, y_train, epochs=1, batch_size=128)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


<keras.callbacks.History at 0x29bb375f6c8>

In [16]:
y_pred = model.predict(test_features)
for i in range(len(y_pred)):
    if y_pred[i][0] > 0.5:
        y_pred[i][0] = 1
    else:
        y_pred[i][0] = 0
            
target_names = ['Premise', 'Claim']
print(classification_report(y_test, y_pred, target_names=target_names, digits=3))

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
              precision    recall  f1-score   support

     Premise      0.650     0.619     0.634      3214
       Claim      0.652     0.682     0.666      3361

    accuracy                          0.651      6575
   macro avg      0.651     0.650     0.650      6575
weighted avg      0.651     0.651     0.651      6575

