In [2]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import spacy
import collections

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

import keras
import tensorflow_addons as tfa
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Activation
from tensorflow.keras.utils import to_categorical

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [3]:
filename = 'sentence_db_candidate.csv'
df = pd.read_csv(filename)

In [4]:
df.shape

(29621, 18)

In [5]:
def preproc(sentence):
    sentence = sentence.lower()
    sentence = ''.join([i for i in sentence if i not in string.punctuation])
    return sentence

In [6]:
df['Speech'] = df['Speech'].apply(preproc)

In [7]:
valid = ['Claim', 'Premise']
df = df.loc[(df['Component'].isin(valid))]

In [8]:
#turning labels into two classes 
classes = []

for s in df.Component:
    if s == 'Claim':
        classes.append(1.0)
    else:
        classes.append(0.0)

In [9]:
df['Annotation'] = classes
df.Annotation.value_counts()
df = df[['Speech', 'Annotation', 'Set']]

In [10]:
def add_connectives (df, speech_sents):

    """ 
    :input: df: entire DataFrame
            speech_sents: numpy array of text data instances in DataFrame
    :return: df: DataFrame with a new feature Claim_Connective, 
            representing the presence/absence of any connective from a given list in a sentence
    
    """
    
    connectives = ['so that', 'as a result', 'therefore', 'thus', 'thereby', 'in the end', 'hence', 'accordingly', 'in this way']
    lst = []
    
    for sent in speech_sents:
        if any(w in sent for w in connectives):
            lst.append(1)
        else:
            lst.append(0)
    df['Claim_Connective'] = lst
    
    return df

In [11]:
add_connectives(df, df['Speech'])

Unnamed: 0,Speech,Annotation,Set,Claim_Connective
3,and after 911 it became clear that we had to d...,1.0,TRAIN,0
4,and we also then finally had to stand up democ...,0.0,TRAIN,0
9,what we did in iraq was exactly the right thin...,1.0,TRAIN,0
10,if i had it to recommend all over again i woul...,0.0,TRAIN,0
11,the world is far safer today because saddam hu...,0.0,TRAIN,0
...,...,...,...,...
29615,but our longterm security depends on our deep ...,1.0,VALIDATION,0
29616,and well continue to promote freedom around th...,1.0,VALIDATION,0
29617,freedom is on the march,1.0,VALIDATION,0
29618,tomorrow afghanistan will be voting for a pres...,0.0,VALIDATION,0


In [12]:
def add_sentiment (df, speech_sents): 
    
    analyzer = SentimentIntensityAnalyzer()

    senti = []
    
    for sent in speech_sents:
        vs = analyzer.polarity_scores(sent)
        senti.append([list(vs.values())[3]])
    
    senti_arr = np.array(senti)
    df['Sentiment'] = senti_arr
    
    return df 

In [13]:
add_sentiment(df, df['Speech'])

Unnamed: 0,Speech,Annotation,Set,Claim_Connective,Sentiment
3,and after 911 it became clear that we had to d...,1.0,TRAIN,0,-0.7269
4,and we also then finally had to stand up democ...,0.0,TRAIN,0,-0.7721
9,what we did in iraq was exactly the right thin...,1.0,TRAIN,0,0.0000
10,if i had it to recommend all over again i woul...,0.0,TRAIN,0,0.6124
11,the world is far safer today because saddam hu...,0.0,TRAIN,0,0.1531
...,...,...,...,...,...
29615,but our longterm security depends on our deep ...,1.0,VALIDATION,0,0.9081
29616,and well continue to promote freedom around th...,1.0,VALIDATION,0,0.8360
29617,freedom is on the march,1.0,VALIDATION,0,0.6369
29618,tomorrow afghanistan will be voting for a pres...,0.0,VALIDATION,0,0.0000


In [14]:
spacy_loaded = spacy.load("en_core_web_sm")
# tag text and extract tags into a list

df['ner'] = df['Speech'].apply(lambda x: [(tag.text, tag.label_) 
                                for tag in spacy_loaded(x).ents])

In [15]:
import collections

# utils function to count the element of a list

def utils_lst_count(lst):
    dic_counter = collections.Counter()
    
    for x in lst:
        dic_counter[x] += 1
    
    dic_counter = collections.OrderedDict( 
                     sorted(dic_counter.items(), 
                     key=lambda x: x[1], reverse=True))
    
    lst_count = [{key:value} for key,value in dic_counter.items()]
    
    return lst_count

In [16]:
# count tags
df['ner'] = df['ner'].apply(lambda x: utils_lst_count(x))

In [17]:
# utils function create new column for each tag category

def utils_ner_features(lst_dics_tuples, tag):
    if len(lst_dics_tuples) > 0:
        tag_type = []
        for dic_tuples in lst_dics_tuples:
            for tuple in dic_tuples:
                type, n = tuple[1], dic_tuples[tuple]
                tag_type = tag_type + [type]*n
                dic_counter = collections.Counter()
                for x in tag_type:
                    dic_counter[x] += 1
        return dic_counter[tag]
    else:
        return 0

In [18]:
# extract features

tags_set = []

for lst in df['ner'].tolist():
    for dic in lst:
        for k in dic.keys():
            tags_set.append(k[1])
            
tags_set = list(set(tags_set))

for feature in tags_set:
    df['ner_' + feature] = df['ner'].apply(lambda x: utils_ner_features(x, feature))

In [19]:
df = df.drop(['ner'], axis=1)

In [21]:
df['pos'] = df['Speech'].apply(lambda x: [(tag.text, tag.pos_) 
                                for tag in spacy_loaded(x)])

In [22]:
# count tags
df['pos'] = df['pos'].apply(lambda x: utils_lst_count(x))

In [23]:
# extract pos 
pos_set = []

for lst in df['pos'].tolist():
    for dic in lst:
        for k in dic.keys():
            pos_set.append(k[1])
            
pos_set = list(set(pos_set))

for feature in pos_set:
    df['pos_' + feature] = df['pos'].apply(lambda x: utils_ner_features(x, feature))

In [24]:
# keeping only adverbs and adjectives and dropping other pos, like authors had
for feature in df.columns:
    if feature != 'pos_ADV' and feature != 'pos_ADJ' and 'pos' in feature:
        df = df.drop(feature, axis=1)

In [25]:
#splitting as the authors did 
df_train = df[df['Set'] == 'TRAIN']
df_val = df[df['Set'] == 'VALIDATION']
df_test = df[df['Set'] == 'TEST']

In [26]:
df_train = df_train.drop(['Set'], axis=1)
df_test = df_test.drop(['Set'], axis=1)

X_train = df_train.drop(['Annotation'], axis=1)
y_train = df_train.Annotation
X_test = df_test.drop(['Annotation'], axis=1)
y_test = df_test.Annotation

In [27]:
bow = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
bow_train = bow.fit_transform(X_train['Speech'])
bow_test = bow.transform(X_test['Speech'])

In [28]:
names = bow.get_feature_names()
dense = bow_train.todense()
denselist = dense.tolist()
fe = pd.DataFrame(denselist, columns = names)

In [29]:
X_train = X_train.drop(['Speech'], axis=1)

In [30]:
train_features = np.hstack([X_train, fe])

In [31]:
in_dim = train_features.shape[1]
print(in_dim)
print(train_features.shape)
print(train_features)

10022
(10464, 10022)
[[ 0.     -0.7269  0.     ...  0.      0.      0.    ]
 [ 0.     -0.7721  0.     ...  0.      0.      0.    ]
 [ 0.      0.      0.     ...  0.      0.      0.    ]
 ...
 [ 0.     -0.5994  0.     ...  0.      0.      0.    ]
 [ 0.     -0.296   0.     ...  0.      0.      0.    ]
 [ 0.      0.5106  0.     ...  0.      0.      0.    ]]


In [32]:
names = bow.get_feature_names()
dense = bow_test.todense()
denselist = dense.tolist()
fe = pd.DataFrame(denselist, columns = names)

In [33]:
X_test = X_test.drop(['Speech'], axis=1)

In [34]:
test_features = np.hstack([X_test, fe])

In [56]:
#keras NN model initialization

model = keras.Sequential([
    #input layer
    keras.layers.Dense(in_dim,input_shape=(in_dim,)),
    #hidden layer
    keras.layers.Dense(64, activation='relu'),
    #hidden layer
    keras.layers.Dense(32, activation='sigmoid'),
    #output layer
    keras.layers.Dense(1, activation='sigmoid'),   
])

model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['AUC'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 10022)             100450506 
                                                                 
 dense_13 (Dense)            (None, 64)                641472    
                                                                 
 dense_14 (Dense)            (None, 32)                2080      
                                                                 
 dense_15 (Dense)            (None, 1)                 33        
                                                                 
Total params: 101,094,091
Trainable params: 101,094,091
Non-trainable params: 0
_________________________________________________________________


In [57]:
train_features = np.asarray(train_features)
y_train = np.asarray(y_train)    
model.fit(train_features, y_train, epochs=1, batch_size=8)
model.fit(train_features, y_train, epochs=1, batch_size=128)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


<keras.callbacks.History at 0x1f893f884c8>

In [58]:
y_pred = model.predict(test_features)
for i in range(len(y_pred)):
    if y_pred[i][0] > 0.5:
        y_pred[i][0] = 1
    else:
        y_pred[i][0] = 0

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'


In [59]:
target_names = ['Premise', 'Claim']
print(classification_report(y_test, y_pred, target_names=target_names, digits=3))

              precision    recall  f1-score   support

     Premise      0.635     0.613     0.624      3214
       Claim      0.642     0.664     0.653      3361

    accuracy                          0.639      6575
   macro avg      0.639     0.638     0.638      6575
weighted avg      0.639     0.639     0.639      6575

