In [None]:
import collections
import re
import string
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer

import keras
import tensorflow_addons as tfa
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Activation
from tensorflow.keras.utils import to_categorical

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Name of the CSV file to load; a relative path, so it is resolved against the kernel's working directory.
filename = 'sentence_db_candidate.csv'
# Parse the CSV into `df`, the working DataFrame that the following cells transform.
df = pd.read_csv(filename)

In [None]:
# Display (n_rows, n_columns) of the freshly loaded frame.
df.shape

In [None]:
def preproc(sentence):
    """Lower-case a sentence and remove every ASCII punctuation character.

    :input: sentence: str, the raw sentence text
    :return: str, the lower-cased text with all characters found in
            string.punctuation deleted (whitespace and digits are kept)
    """
    # A translation table that maps each punctuation character to "delete".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return sentence.lower().translate(strip_punctuation)

In [None]:
# Overwrite the Speech column with its normalised form (lower-cased, punctuation removed).
df['Speech'] = df['Speech'].apply(preproc)

In [None]:
# Component labels that are kept; rows with any other value are discarded.
valid = ['Claim', 'Premise', 'O']
# .copy() makes the filtered frame an independent object, so the column
# assignments made on `df` in later cells do not trigger SettingWithCopyWarning.
df = df.loc[df['Component'].isin(valid)].copy()

In [None]:
# Collapse the component labels into two classes: 'O' -> 0.0, anything else -> 1.0
classes = [0.0 if label == 'O' else 1.0 for label in df.Component]

In [None]:
# Attach the binary labels built from the Component column.
df['Annotation'] = classes
# Keep only the text, the label and the split marker; .copy() detaches the
# result from the wider frame so later in-place column additions are warning-free.
df = df[['Speech', 'Annotation', 'Set']].copy()
# Last expression of the cell, so the class balance is actually rendered
# (a bare expression in the middle of a cell produces no output).
df.Annotation.value_counts()

In [None]:
def add_connectives (df, speech_sents):

    """ 
    Flag sentences that contain at least one claim connective.

    :input: df: entire DataFrame (modified in place)
            speech_sents: iterable of sentence strings, one per row of df
    :return: df: DataFrame with a new feature Claim_Connective, 
            1 if any connective from the list below occurs in the sentence
            as a whole word/phrase, 0 otherwise

    Matching is case-sensitive, exactly like the connective list itself, so
    the sentences are expected to be lower-cased already.
    """

    connectives = ['so that', 'as a result', 'therefore', 'thus', 'thereby', 'in the end', 'hence', 'accordingly', 'in this way']

    # \b anchors stop a connective from matching inside a longer word:
    # a plain substring test reports 'thus' in 'enthusiasm' and
    # 'so that' in 'also that'.
    pattern = re.compile(r'\b(?:' + '|'.join(re.escape(c) for c in connectives) + r')\b')

    df['Claim_Connective'] = [1 if pattern.search(sent) else 0 for sent in speech_sents]

    return df

In [None]:
# Adds the Claim_Connective column to `df` in place; the returned frame is rendered as the cell output.
add_connectives(df, df['Speech'])

In [None]:
def add_sentiment (df, speech_sents): 
    """
    Add a Sentiment column holding VADER's compound polarity score.

    :input: df: DataFrame to extend (modified in place)
            speech_sents: iterable of sentence strings, one per row of df
    :return: df: the same DataFrame with a new float column Sentiment
    """

    analyzer = SentimentIntensityAnalyzer()

    # Read the score by its key instead of by position in the result dict,
    # so the feature does not depend on the order in which the scores are listed.
    scores = [analyzer.polarity_scores(sent)['compound'] for sent in speech_sents]

    # 1-D float array: one score per row (an (n, 1) array is not needed for a single column).
    df['Sentiment'] = np.asarray(scores, dtype=float)

    return df

In [None]:
# Adds the Sentiment column to `df` in place; the returned frame is rendered as the cell output.
add_sentiment(df, df['Speech'])

In [None]:
# Load the spaCy pipeline named "en_core_web_sm" once; the same object is reused for POS tagging in a later cell.
spacy_loaded = spacy.load("en_core_web_sm")
# tag text and extract tags into a list

# For every sentence, store the list of (entity text, entity label) pairs found by the pipeline.
df['ner'] = df['Speech'].apply(lambda x: [(tag.text, tag.label_) 
                                for tag in spacy_loaded(x).ents])

In [None]:
import collections

# utils function to count the element of a list

def utils_lst_count(lst):
    """Count the occurrences of each element of a list.

    :input: lst: iterable of hashable items
    :return: list of single-entry dicts {item: count}, ordered by count
            descending; items with equal counts keep first-seen order
    """
    # most_common() performs the same stable, count-descending sort.
    tally = collections.Counter(lst)
    return [{item: count} for item, count in tally.most_common()]

In [None]:
# Replace each row's entity list with its per-entity occurrence counts
df['ner'] = df['ner'].apply(utils_lst_count)

In [None]:
# utils function create new column for each tag category

def utils_ner_features(lst_dics_tuples, tag):
    """Count how many tagged items of one row carry the label `tag`.

    :input: lst_dics_tuples: list of dicts {(text, label): count}, the
                per-row structure produced by utils_lst_count
            tag: the label to count
    :return: int, the sum of the counts of every key whose second element
            equals `tag`; 0 when the list is empty or nothing matches
    """
    # One pass over the counts: no intermediate list, no Counter rebuilt per
    # key, and no shadowing of the built-ins `tuple` and `type`.
    return sum(
        count
        for dic_tuples in lst_dics_tuples
        for key, count in dic_tuples.items()
        if key[1] == tag
    )

In [None]:
# extract features

# Distinct entity labels seen anywhere in the data. Sorted, because the
# iteration order of a set of strings changes from one kernel run to the next,
# which would otherwise reorder the ner_* columns (and the feature matrix).
tags_set = sorted({key[1] for lst in df['ner'].tolist() for dic in lst for key in dic})

# One count column per entity label.
for feature in tags_set:
    df['ner_' + feature] = df['ner'].apply(lambda x: utils_ner_features(x, feature))

In [None]:
# The intermediate per-row entity counts are no longer needed once the ner_* columns exist
df = df.drop(columns=['ner'])

In [None]:
# For every sentence, store the list of (token text, part-of-speech tag) pairs produced by the spaCy pipeline.
df['pos'] = df['Speech'].apply(lambda x: [(tag.text, tag.pos_) 
                                for tag in spacy_loaded(x)])

In [None]:
# Replace each row's (token, POS) list with its per-pair occurrence counts
df['pos'] = df['pos'].apply(utils_lst_count)

In [None]:
# extract pos

# Distinct POS tags seen anywhere in the data. Sorted, because the iteration
# order of a set of strings changes from one kernel run to the next, which
# would otherwise reorder the pos_* columns.
pos_set = sorted({key[1] for lst in df['pos'].tolist() for dic in lst for key in dic})

# One count column per POS tag (utils_ner_features only counts labels, so it works for POS too).
for feature in pos_set:
    df['pos_' + feature] = df['pos'].apply(lambda x: utils_ner_features(x, feature))

In [None]:
# Keep only the adverb and adjective counts, as the authors did; every other
# column whose name contains 'pos' (including the raw 'pos' column) is removed.
pos_to_drop = [
    column for column in df.columns
    if column not in ('pos_ADV', 'pos_ADJ') and 'pos' in column
]
df = df.drop(columns=pos_to_drop)

In [None]:
# Partition the rows by the predefined split stored in the Set column, as the authors did
df_train, df_val, df_test = (
    df[df['Set'] == split_name]
    for split_name in ('TRAIN', 'VALIDATION', 'TEST')
)

In [None]:
# The split marker is not a feature: remove it from both partitions
df_train = df_train.drop(columns=['Set'])
df_test = df_test.drop(columns=['Set'])

# Separate the target column (Annotation) from the feature columns
y_train, y_test = df_train['Annotation'], df_test['Annotation']
X_train = df_train.drop(columns=['Annotation'])
X_test = df_test.drop(columns=['Annotation'])

In [None]:
# TF-IDF over word 1- to 3-grams, capped at 10000 features (despite the name `bow`, these are TF-IDF weights).
bow = TfidfVectorizer(max_features=10000, ngram_range=(1,3))
# The vocabulary is learned from the training sentences only ...
bow_train = bow.fit_transform(X_train['Speech'])
# ... and then applied unchanged to the test sentences.
bow_test = bow.transform(X_test['Speech'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on versions that predate get_feature_names_out().
names = bow.get_feature_names_out() if hasattr(bow, 'get_feature_names_out') else bow.get_feature_names()
# toarray() yields a plain ndarray that DataFrame can wrap directly, avoiding
# the round trip through a nested Python list of every matrix cell.
dense = bow_train.toarray()
# TF-IDF features of the training sentences, one named column per n-gram.
fe = pd.DataFrame(dense, columns = names)

In [None]:
# The raw text is now represented by the TF-IDF columns, so drop it from the feature frame
X_train = X_train.drop(columns=['Speech'])

In [None]:
# Place the remaining X_train columns and the TF-IDF columns side by side in one 2-D array
# (rows are matched by position, not by index).
train_features = np.hstack([X_train, fe])

In [None]:
# Number of feature columns; used below as the input width of the network.
in_dim = train_features.shape[1]
print(in_dim)
# (n_training_rows, in_dim)
print(train_features.shape)
# NumPy abbreviates large arrays when printing, so this shows only the corners of the matrix.
print(train_features)

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on versions that predate get_feature_names_out().
names = bow.get_feature_names_out() if hasattr(bow, 'get_feature_names_out') else bow.get_feature_names()
# toarray() yields a plain ndarray that DataFrame can wrap directly, avoiding
# the round trip through a nested Python list of every matrix cell.
dense = bow_test.toarray()
# TF-IDF features of the test sentences; `fe` is reused, replacing the training frame.
fe = pd.DataFrame(dense, columns = names)

In [None]:
# The raw text is now represented by the TF-IDF columns, so drop it from the feature frame
X_test = X_test.drop(columns=['Speech'])

In [None]:
# Same layout as train_features: remaining X_test columns followed by the TF-IDF columns of the test sentences.
test_features = np.hstack([X_test, fe])

In [None]:
#keras NN model initialization
# Stack of fully connected layers: in_dim inputs -> in_dim -> 64 -> 32 -> 1 output.

model = keras.Sequential([
    #input layer
    # NOTE(review): this is a trainable Dense layer with in_dim units and no
    # activation (in_dim x in_dim weights), not a weight-free input declaration.
    # Confirm that is intended; keras.Input(shape=(in_dim,)) declares the input only.
    keras.layers.Dense(in_dim,input_shape=(in_dim,)),
    #hidden layer
    keras.layers.Dense(64, activation='relu'),
    #hidden layer
    keras.layers.Dense(32, activation='sigmoid'),
    #output layer
    # single sigmoid unit: the output is a value in (0, 1), thresholded at 0.5 in a later cell
    keras.layers.Dense(1, activation='sigmoid'),   
])

# Binary cross-entropy loss with the Adam optimizer; precision and recall are reported during training.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['Precision','Recall'])
# Print the layer-by-layer parameter counts.
model.summary()

In [None]:
# Hand Keras plain NumPy arrays (y_train is a pandas Series up to this point).
train_features = np.asarray(train_features)
y_train = np.asarray(y_train)    
# Train for 2 epochs in mini-batches of 128 rows.
# NOTE(review): no random seed is set in the visible cells, so results may differ between runs.
model.fit(train_features, y_train, epochs=2, batch_size=128)

In [None]:
# Raw sigmoid outputs for the test rows; indexed as y_pred[i][0] below, i.e. one value per row.
y_pred = model.predict(test_features)
print(y_pred)

In [None]:
# Turn the predicted probabilities into hard labels in place: 1 above 0.5, otherwise 0
y_pred[:, 0] = np.where(y_pred[:, 0] > 0.5, 1, 0)

In [None]:
# Display names for the two classes, in label order: 0.0 -> 'Not Argument', 1.0 -> 'Is Argument'.
target_names = ['Not Argument', 'Is Argument']
# Per-class precision, recall and F1 on the test split, printed with 3 decimals.
print(classification_report(y_test, y_pred, target_names=target_names, digits=3))