In [None]:
import io
import os
import re
import sys
import glob

import string
import numpy as np
import pandas as pd
from sklearn import svm
from tqdm import tqdm
from nltk import tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import keras
import tensorflow as tf
import tensorflow_addons as tfa
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Embedding, Flatten
from keras.layers.core import Dense
from keras.layers.core import Activation
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_model

from nltk.tokenize import word_tokenize

from collections import Counter

# Raise NumPy's summarisation threshold to sys.maxsize so printed arrays are
# shown in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Read the sentence-level dataset from a CSV file located relative to the
# notebook's working directory.
filename = 'sentence_db_candidate.csv'
df = pd.read_csv(filepath_or_buffer=filename)

In [None]:
def preproc(sentence):
    """Lower-case a sentence and remove every ASCII punctuation character.

    Args:
        sentence: The text to normalise (a ``str``).

    Returns:
        The lower-cased text with all characters listed in
        ``string.punctuation`` removed. Whitespace and any non-ASCII
        punctuation are left untouched.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return sentence.lower().translate(strip_punctuation)

In [None]:
# Normalise every sentence (lower-case, punctuation stripped) element-wise.
df['Speech'] = df['Speech'].map(preproc)

In [None]:
# Keep only the rows whose 'Component' value is one of the two labels of interest.
valid = ['Claim', 'Premise']
df = df[df['Component'].isin(valid)]

In [None]:
# Binary target, one entry per row of df: 1.0 for 'Claim', 0.0 for anything else.
classes = [1.0 if component == 'Claim' else 0.0 for component in df.Component]

In [None]:
# Attach the numeric labels (1.0 = Claim, 0.0 = otherwise) as a new column.
# `assign` returns a new frame, so this does not write into a filtered slice
# of the original data (which is what triggers SettingWithCopyWarning).
df = df.assign(Annotation=classes)

# Show the class balance. A bare expression in the middle of a cell is
# evaluated and thrown away, so it has to be printed explicitly.
print(df.Annotation.value_counts())

# Keep only the columns used downstream: text, label and the split marker.
df = df[['Speech', 'Annotation', 'Set']]

# NOTE(review): this value is reassigned to 149 in a later cell before it is
# read anywhere; confirm which of the two values is intended.
max_seq_len = 150

In [None]:
# Partition the rows according to the predefined split stored in the 'Set' column.
df_train, df_val, df_test = (
    df[df['Set'] == split_name]
    for split_name in ('TRAIN', 'VALIDATION', 'TEST')
)

In [None]:
# Task 1: gather the sentences (for feature engineering) and their labels as
# plain Python lists. A label of 1.0 marks a 'Claim'; 0.0 marks the other class.
# After the column selection above, column 0 is 'Speech' and column 1 is
# 'Annotation', so they are addressed by name here.
all_sentences = df['Speech'].tolist()
all_sentences_train = df_train['Speech'].tolist()
all_sentences_test = df_test['Speech'].tolist()

all_labels_train = df_train['Annotation'].tolist()
all_labels_test = df_test['Annotation'].tolist()

# Quick sanity check: first five training sentences with their labels.
print(all_sentences_train[:5])
print(all_labels_train[:5])

In [None]:
# Location of the pre-trained vectors, read in the (non-binary) word2vec text format.
FT = "fasttext/wiki-news-300d-1M.vec"
fasttext = KeyedVectors.load_word2vec_format(FT, binary=False)

In [None]:
# Tokenise every sentence with NLTK's word_tokenize: once for the full corpus
# (used to build the vocabulary) and once each for the train and test splits.
all_sent_tokenized = [word_tokenize(sentence) for sentence in all_sentences]
all_sent_train_tokenized = [word_tokenize(sentence) for sentence in all_sentences_train]
all_sent_test_tokenized = [word_tokenize(sentence) for sentence in all_sentences_test]
longest_word_len = []

#print("longest sentence: ", max(all_sent_tokenized,key=len))
#print("longest sentence length: ", len(max(all_sent_tokenized,key=len)))

In [None]:
# Sequence length to which the index sequences are padded/truncated below.
# This replaces the value 150 assigned in an earlier cell.
# NOTE(review): presumably the length of the longest tokenized sentence — confirm.
max_seq_len = 149

In [None]:
# Count how often each token occurs across the whole corpus.
vocab = Counter()

for sent in tqdm(all_sent_tokenized):
    vocab.update(sent)

# The embedding table built later has `unique_words` rows (indices
# 0 .. unique_words - 1).
unique_words = len(fasttext)

# Index 0 is reserved for padding and unknown tokens, and real words are
# numbered from 1 by descending frequency. Only `unique_words - 1` words are
# kept so the largest index is unique_words - 1 and never falls outside the
# embedding table.
word_index = {
    word: idx
    for idx, (word, _count) in enumerate(vocab.most_common(unique_words - 1), start=1)
}

# Encode each tokenized sentence as a list of word indices (0 = unknown token).
sequences_train = [[word_index.get(t, 0) for t in sent] for sent in all_sent_train_tokenized]
sequences_test = [[word_index.get(t, 0) for t in sent] for sent in all_sent_test_tokenized]

In [None]:
# Bring every index sequence to exactly max_seq_len entries: shorter ones are
# left-padded with 0, longer ones lose their tail.
_pad_options = dict(maxlen=max_seq_len, padding="pre", truncating="post")

data_train = pad_sequences(sequences_train, **_pad_options)
print('Shape of data:', data_train.shape)

data_test = pad_sequences(sequences_test, **_pad_options)
print('Shape of data:', data_test.shape)

In [None]:
# Build the (unique_words, 300) embedding weight matrix.
# Every row starts as small uniform random noise in [-0.1, 0.1); rows whose
# word has a pre-trained vector are then overwritten with that vector.
embedding_matrix = (np.random.rand(unique_words, 300) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= unique_words:
        # This index has no row in the matrix; skip it.
        continue
    try:
        embedding_matrix[i] = fasttext[word]
    except KeyError:
        # No pre-trained vector for this word: its row keeps the random
        # initialisation (it is NOT zeroed). Only the lookup miss is
        # tolerated; any other error (e.g. a dimension mismatch) now surfaces
        # instead of being silently swallowed.
        pass

In [None]:
# Frozen embedding layer initialised from embedding_matrix; it is created and
# built inside a CPU device scope.
with tf.device('cpu:0'):
  embedding_layer = Embedding(len(fasttext), 300, weights = [embedding_matrix] , trainable=False)
  embedding_layer.build((len(fasttext), 300))

# Binary sentence classifier: embeddings -> bidirectional LSTM -> sigmoid unit.
model = Sequential()
model.add(embedding_layer)
# No `input_shape` here: Sequential only honours it on the first layer, and the
# previously supplied (300, 1) did not describe this layer's real input, which
# is the embedding output of shape (sequence length, 300).
model.add(Bidirectional(LSTM(200, return_sequences=False, activation="sigmoid")))
model.add(Dense(1,activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["AUC"])
model.summary()

In [None]:
# Make sure both the inputs and the labels are NumPy arrays, then train for a
# single epoch, updating the weights after every individual sample.
data_train, all_labels_train = (
    np.asarray(values) for values in (data_train, all_labels_train)
)
model.fit(data_train, all_labels_train, batch_size=1, epochs=1)

In [None]:
y_pred = model.predict(data_test)

# Turn the predicted probabilities into hard labels in place:
# strictly greater than 0.5 becomes 1, everything else becomes 0.
y_pred[:, 0] = y_pred[:, 0] > 0.5

In [None]:
# Display names in sorted label order: 0.0 -> 'Premise', 1.0 -> 'Claim'.
target_names = ['Premise', 'Claim']
report = classification_report(
    all_labels_test,
    y_pred,
    target_names=target_names,
    digits=3,
)
print(report)