## Experimental Setting 3: fasttext embedding + LSTM Network 
### Task 1: Classification of Argument (contains either Claim or Premise) vs. non-Argument

In [1]:
import sys
import string
import numpy as np
import pandas as pd
from sklearn import svm
from nltk import tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Embedding, Flatten
from keras.layers.core import Dense
from keras.layers.core import Activation
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_model

from nltk.tokenize import word_tokenize

from collections import Counter

# Raise NumPy's summarisation threshold to the maximum so printed arrays are
# shown in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

**We import the raw dataset as a dataframe and process it to obtain, for each entry, a tokenized sentence with its corresponding label: 1 for argument and 0 for non-argument**

In [2]:
# Location of the sentence database CSV, relative to this notebook.
filename = '../../../data/sentence_db_candidate.csv'
# Read the whole file into a DataFrame; the following cells use its
# `Speech`, `Component` and `Set` columns.
df = pd.read_csv(filepath_or_buffer=filename)

In [3]:
def preproc(sentence):
    """Normalise a sentence: lowercase it and drop ASCII punctuation.

    Args:
        sentence: The raw sentence text (a ``str``).

    Returns:
        The lowercased text with every character listed in
        ``string.punctuation`` removed; all other characters are kept.
    """
    # A translation table with an empty mapping and a deletion set removes
    # the punctuation characters in a single pass.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    return sentence.lower().translate(strip_punctuation)

In [4]:
# Build the binary argument / non-argument dataset and tokenise every sentence.

# Keep only rows annotated with one of the three known component types.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignments below do not raise SettingWithCopyWarning.
valid = ['Claim', 'Premise', 'O']
df = df.loc[df['Component'].isin(valid)].copy()

# Lowercase the text and strip punctuation (see `preproc`); done after the
# filter so that rows which are discarded anyway are not processed.
df['Speech'] = df['Speech'].apply(preproc)

# Binary target: 'O' -> 0.0 (non-argument), 'Claim' / 'Premise' -> 1.0 (argument).
classes = np.where(df['Component'] == 'O', 0.0, 1.0).tolist()
df['Annotation'] = classes
df = df[['Speech', 'Annotation', 'Set']]

# Split according to the `Set` column of the dataset.
df_train = df[df['Set'] == 'TRAIN']
df_val = df[df['Set'] == 'VALIDATION']
df_test = df[df['Set'] == 'TEST']

# Sentences of the whole corpus (all three splits) and of the train / test splits.
all_sentences = df['Speech'].tolist()
all_sentences_train = df_train['Speech'].tolist()
all_sentences_test = df_test['Speech'].tolist()

all_labels_train = df_train['Annotation'].tolist()
all_labels_test = df_test['Annotation'].tolist()

# Never filled in this cell; kept in case a cell outside this one refers to it.
longest_word_len = []

# Tokenise each sentence into a list of word tokens.
all_sent_tokenized = [word_tokenize(sentence) for sentence in all_sentences]
all_sent_train_tokenized = [word_tokenize(sentence) for sentence in all_sentences_train]
all_sent_test_tokenized = [word_tokenize(sentence) for sentence in all_sentences_test]

# The longest tokenised sentence; `default=[]` keeps this from raising on an
# empty corpus.
longest_sentence = max(all_sent_tokenized, key=len, default=[])
print("longest sentence: ", longest_sentence)
print("longest sentence length: ", len(longest_sentence))

longest sentence:  ['now', 'when', 'we', 'have', 'a', 'presidential', 'candidate', 'for', 'example', 'senator', 'kennedy', 'stating', 'over', 'and', 'over', 'again', 'that', 'the', 'united', 'states', 'is', 'second', 'in', 'space', 'and', 'the', 'fact', 'of', 'the', 'matter', 'is', 'that', 'the', 'space', 'score', 'today', 'is', 'twentyeight', 'to', 'eight', 'weve', 'had', 'twentyeight', 'successful', 'shots', 'theyve', 'had', 'eight', 'when', 'he', 'states', 'that', 'were', 'second', 'in', 'education', 'and', 'i', 'have', 'seen', 'soviet', 'education', 'and', 'ive', 'seen', 'ours', 'and', 'were', 'not', 'that', 'were', 'second', 'in', 'science', 'because', 'they', 'may', 'be', 'ahead', 'in', 'one', 'area', 'or', 'another', 'when', 'overall', 'were', 'way', 'ahead', 'of', 'the', 'soviet', 'union', 'and', 'all', 'other', 'countries', 'in', 'science', 'when', 'he', 'says', 'as', 'he', 'did', 'in', 'january', 'of', 'this', 'year', 'that', 'we', 'have', 'the', 'worst', 'slums', 'that', 'we

**We import the fasttext embeddings and then represent each tokenized sentence as indices of the respective tokens in the embedding vocabulary. All sentences will be padded to match the length of the longest sentence in the dataset**

In [5]:
# Path to the pretrained vectors in word2vec text format, relative to this notebook.
FT = "../../wiki-news-300d-1M.vec"
# Load the vectors as a KeyedVectors table (text format, hence binary=False).
fasttext = KeyedVectors.load_word2vec_format(fname=FT, binary=False)

In [6]:
# Every sentence is padded / truncated to exactly this many token ids.
max_seq_len = 149

# Token frequencies over all tokenised sentences.
vocab = Counter(token for sent in all_sent_tokenized for token in sent)

# Upper bound on the vocabulary: the number of entries in the fastText table.
unique_words = len(fasttext)

# Ids are assigned from 1 upwards in order of decreasing frequency.
word_index = {
    token: rank
    for rank, (token, _) in enumerate(vocab.most_common(unique_words), start=1)
}


def _to_ids(tokenized_sentences):
    """Replace each token by its id in `word_index` (0 if it has none)."""
    return [[word_index.get(token, 0) for token in sent] for sent in tokenized_sentences]


sequences_train = _to_ids(all_sent_train_tokenized)
sequences_test = _to_ids(all_sent_test_tokenized)

# Short sentences are padded at the front, long ones are cut at the end.
data_train = pad_sequences(sequences_train, maxlen=max_seq_len, padding="pre", truncating="post")
print('Shape of data:', data_train.shape)

data_test = pad_sequences(sequences_test, maxlen=max_seq_len, padding="pre", truncating="post")
print('Shape of data:', data_test.shape)

Shape of data: (14044, 149)
Shape of data: (8455, 149)


**An embedding matrix is created to initialise the weights of the embedding (input) layer of the LSTM network**

In [7]:
# Start every row with small uniform noise in [-0.1, 0.1). Rows of words that
# have a pretrained vector are overwritten below; all other rows (including
# row 0, which no word id maps to) keep their random values.
embedding_matrix = (np.random.rand(unique_words, 300) - 0.5) / 5.0
for word, i in word_index.items():
    # Ids that do not fit into the matrix are skipped.
    if i >= unique_words:
        continue
    try:
        embedding_vector = fasttext[word]
    except KeyError:
        # The word is not in the pretrained vocabulary: keep its random row.
        # Only KeyError is caught so that real errors (and Ctrl-C) still surface.
        continue
    embedding_matrix[i] = embedding_vector

**The bidirectional LSTM network is created with an embedding layer whose weights are the pretrained fasttext embeddings (dimension 300). We use 128 units per direction for the bidirectional LSTM layer, a sigmoid activation function for the output layer, binary_crossentropy as the loss function, and the adam optimizer**

In [8]:
# Size the embedding layer from the matrix it is initialised with, so the two
# can never disagree.
vocab_size, embedding_dim = embedding_matrix.shape

# The frozen embedding layer is created and built inside a CPU device scope
# (presumably to keep the very large table in host memory -- TODO confirm).
with tf.device('cpu:0'):
  embedding_layer = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], trainable=False)
  embedding_layer.build((vocab_size, embedding_dim))

# Embedding -> bidirectional LSTM (128 units per direction, final state only)
# -> single sigmoid unit for the binary decision.
model = Sequential()
model.add(embedding_layer)
# No `input_shape` here: this is not the first layer, so its input shape is
# taken from the embedding layer's output.
model.add(Bidirectional(LSTM(128, return_sequences=False)))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=["Precision", "Recall"])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 300)         299998200 
                                                                 
 bidirectional (Bidirectiona  (None, 256)              439296    
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 257       
                                                                 
Total params: 300,437,753
Trainable params: 439,553
Non-trainable params: 299,998,200
_________________________________________________________________


**Train and test the model**

In [9]:
# Make sure both inputs are NumPy arrays before handing them to Keras
# (`np.asarray` returns an existing ndarray unchanged).
all_labels_train = np.asarray(all_labels_train)
data_train = np.asarray(data_train)
# Five epochs with mini-batches of 32; the returned History object is the
# value displayed by this cell.
model.fit(data_train, all_labels_train, batch_size=32, epochs=5)

Epoch 1/5
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x12fafa9cf88>

In [10]:
# Model outputs for the test sentences; the single output unit is read from
# column 0 of each row.
y_pred = model.predict(data_test)

# Binarise in place: outputs strictly above 0.5 become 1, everything else 0.
y_pred[:, 0] = np.where(y_pred[:, 0] > 0.5, 1, 0)

# Per-class precision / recall / F1 on the test split.
target_names = ['None', 'Arg']
report = classification_report(all_labels_test, y_pred, target_names=target_names, digits=3)
print(report)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
              precision    recall  f1-score   support

        None      0.748     0.403     0.524      1880
         Arg      0.849     0.961     0.902      6575

    accuracy                          0.837      8455
   macro avg      0.799     0.682     0.713      8455
weighted avg      0.827     0.837     0.818      8455

