In [1]:
import io
import os
import re
import sys
import glob
import numpy as np
import pandas as pd
from sklearn import svm
from nltk import tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, Embedding, Flatten
from keras.layers.core import Dense
from keras.layers.core import Activation
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

from gensim.models import FastText
from gensim.models import KeyedVectors
from gensim.models.fasttext import load_facebook_model

from nltk.tokenize import word_tokenize

from collections import Counter

# Raise NumPy's summarisation threshold to the largest int so that arrays are
# always printed in full instead of being abbreviated with "...".
np.set_printoptions(threshold=sys.maxsize)

Using TensorFlow backend.


In [2]:
filename = 'updated_csv.csv'
df = pd.read_csv(filename)
# Keep only the rows whose annotation differs from the literal string 'None'.
df2 = df[df.Annotation != 'None']

# Task 1: every sentence (column 1) with a binary label derived from column 2:
# 1 when the annotation is an argument component (Claim or Premise), else 0.
all_sentences = df.iloc[:, 1].tolist()
all_labels = [
    1 if annotation in ("Claim", "Premise") else 0
    for annotation in df.iloc[:, 2].tolist()
]

# Task 2: only the annotated sentences, with a binary label:
# 1 when the annotation is a Claim, 0 for everything else that remains.
cp_sentences = df2.iloc[:, 1].tolist()
cp_labels = [
    1 if annotation == "Claim" else 0
    for annotation in df2.iloc[:, 2].tolist()
]

In [3]:
# Relative path to the pretrained vectors file that is loaded into `fasttext`.
FT = "fasttext/wiki-news-300d-1M.vec"

In [4]:
# Load the vectors at FT as gensim KeyedVectors, using the loader's default
# (non-binary) word2vec format.
fasttext = KeyedVectors.load_word2vec_format(FT)

In [5]:
# Sanity check on the loaded vectors: nearest neighbours of 'economy'.
similarities = fasttext.most_similar(positive=['economy'])
# Bare expression so the notebook displays the result.
similarities

[('economies', 0.7268105149269104),
 ('Economy', 0.6976400017738342),
 ('ecomony', 0.6842233538627625),
 ('ecomomy', 0.6667296886444092),
 ('economics', 0.6634849905967712),
 ('economic', 0.6561825275421143),
 ('society', 0.6531913876533508),
 ('econmy', 0.6524451375007629),
 ('recession', 0.6477683186531067),
 ('econony', 0.6444004774093628)]

In [6]:
# Tokenize each sentence and record, per sentence, the length of its longest token.
all_sent_tokenized = [word_tokenize(sentence) for sentence in all_sentences]
longest_word_len = [len(max(tokens, key=len)) for tokens in all_sent_tokenized]

# The sentence with the most tokens, looked up once and reused for both prints.
longest_sentence = max(all_sent_tokenized, key=len)

print("longest word: ", max(longest_word_len))
print("longest sentence: ", longest_sentence)
print("longest sentence length: ", len(longest_sentence))

# Fixed sequence length used when padding/truncating the index sequences.
max_seq_len = 180

longest word:  28
longest sentence:  ['Now', ',', 'when', 'we', 'have', 'a', 'presidential', 'candidate', ',', 'for', 'example', '-', 'Senator', 'Kennedy', '-', 'stating', 'over', 'and', 'over', 'again', 'that', 'the', 'United', 'States', 'is', 'second', 'in', 'space', 'and', 'the', 'fact', 'of', 'the', 'matter', 'is', 'that', 'the', 'space', 'score', 'today', 'is', 'twenty-eight', 'to', 'eight', '-', 'we', "'ve", 'had', 'twenty-eight', 'successful', 'shots', ',', 'they', "'ve", 'had', 'eight', ';', 'when', 'he', 'states', 'that', 'we', "'re", 'second', 'in', 'education', ',', 'and', 'I', 'have', 'seen', 'Soviet', 'education', 'and', 'I', "'ve", 'seen', 'ours', ',', 'and', 'we', "'re", 'not', ';', 'that', 'we', "'re", 'second', 'in', 'science', 'because', 'they', 'may', 'be', 'ahead', 'in', 'one', 'area', 'or', 'another', ',', 'when', 'overall', 'we', "'re", 'way', 'ahead', 'of', 'the', 'Soviet', 'Union', 'and', 'all', 'other', 'countries', 'in', 'science', ';', 'when', 'he', 'says', '

In [7]:
# Number of rows in the pretrained embedding table; also the upper bound
# (exclusive) for any word index fed to the embedding layer.
unique_words = len(fasttext)

# Count how often each token occurs in the tokenized corpus. The previous code
# counted nothing (empty Counter), which left word_index empty and mapped
# every input position to 0.
token_counts = Counter(
    token for tokens in all_sent_tokenized for token in tokens
)

# Rank tokens by frequency, starting at 1 so that 0 stays reserved for
# padding / unknown tokens. Capping at unique_words - 1 keeps the largest
# index strictly below unique_words, i.e. inside the embedding table.
word_index = {
    word: rank
    for rank, (word, _) in enumerate(
        token_counts.most_common(unique_words - 1), start=1
    )
}

# Map every *token* (not every character of the raw string) to its index.
sequences = [
    [word_index.get(token, 0) for token in tokens]
    for tokens in all_sent_tokenized
]

# Left-pad with zeros up to max_seq_len; longer sentences lose their tail.
data = pad_sequences(sequences, maxlen=max_seq_len, padding="pre", truncating="post")

print('Shape of data:', data.shape)

Shape of data: (36201, 180)


In [16]:
# Initialise every row with small random values in [-0.1, 0.1); rows are then
# overwritten below for each indexed word that has a pretrained vector.
embedding_matrix = (np.random.rand(unique_words, 300) - 0.5) / 5.0
for word, i in word_index.items():
    # Indices that do not fit into the matrix cannot be stored; skip them.
    if i >= unique_words:
        continue
    try:
        embedding_matrix[i] = fasttext[word]
    except KeyError:
        # Word has no pretrained vector: keep its random initialisation.
        # Only KeyError is ignored so that genuine problems (e.g. a vector
        # of the wrong dimensionality) are no longer silently swallowed.
        continue

In [8]:
# One-hot encode the 0/1 task-1 labels for use as the network's targets.
all_labels_cat = to_categorical(all_labels)

In [9]:
# Hold out 25% of the padded sequences for testing; the fixed random_state
# makes the split repeatable. y_true holds the one-hot labels of the test part.
X_train, X_test, y_train, y_true = train_test_split(data, all_labels_cat, test_size=0.25, random_state=42)

In [10]:
# Size of the pretrained vocabulary (number of entries in `fasttext`).
# NOTE(review): the same value is already assigned to unique_words in an
# earlier cell, so this re-assignment only serves the print below.
unique_words = len(fasttext)
print(unique_words)

999994


In [11]:
# Plain-text dump of the DataFrame read from the CSV, for a quick inspection.
print(df)

       Index                                          Sentences Annotation
0          7                                      Good evening.       None
1         21  The television and radio stations of the Unite...       None
2        247               The candidates need no introduction.       None
3        284  The Republican candidate, Vice President Richa...       None
4        398  According to rules set by the candidates thems...       None
...      ...                                                ...        ...
36196  46019                                    Thank you both.       None
36197  46035  While millions have already voted, Election Da...       None
36198  46120  One thing everyone here can agree on: We hope ...       None
36199  46184  It is one of the honors and obligations of liv...       None
36200  46257                         Thank you, and good night.       None

[36201 rows x 3 columns]


In [38]:
# Create the frozen embedding layer inside a CPU device scope. The table has
# len(fasttext) x 300 entries (presumably kept on the CPU because of its
# size -- TODO confirm).
with tf.device('cpu:0'):
    embedding_layer = Embedding(len(fasttext), 300, trainable=False)
    embedding_layer.build((len(fasttext), 300))
    # Load the pretrained matrix explicitly. Because the layer is built by
    # hand, relying on the constructor's `weights=` argument is fragile:
    # NOTE(review): standalone Keras appears to apply it only when the layer
    # is built through its first call, which never happens here -- confirm.
    # Setting the weights after build() works either way.
    embedding_layer.set_weights([embedding_matrix])

model = Sequential()
model.add(embedding_layer)
# The recurrent layer takes its input shape from the embedding output, so no
# explicit input_shape is passed (the old (300, 1) did not describe it anyway).
model.add(Bidirectional(LSTM(100, return_sequences=False)))
# The targets are one-hot vectors over two mutually exclusive classes, so use
# a softmax output with categorical cross-entropy; 'accuracy' then measures
# whether the predicted class matches, not per-output binary agreement.
model.add(Dense(2, activation="softmax"))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, None, 300)         299998200 
_________________________________________________________________
bidirectional_13 (Bidirectio (None, 200)               320800    
_________________________________________________________________
dense_12 (Dense)             (None, 2)                 402       
Total params: 300,319,402
Trainable params: 321,202
Non-trainable params: 299,998,200
_________________________________________________________________


In [39]:
# Train on the training split for 10 epochs; batch size and all other options
# are left at the library defaults and no validation data is supplied.
model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.callbacks.History at 0x1e7103da808>