<h2>Load the data</h2>

In [16]:
import os
import re
import sys
import glob
import numpy as np
import pandas as pd
from afinn import Afinn
from sklearn import svm
from nltk import tokenize
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import keras
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.core import Activation
from tensorflow.keras.utils import to_categorical

# Raise the summarization threshold to the largest possible value so numpy
# prints arrays in full instead of abbreviating them.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Collect the debate transcripts (.txt) and their annotation files (.ann).
# Edit the two glob patterns below to narrow the selection of files to be read in.
#
# Both lists are sorted: glob.glob() returns matches in arbitrary order, while
# the label-building cell pairs ind_all[i] (built from txt_files[i]) with
# ann_all[i] (built from ann_files[i]) purely by position. Backslashes are
# normalised to forward slashes so the paths look the same on every platform.
txt_files = sorted(name.replace('\\', '/') for name in glob.glob('ElecDeb60To16/*.txt'))
ann_files = sorted(name.replace('\\', '/') for name in glob.glob('ElecDeb60To16/*.ann'))

In [3]:
#This block reads in all the sentences and annotations

# Patterns removed from every line before sentence tokenization, applied in the
# listed order (anchor marker, then increasingly generic speaker-tag shapes,
# then bracketed notes). The last pattern's character class contains both \s
# and \S, so it matches any character: it deletes from the first '[' through
# the end of the line.
_LINE_CLEANUP_PATTERNS = [re.compile(pattern) for pattern in (
    r'(:.+ANCHORRRR)',
    r'(\([A-Z]+\)_[A-Z]+(\_[A-Z]+)?: )',
    r'(\([A-Z\s]+:[A-Za-z\.\s]+\))',
    r'([A-Z]+\_[A-Z]+\_[A-Z]+: )',
    r'([A-Z]+\_[A-Z]+: )',
    r'([A-Z]+\.\s[A-Z]+: )',
    r'([A-Z]+\'[A-Z]+: )',
    r'(\_[A-Z]+: )',
    r'([A-Z]+: )',
    r'\[[a-zA-Z0-9;,\s\"\'\.\S]*\]*',
)]


def _clean_line(line):
    """Return `line` with every pattern in _LINE_CLEANUP_PATTERNS removed, in order."""
    for pattern in _LINE_CLEANUP_PATTERNS:
        line = pattern.sub('', line)
    return line


def _read_sentences(path):
    """Split one transcript into sentences and locate each one in the raw text.

    Returns a list of [sentence, index] pairs. `index` is the character
    position of the sentence in the unmodified file contents, or -1 when the
    sentence cannot be found there (e.g. because cleaning altered it).
    """
    # Read the file once; the raw text and the per-line view come from the same read.
    with open(path) as infile:
        lines = infile.readlines()
    text = ''.join(lines)

    sent_list = []
    search_from = 0
    for line in lines:
        for sent in tokenize.sent_tokenize(_clean_line(line)):
            ind = text.find(sent, search_from)
            if ind != -1:
                # Continue after this sentence so a repeated sentence gets its
                # own position. A miss (-1) must not move the cursor: a negative
                # start would make every later search begin at the end of the
                # text and fail too.
                search_from = ind + len(sent)
            sent_list.append([sent, ind])
    return sent_list


sent_all = []
ind_all = []

for file in txt_files:
    sent_list = _read_sentences(file)
    sent_all.append(sent_list)
    ind_all.append(np.array([ind for _, ind in sent_list]))

ann_all = []

for file in ann_files:
    # Each annotation line is kept as its list of tab-separated fields.
    with open(file) as infile:
        ann_all.append([line.split("\t") for line in infile])

#Up to this point sent_all contains every debate (42 total) tokenized into sentences.
#sent_all = [sent_list, sent_list, sent_list,...]
#sent_list = [[sent1, index],[sent2, index],[sent3,index],...]   (index is -1 if the sentence was not found)
#ind_all = [ind_list, ind_list, ind_list,...]   (each ind_list is a numpy array)
#ind_list = [ind, ind, ind,...]

#ann_all = [ann_list, ann_list, ann_list, ...]
#ann_list = [['T1', 'Claim/Premise xxx xxx', '...'], ['T2', 'Claim/Premise xxx xxx', '...'],
    #['T3', 'Claim/Premise xxx xxx', '...'], ...]

In [4]:
#following part creates labels for task 1 (whether a sentence contains an argument component)

def _sentence_start(sent_starts, offset):
    """Return the largest sentence start index in `sent_starts` that is <= `offset`.

    `sent_starts` is a numpy array of sentence start indices. numpy raises
    ValueError if no entry is <= `offset`.
    """
    return sent_starts[sent_starts <= offset].max()


labels_arg_all = []

for i in range(len(ind_all)):
    sent_starts = ind_all[i]
    arg_sent_starts = set()

    #loop through each annotation entry, get the corresponding sentence starting index
    for entry in ann_all[i]:
        span_field = entry[1]

        #first fragment: the first number in the field is where the phrase starts
        first_offset = int(re.search(r'[0-9]+\s', span_field).group(0))
        arg_sent_starts.add(_sentence_start(sent_starts, first_offset))

        #second fragment (if any) starts right after the first ';'
        second = re.search(r';[0-9]+\s', span_field)
        if second is not None:
            arg_sent_starts.add(_sentence_start(sent_starts, int(second.group(0)[1:])))

    #column 0 = sentence starting index, column 1 = 1 if the sentence holds an annotation, else 0
    labels_arg_arr = np.zeros((len(sent_starts), 2), dtype=int)
    labels_arg_arr[:, 0] = sent_starts
    labels_arg_arr[:, 1] = np.isin(sent_starts, list(arg_sent_starts))

    labels_arg_all.append(labels_arg_arr)

#following part creates labels for task 2 (whether a sentence contains a premise or a claim)

# (column of labels_cp_arr to accumulate into, pattern capturing fragment start and end).
# The component name is matched as [cC]laim / [pP]remise. The previous form
# r'c|Claim...' parsed as "a lone 'c'" OR "'Claim...'", so a lowercase label would
# have matched a single character and then failed on the offset lookup.
_COMPONENT_SPAN_PATTERNS = (
    #claim, first fragment
    (1, re.compile(r'[cC]laim\s([0-9]+)\s([0-9]+)')),
    #claim, second fragment of a two-part span
    (1, re.compile(r'[cC]laim[0-9\s]+;([0-9]+)\s([0-9]+)')),
    #premise, first fragment
    (2, re.compile(r'[pP]remise\s([0-9]+)\s([0-9]+)')),
    #premise, second fragment of a two-part span
    (2, re.compile(r'[pP]remise[0-9\s]+;([0-9]+)\s([0-9]+)')),
)

labels_cp_all = []

for i in range(len(labels_arg_all)):
    arg_rows = labels_arg_all[i]
    sent_starts = ind_all[i]

    #keep only the sentences labeled as containing an argument component
    cp_starts = arg_rows[arg_rows[:, 1] != 0, 0]

    #col 0 = sentence start, col 1 = accumulated claim length, col 2 = accumulated premise length
    labels_cp_arr = np.zeros((len(cp_starts), 3))
    labels_cp_arr[:, 0] = cp_starts

    for entry in ann_all[i]:
        for col, pattern in _COMPONENT_SPAN_PATTERNS:
            match = pattern.search(entry[1])
            if match is None:
                continue
            begin, end = int(match.group(1)), int(match.group(2))
            row_mask = labels_cp_arr[:, 0] == _sentence_start(sent_starts, begin)
            labels_cp_arr[row_mask, col] += end - begin

    #compare length of claim and premise components in a sentence, and label the sentence
    #according to the longer component (a tie is labeled as claim)
    labels_cp_out = np.zeros((len(cp_starts), 2))
    labels_cp_out[:, 0] = labels_cp_arr[:, 0]
    labels_cp_out[:, 1] = labels_cp_arr[:, 1] >= labels_cp_arr[:, 2]

    labels_cp_all.append(labels_cp_out)

#labels_arg_all = [labels_arg_arr, labels_arg_arr, labels_arg_arr, ...]
#labels_arg_arr = np array of size (len(ind_list),2), no. of rows = no. of sentences in a txt file,
    #col 0 = starting index of sentence, col 1 = label (1 for contain claim/premise, 0 for otherwise)

#labels_cp_all = [labels_cp_out, labels_cp_out, labels_cp_out, ...]
#labels_cp_out = np array of size (no. of argument sentences, 2),
    #no. of rows = no. of sentences that contain at least 1 claim or 1 premise,
    #col 0 = starting index of such a sentence, col 1 = label (1 for claim, 0 for premise)

In [10]:
filename = 'updated_csv.csv'
df = pd.read_csv(filename)

# Keep the rows that carry an argument label. Selecting the positive labels is
# safer than `!= 'None'`: read_csv may parse the literal text "None" as a
# missing value (it is among the default NA strings in newer pandas releases),
# in which case NaN != 'None' is True for every row and nothing is filtered out.
df2 = df[df.Annotation.isin(['Claim', 'Premise'])]

#task 1, compile all sentences(for feature engineering) and corresponding labels, 1 for containing argument component
all_sentences = df.iloc[:, 1].tolist()
all_labels = [1 if label in ("Claim", "Premise") else 0 for label in df.iloc[:, 2].tolist()]

#task 2, compile only sentences containing claim/premise(for feature engineering) and corresponding labels, 1 for claim
cp_sentences = df2.iloc[:, 1].tolist()
cp_labels = [1 if label == "Claim" else 0 for label in df2.iloc[:, 2].tolist()]

In [37]:
analyzer = SentimentIntensityAnalyzer()

# Feature rows for task 1: for every sentence, the values of the analyzer's
# polarity-score dict, in the order the dict yields them.
all_features = [list(analyzer.polarity_scores(text).values()) for text in all_sentences]

# Same features for task 2, restricted to the claim/premise sentences.
cp_features = [list(analyzer.polarity_scores(text).values()) for text in cp_sentences]

In [36]:
# Task 1 baseline: hold out 25% of the sentences for evaluation (fixed seed).
X_train, X_test, y_train, y_true = train_test_split(
    all_features,
    all_labels,
    test_size=0.25,
    random_state=42,
)

#tinkering with svm
clf = svm.SVC(kernel='linear', random_state=1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Flatten the 2x2 confusion matrix into (tn, fp, fn, tp) order for printing.
# NOTE(review): the recorded output below has tn = fn = 0, i.e. no test
# sentence was predicted as class 0.
confusion_counts = confusion_matrix(y_true, y_pred).ravel()
print("tn, fp, fn, tp: ", confusion_counts)

tn, fp, fn, tp:  [   0 3251    0 5800]


In [39]:
#keras NN model initialization

model = keras.Sequential()
#input layer: 4 units over 4 input features (presumably the sentiment scores - confirm), no activation
model.add(keras.layers.Dense(4, input_shape=(4,)))
#hidden layer
model.add(keras.layers.Dense(20, activation='relu'))
#output layer: one softmax unit per class, to go with the one-hot labels
model.add(keras.layers.Dense(2, activation='softmax'))

model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
#model.summary()

In [None]:
# Convert the feature lists to arrays and one-hot encode the labels, as
# required by the categorical_crossentropy loss the model was compiled with.
all_features_arr = np.array(all_features)
all_labels_cat = to_categorical(all_labels)
cp_features_arr = np.array(cp_features)
cp_labels_cat = to_categorical(cp_labels)

#tinkering with keras NN
# Same 75/25 split and seed as the SVM cell; note this rebinds X_train/X_test/y_train/y_true.
X_train, X_test, y_train, y_true = train_test_split(
    all_features_arr,
    all_labels_cat,
    test_size=0.25,
    random_state=42,
)
model_fit = model.fit(X_train, y_train, batch_size=32, epochs=10)