## Imports

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction import DictVectorizer

from nltk.tokenize import word_tokenize 
from nltk import pos_tag

import pandas as pd
import os, re
from collections import defaultdict


## Preprocessing

In [35]:
def read_csv(csv_path):
    df = pd.read_csv(csv_path, encoding='utf-8')
    df.drop(df[df['Gold'] == 'None'].index, inplace=True)
    df.reset_index(drop=True, inplace=True)
    
    return df

def count_tag_percent(data_frame):
    print(data_frame['Gold'].value_counts())
    

## Feature selection

In [36]:
def get_words_and_tags(row):
    tokens = word_tokenize(row['Sentence'])
    tags = [t for (w, t) in pos_tag(tokens)]
    return tokens + tags

def get_vocabulary(data_frame):
    vocab = defaultdict(int)
    for index, row in data_frame.iterrows():
        words_and_tags = get_words_and_tags(row)
        for word_or_tag in words_and_tags:
            vocab[word_or_tag] += 1

    common_vocab = {word for word, count in vocab.items() if count > 1}
    return common_vocab

def count_numbers(row):
    matches = re.findall(r'\d+', row['Sentence'])
    return len(matches)

def count_names(row):
    matches = re.findall(r'([A-Z][a-z]*|Mr.|Mrs.|Ms.) [A-Z][a-z]*', row['Sentence'])
    return len(matches)

def count_pronouns(row):
    pronouns = ['I', 'me', 'my', 'we', 'us', 'our']  # Use first-person pronouns only, to find anecdotes?
    words = get_words_and_tags(row)
    count = 0
    for word in words:
        if word in pronouns:
            count += 1
        
    return count

def count_connectives(row):
    # Ideally use a pre-compiled list of connectives
    connectives = ['and','but','yet','so','despite','indeed','while','thus','plus','even','although','however','because','eventually','meanwhile','since','as']
    words = get_words_and_tags(row)
    count = 0
    for word in words:
        if word in connectives:
            count += 1
    return count

## Training and testing methods

In [37]:
def featurize(data_frame, vocab):
    cols = ['_label_']
    feature_cols = list(vocab)
    feature_cols.extend(['number_count', 'name_count', 'pronoun_count', 'connective_count'])
    cols.extend(feature_cols)
    
    row_count = data_frame.shape[0]
    feat_data_frame = pd.DataFrame(index=range(row_count), columns=cols)
    feat_data_frame.fillna(0, inplace=True)
    
    for index, row in data_frame.iterrows():
        feat_data_frame.loc[index,'_label_'] = row['Gold']
        words_and_tags = get_words_and_tags(row)
        
        for word_or_tag in words_and_tags:
            if word_or_tag in vocab:
                feat_data_frame.loc[index, word_or_tag] += 1
                
        feat_data_frame.loc[index, 'number_count'] = count_numbers(row)
        feat_data_frame.loc[index, 'name_count'] = count_names(row)
        feat_data_frame.loc[index, 'pronoun_count'] = count_pronouns(row)
        feat_data_frame.loc[index, 'connective_count'] = count_connectives(row)
                
    return feat_data_frame

In order to be used in classifiers, the feature representations in the DataFrame need to be transformed into a matrix, and the labels need to be transformed into a vector. The shape of the feature matrix is (# of instances, # of features). 

In [38]:
def vectorize(df):
    df.fillna(0, inplace=True)
    data = list()
    for index, row in df.iterrows():
        datum = dict()
        datum['bias'] = 1
        for col in df.columns:
            if not (col == "_label_" or col == 'index'):
                datum[col] = row[col]
        data.append(datum)
    vec = DictVectorizer()
    data = vec.fit_transform(data).toarray()
    
    labels = df._label_.values
    
    return data, labels

In [39]:
def train_model(X_train,y_train, model):
    model.fit(X_train,y_train)
    
    return model

In [40]:
def test_model(X_test, y_test, model):
    predictions = model.predict(X_test)
    report = classification_report(predictions, y_test)
    accuracy = accuracy_score(predictions, y_test)
    return accuracy, report

In [41]:
def classify(training_dataframe, test_dataframe):
    print('Creating features...')
    vocab = get_vocabulary(training_dataframe)
    X_train, y_train = vectorize(featurize(training_dataframe, vocab))
    X_test, y_test = vectorize(featurize(test_dataframe, vocab))
    
    print('Training model...')
    model = LogisticRegression(multi_class='multinomial',penalty='l2', solver='lbfgs', max_iter=300, verbose=1)
    model = train_model(X_train, y_train, model)
    
    accuracy, report = test_model(X_test, y_test, model)
    print(report)
    

## Run it

In [43]:
%%time
print('Gold Standard Counts:')
gold_dataframe = read_csv('gold_standard.csv')
count_tag_percent(gold_dataframe)

training_dataframe = read_csv('training.csv')
# devtest_dataframe = read_csv('dev.csv')
test_dataframe = read_csv('test.csv')

# print('Dev Set Counts:')
# count_tag_percent(devtest_dataframe)
print('Test Counts:')
count_tag_percent(test_dataframe)

print('\n')
classify(training_dataframe, test_dataframe)


Gold Standard Counts:
ASSERTION         552
FACT-TESTIMONY    179
NON-ARGUMENT       33
OTHER-FORM         24
RH-QUESTION        21
ANECDOTE           20
Name: Gold, dtype: int64
Test Counts:
ASSERTION         67
FACT-TESTIMONY    15
RH-QUESTION        3
NON-ARGUMENT       2
Name: Gold, dtype: int64


Creating features...
Training model...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


                precision    recall  f1-score   support

     ASSERTION       0.97      0.92      0.94        71
FACT-TESTIMONY       0.73      0.85      0.79        13
  NON-ARGUMENT       1.00      1.00      1.00         2
   RH-QUESTION       0.33      1.00      0.50         1

     micro avg       0.91      0.91      0.91        87
     macro avg       0.76      0.94      0.81        87
  weighted avg       0.93      0.91      0.91        87

Wall time: 32.3 s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s finished
