# UChicagoCompLx at the 2018 SMM4H Shared Task

importing libraries

In [None]:
%load_ext autoreload
%autoreload 2

import pandas
import numpy
numpy.random.seed(1)

import keras
import nn_modules
import utilities

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, precision_score, recall_score

import os
import datetime

setting parameters

In [None]:
# Run configuration shared by every cell below.
#
# TASK: shared-task identifier as a string; it selects the data directory,
#       the label handling ('2' is treated as the 3-class task) and the
#       output paths.
TASK = '1'
# String prefix for the data files; the reading cell appends 'train.tsv',
# 'val.tsv' and 'test.tsv' directly to it.
# NOTE(review): there is no trailing separator, so files resolve to
# '<dir>/task{TASK}train.tsv' etc. -- confirm this matches the file names on disk.
path_to_files = f'../smm4h_shared_task{TASK}/task{TASK}'
# CV: when True, train and val are merged and stratified k-fold
#     cross-validation is run instead of a single train/val fit.
CV = False
# FINAL: read by the preprocessing cell (validation preprocessing is skipped
#        when CV or FINAL is set) and by the submission cell (adds '_final'
#        to the output file name). It was previously never defined, which
#        raised NameError as soon as those cells ran.
FINAL = False

## Preparing datasets

### Reading datasets

In [None]:
def _read_split(split_name):
    """Read the tab-separated file for one split from the `path_to_files` prefix."""
    return pandas.read_csv(
        filepath_or_buffer=f'{path_to_files}{split_name}.tsv',
        sep='\t',
        engine='c')


train = _read_split('train')
val = _read_split('val')

# In cross-validation mode the validation rows join the training pool,
# and the merged frame is shuffled.
if CV:
    train = pandas.concat([train, val]).sample(frac=1)

print(f'Training set examples: {train.shape[0]}')
print(
    'Training for submission/cross-validation; will not use withheld validation set...'
    if CV
    else f'Validation examples: {val.shape[0]}')

test = _read_split('test')
print(f'Test set examples: {test.shape[0]}')

# for TASK 2, shift labels by -1
if TASK == '2':
    for labelled_frame in (train, val):
        labelled_frame['target'] = labelled_frame['target'] - 1

### Preparing text

preprocessing

In [None]:
# Clean the raw tweet text in place with the project's preprocessor.
# Train and test are always processed; the validation frame is processed
# only when neither CV nor FINAL is set.
# NOTE: FINAL must be defined before this cell runs.
train['tweet'] = train['tweet'].apply(utilities.text_preprocessor)
test['tweet'] = test['tweet'].apply(utilities.text_preprocessor)
if not CV and not FINAL:
    val['tweet'] = val['tweet'].apply(utilities.text_preprocessor)

vectorizing

In [None]:
# max_words_per_doc is passed as `maxlen` when padding sequences;
# max_words is passed as `num_words` to the tokenizer.
max_words_per_doc = 50
max_words = 7000

train_strings = train.tweet
val_strings = val.tweet
test_strings = test.tweet

# The tokenizer is fitted on the texts of all three splits.
corpus = numpy.concatenate((train_strings, val_strings, test_strings), axis=0)

tokenizer = keras.preprocessing.text.Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(corpus)


def _to_padded_sequences(texts):
    """Turn texts into token-id sequences and pad them with maxlen=max_words_per_doc."""
    token_ids = tokenizer.texts_to_sequences(texts)
    return keras.preprocessing.sequence.pad_sequences(
        token_ids,
        maxlen=max_words_per_doc)


x_train = _to_padded_sequences(train_strings)
x_val = _to_padded_sequences(val_strings)
x_test = _to_padded_sequences(test_strings)

loading pre-trained word embeddings

In [None]:
# Embedding dimensionality; it selects the embeddings file and is passed
# to the loader as `embedding_dim`.
dim = 100
embeddings_file = f'../embeddings/datastories.twitter.{dim}d.txt'

# The loader receives the fitted tokenizer's word index and the same
# `max_words` value that was given to the tokenizer.
embedding_matrix = utilities.load_embeddings(
    word_index=tokenizer.word_index,
    max_words=max_words,
    embedding_dim=dim,
    embeddings_file=embeddings_file)

### Preparing labels

In [None]:
# Pull the label arrays out of the train and val frames. Class weights are
# computed from the training labels before any one-hot encoding.
train_labels, val_labels = train.target.values, val.target.values
class_weights = utilities.get_class_weights(train_labels)

if TASK == '2':
    # TASK 2 is trained with categorical cross-entropy, so encode both label arrays.
    train_labels, val_labels = (
        keras.utils.to_categorical(labels)
        for labels in (train_labels, val_labels))
elif TASK == '4':
    test_labels = test.target

## Training 

### Designing & compiling the model

In [None]:
def lstm_with_context_att():
    """Build and compile the classification model used in this notebook.

    The network is assembled from `nn_modules` helpers: an embedding stage
    fed with the pre-loaded `embedding_matrix`, an RNN encoder with
    attention, and a softmax classifier head. Reads the module-level
    `max_words_per_doc`, `embedding_matrix` and `TASK`.

    Returns:
        A compiled `keras.models.Model`. For TASK '2' it has 3 output
        classes and categorical cross-entropy loss; otherwise 1 output
        and binary cross-entropy loss.
    """
    is_multiclass = TASK == '2'

    input_text = keras.layers.Input(shape=(max_words_per_doc,), dtype='int32')

    # embedding
    emb_text = nn_modules.embedding(
        input_text=input_text,
        max_sequence_length=max_words_per_doc,
        embedding_matrix=embedding_matrix,
        gaussian_noise=0.3,
        embedding_do=0.3)

    # encoding
    representation = nn_modules.rnn_encoders_with_attention(
        nb_cells=150,
        embeddings=emb_text,
        rnn_layers=2,
        linear_do=0.3,
        recurrent_do=0.3,
        attention_do=0.5)

    # prediction
    output_probs = nn_modules.softmax_classifier(
        representation,
        nb_classes=3 if is_multiclass else 1,
        l2_dense=0.0001)

    # instantiate and compile
    model = keras.models.Model(inputs=input_text, outputs=output_probs)
    if is_multiclass:
        loss_name = 'categorical_crossentropy'
    else:
        loss_name = 'binary_crossentropy'
    model.compile(
        optimizer=keras.optimizers.Adam(lr=0.001, clipnorm=1),
        loss=loss_name,
        metrics=['acc'])

    return model

### Fitting model

In [None]:
FOLDS = 5
min_delta = 0.001
patience_epochs = 5


def _new_early_stopper():
    """Return a fresh EarlyStopping callback that monitors validation accuracy."""
    return keras.callbacks.EarlyStopping(
        monitor='val_acc',
        min_delta=min_delta,
        patience=patience_epochs,
        verbose=0,
        mode='max')


def _predict_labels(fitted_model, inputs):
    """Convert model outputs into the label space used for scoring.

    TASK '2': arg-max over the class axis, shifted to 1-based class ids.
    Other tasks: the single output value rounded to the nearest integer.
    """
    outputs = fitted_model.predict(inputs)
    if TASK == '2':
        return numpy.argmax(outputs, axis=1) + 1
    return [round(row[0]) for row in outputs]


def _evaluate(y_true, y_pred):
    """Return (f1, precision, recall) computed with the same scoring settings."""
    shared = dict(
        y_true=y_true,
        y_pred=y_pred,
        average='micro' if TASK == '2' else 'binary',
        labels=[1, 2])
    return f1_score(**shared), precision_score(**shared), recall_score(**shared)


# TASK 2 labels were shifted by -1 for training; scoring uses 1-based ids.
label_offset = 1 if TASK == '2' else 0

if CV:

    # For TASK 2 `train_labels` is already one-hot encoded (2-D). Stratification
    # and scoring both need 1-D integer class ids, so recover them here rather
    # than feeding the indicator matrix to StratifiedKFold / the metrics.
    if numpy.ndim(train_labels) > 1:
        fold_targets = numpy.argmax(train_labels, axis=1)
    else:
        fold_targets = train_labels

    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=1)

    f1_folds = []
    precision_folds = []
    recall_folds = []

    for i, (train_idx, val_idx) in enumerate(skf.split(x_train, fold_targets)):

        earlystopper = _new_early_stopper()
        callbacks = [earlystopper]
        model = lstm_with_context_att()

        print(f'=====Training model in fold: {i+1}=====')
        history = model.fit(
            x=x_train[train_idx],
            y=train_labels[train_idx],
            validation_data=(x_train[val_idx], train_labels[val_idx]),
            epochs=50,
            batch_size=50,
            callbacks=callbacks,
            class_weight=class_weights)

        preds_val = _predict_labels(model, x_train[val_idx])
        f1, precision, recall = _evaluate(
            fold_targets[val_idx] + label_offset,
            preds_val)

        f1_folds.append(f1)
        precision_folds.append(precision)
        recall_folds.append(recall)

        # Release the fold's graph before the next model is built.
        keras.backend.clear_session()
        del model

    print(f'average precision_score: {numpy.mean(precision_folds)}')
    print(f'average recall_score: {numpy.mean(recall_folds)}')
    print(f'average f1_score: {numpy.mean(f1_folds)}')

else:

    earlystopper = _new_early_stopper()
    callbacks = [earlystopper]

    model = lstm_with_context_att()
    history = model.fit(
        x=x_train,
        y=train_labels,
        validation_data=(x_val, val_labels),
        epochs=50,
        batch_size=50,
        callbacks=callbacks,
        class_weight=class_weights)

    preds_val = _predict_labels(model, x_val)
    # `val.target` still holds integer class ids (only `val_labels` is one-hot).
    f1, precision, recall = _evaluate(val.target + label_offset, preds_val)
    print(f'f1_score: {f1}')
    print(f'precision_score: {precision}')
    print(f'recall_score: {recall}')

saving model and predictions

In [None]:
# Set SUBMISSION to True to persist the fitted model and write test-set
# predictions. This only runs outside cross-validation mode.
# NOTE: FINAL must be defined before this cell runs with SUBMISSION enabled.
SUBMISSION = False
if SUBMISSION and not CV:
    # makedirs creates the intermediate '../models/' directory as needed.
    os.makedirs(f'../models/task{TASK}', exist_ok=True)
    model.save(f'../models/task{TASK}/lstm_with_context_attn.h5')

    save_to_path = f'../submission/task{TASK}/'
    os.makedirs(save_to_path, exist_ok=True)

    test_outputs = model.predict(x_test)
    if TASK == '2':
        # arg-max over classes, shifted back to 1-based labels
        preds = numpy.argmax(test_outputs, axis=1) + 1
    else:
        preds = [int(round(row[0])) for row in test_outputs]

    # The timestamp is computed here but is not part of the file name below.
    timestamp = datetime.datetime.now().isoformat('_', timespec='minutes')
    final_suffix = '_final' if FINAL else ''
    fname = f'task{TASK}_' + 'lstm_with_context_attn' + final_suffix + '.tsv'

    submission = pandas.DataFrame({'tweet_id': test.tweet_id.values, 'target': preds})
    submission['target'] = submission['target'].astype(int)
    # Written as a header-less, tab-separated file: tweet_id then target.
    submission.to_csv(
        path_or_buf=save_to_path + fname,
        sep='\t',
        columns=['tweet_id', 'target'],
        header=None,
        index=False)