## Vanilla RNN-Based Tweet Classifier

- `data/simple_rnn1_results1.csv` -> 0.57033

In [1]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import SequenceTweetDataset
from classifiers import TweetSimpleRNNClassifier



<IPython.core.display.Javascript object>

In [2]:
# Experiment configuration: every tunable value for this run lives on `args`.
# NOTE(review): the later experiments in this notebook use the same
# `save_dir` and file names, so they presumably overwrite whatever this run
# writes there -- confirm against utils.train_model.
args = Namespace(
    # --- data and output locations ---
    tweets_csv="data/train_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="models/simple_rnn/",
    # --- model size ---
    embedding_size=100,
    rnn_hidden_size=64,
    # --- optimisation ---
    seed=1337,
    learning_rate=0.001,
    dropout_p=0.1,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    # --- runtime switches ---
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    # Prefix both artifact file names with the save directory.
    args.vectorizer_file, args.model_state_file = (
        os.path.join(args.save_dir, file_name)
        for file_name in (args.vectorizer_file, args.model_state_file)
    )
    print(
        "Expanded filepaths: ",
        f"\t{args.vectorizer_file}",
        f"\t{args.model_state_file}",
        sep="\n",
    )

# Fall back to the CPU when no CUDA device is available.
args.cuda = args.cuda and torch.cuda.is_available()

if args.cuda:
    args.device = torch.device("cuda")
else:
    args.device = torch.device("cpu")
print(f"Using CUDA: {args.cuda}")

# Seed the random number generators so the run is repeatable.
utils.set_seed_everywhere(args.seed, args.cuda)

# Prepare the output directory (helper lives in utils).
utils.handle_dirs(args.save_dir)

Expanded filepaths: 
	models/simple_rnn/vectorizer.json
	models/simple_rnn/model.pth
Using CUDA: False


<IPython.core.display.Javascript object>

In [3]:
# Build the dataset from the training CSV, persist its vectorizer to disk and
# keep a handle on it for sizing the model.
dataset = SequenceTweetDataset.load_dataset_and_make_vectorizer(args.tweets_csv)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# One output unit: the model emits a single raw logit per tweet, which is what
# BCEWithLogitsLoss consumes. Embeddings are learned from scratch in this run.
classifier = TweetSimpleRNNClassifier(
    embedding_size=args.embedding_size,
    num_embeddings=len(vectorizer.tweet_vocab),
    output_dim=1,
    rnn_hidden_size=args.rnn_hidden_size,
    padding_idx=vectorizer.tweet_vocab.mask_index,
)
print(classifier)
# Fix: the result used to be bound to the misspelled name `classifer`, which
# only worked because `Module.to` moves parameters in place.
classifier = classifier.to(args.device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate once the monitored quantity has stopped decreasing
# for more than one epoch (the quantity itself is chosen by the training loop
# in utils -- presumably the validation loss; confirm there).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)

TweetSimpleRNNClassifier(
  (emb): Embedding(3111, 100, padding_idx=0)
  (rnn): ElmanRNN(
    (rnn_cell): RNNCell(100, 64)
  )
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


<IPython.core.display.Javascript object>

In [4]:
# Keyword arguments that the training call and the evaluation call share.
shared_kwargs = dict(
    classifier=classifier,
    loss_func=loss_func,
    dataset=dataset,
    args=args,
)

# Run the training routine from utils; it returns the training state.
train_state = utils.train_model(
    optimizer=optimizer, scheduler=scheduler, **shared_kwargs
)

# Evaluate on the test split, threading the training state through so the
# helper can return an updated copy of it.
train_state = utils.evaluate_test_split(train_state=train_state, **shared_kwargs)

Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.6882142002989605, Training Accuracy=55.64024390243902
Validation Loss=0.6843034774065018, Validation Accuracy=57.51953125.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.6846244989371886, Training Accuracy=56.97408536585366
Validation Loss=0.6844219416379929, Validation Accuracy=56.93359375.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.68484344424271, Training Accuracy=56.99314024390243
Validation Loss=0.6834568679332733, Validation Accuracy=57.51953125.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.6843078936018597, Training Accuracy=56.99314024390243
Validation Loss=0.6858321875333786, Validation Accuracy=56.83593750000001.
------------------------------------------------------------
-----

<IPython.core.display.Javascript object>

In [5]:
def predict_class(classifier, vectorizer, tweet, max_length, decision_threshold=0.5):
    """Predict the target label for a single tweet.

    Args:
        classifier: Callable model that maps a batch of vectorized tweets to
            one logit per tweet. When it is a ``torch.nn.Module`` it is run in
            eval mode and its previous training mode is restored afterwards.
        vectorizer: Object providing ``vectorize(tweet, vector_length=...)``
            and ``target_vocab.lookup_index(index)``.
        tweet: Raw tweet text to classify.
        max_length: Length passed to the vectorizer as ``vector_length``.
        decision_threshold: Probability at or above which class index 1 is
            predicted; index 0 is predicted otherwise.

    Returns:
        Whatever ``vectorizer.target_vocab.lookup_index`` returns for the
        predicted class index.
    """
    vectorized_tweet = torch.tensor(
        vectorizer.vectorize(tweet, vector_length=max_length)
    )
    # The model is fed a batch, so wrap the single tweet in a batch of one.
    batch = vectorized_tweet.unsqueeze(0)

    is_module = isinstance(classifier, torch.nn.Module)
    was_training = False
    if is_module:
        # Put the input on the same device as the model's parameters so the
        # function also works when the model lives on the GPU.
        first_param = next(classifier.parameters(), None)
        if first_param is not None:
            batch = batch.to(first_param.device)
        # Inference must not use training-time behaviour such as dropout.
        was_training = classifier.training
        classifier.eval()

    try:
        # No gradients are needed for prediction; skip graph construction.
        with torch.no_grad():
            result = classifier(batch)
    finally:
        # Leave the model in whatever mode the caller had it in.
        if is_module and was_training:
            classifier.train()

    # torch.sigmoid computes the same value as the F.sigmoid call it replaces.
    probability_value = torch.sigmoid(result).item()
    predicted_index = 1 if probability_value >= decision_threshold else 0
    return vectorizer.target_vocab.lookup_index(predicted_index)

<IPython.core.display.Javascript object>

In [6]:
# Predict a label for every row of the test CSV and write the submission file.
test_dataset = pd.read_csv("data/test.csv")

# Loop-invariant inputs, fetched once instead of once per row.
test_vectorizer = dataset.get_vectorizer()
# NOTE(review): the `+ 1` is kept from the original code; confirm it matches
# the sequence length the dataset used during training.
test_max_length = dataset._max_seq_length + 1

# Rows are unpacked positionally: the first column is the id and the fourth is
# the tweet text; the two middle columns are ignored. A comprehension keeps
# the loop variables local, so the builtin `id` and IPython's `_` are no longer
# rebound at notebook scope.
results = [
    [tweet_id, predict_class(classifier, test_vectorizer, tweet, test_max_length)]
    for tweet_id, _, _, tweet in test_dataset.values
]
submission_df = pd.DataFrame(results, columns=["id", "target"])
submission_df.to_csv("data/simple_rnn1_results1.csv", index=False)



<IPython.core.display.Javascript object>

## RNN Classifier with Pretrained Embeddings

- `data/simple_rnn1_results2.csv` -> 0.57033

In [7]:
# Experiment configuration for the run that uses pretrained GloVe embeddings.
# NOTE(review): `save_dir` and the file names match the other experiments in
# this notebook, so the runs presumably overwrite each other's files --
# confirm against utils.train_model.
args = Namespace(
    # --- data and output locations ---
    tweets_csv="data/train_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="models/simple_rnn/",
    # --- model size and embedding source ---
    glove_filepath="../../data/glove.6B.100d.txt",
    use_glove=True,
    embedding_size=100,
    rnn_hidden_size=64,
    # --- optimisation ---
    seed=1337,
    learning_rate=0.001,
    dropout_p=0.1,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    # --- runtime switches ---
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    # Prefix both artifact file names with the save directory.
    args.vectorizer_file, args.model_state_file = (
        os.path.join(args.save_dir, file_name)
        for file_name in (args.vectorizer_file, args.model_state_file)
    )
    print(
        "Expanded filepaths: ",
        f"\t{args.vectorizer_file}",
        f"\t{args.model_state_file}",
        sep="\n",
    )

# Fall back to the CPU when no CUDA device is available.
args.cuda = args.cuda and torch.cuda.is_available()

if args.cuda:
    args.device = torch.device("cuda")
else:
    args.device = torch.device("cpu")
print(f"Using CUDA: {args.cuda}")

# Seed the random number generators so the run is repeatable.
utils.set_seed_everywhere(args.seed, args.cuda)

# Prepare the output directory (helper lives in utils).
utils.handle_dirs(args.save_dir)

Expanded filepaths: 
	models/simple_rnn/vectorizer.json
	models/simple_rnn/model.pth
Using CUDA: False


<IPython.core.display.Javascript object>

In [8]:
# Build the dataset from the training CSV, persist its vectorizer to disk and
# keep a handle on it for sizing the model.
dataset = SequenceTweetDataset.load_dataset_and_make_vectorizer(args.tweets_csv)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

if args.use_glove:
    # Build an embedding matrix for the vocabulary's tokens from the GloVe
    # file. NOTE(review): this reads the vocabulary's private `_token_to_idx`
    # mapping; row order presumably follows its key order -- confirm in utils.
    words = vectorizer.tweet_vocab._token_to_idx.keys()
    embeddings = utils.make_embedding_matrix(
        glove_filepath=args.glove_filepath, words=words
    )
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

# One output unit: the model emits a single raw logit per tweet, which is what
# BCEWithLogitsLoss consumes.
classifier = TweetSimpleRNNClassifier(
    embedding_size=args.embedding_size,
    num_embeddings=len(vectorizer.tweet_vocab),
    output_dim=1,
    rnn_hidden_size=args.rnn_hidden_size,
    padding_idx=vectorizer.tweet_vocab.mask_index,
    pretrained_embeddings=embeddings,
)
print(classifier)
# Fix: the result used to be bound to the misspelled name `classifer`, which
# only worked because `Module.to` moves parameters in place.
classifier = classifier.to(args.device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate once the monitored quantity has stopped decreasing
# for more than one epoch (the quantity itself is chosen by the training loop
# in utils -- presumably the validation loss; confirm there).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)

Using pre-trained embeddings
TweetSimpleRNNClassifier(
  (emb): Embedding(3111, 100, padding_idx=0)
  (rnn): ElmanRNN(
    (rnn_cell): RNNCell(100, 64)
  )
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=1, bias=True)
)


  torch.nn.init.xavier_uniform(embedding_i)


<IPython.core.display.Javascript object>

In [9]:
# Keyword arguments that the training call and the evaluation call share.
shared_kwargs = dict(
    classifier=classifier,
    loss_func=loss_func,
    dataset=dataset,
    args=args,
)

# Run the training routine from utils; it returns the training state.
train_state = utils.train_model(
    optimizer=optimizer, scheduler=scheduler, **shared_kwargs
)

# Evaluate on the test split, threading the training state through so the
# helper can return an updated copy of it.
train_state = utils.evaluate_test_split(train_state=train_state, **shared_kwargs)

Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.6868613347774598, Training Accuracy=55.94512195121953
Validation Loss=0.684223160147667, Validation Accuracy=56.73828125.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.6841114933897811, Training Accuracy=57.05030487804878
Validation Loss=0.6875626444816589, Validation Accuracy=57.32421875000001.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.6846795401922088, Training Accuracy=56.9169207317073
Validation Loss=0.6852826178073883, Validation Accuracy=56.4453125.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.684196668427165, Training Accuracy=57.01219512195122
Validation Loss=0.6853861138224602, Validation Accuracy=56.93359374999999.
------------------------------------------------------------
-

<IPython.core.display.Javascript object>

In [10]:
# Predict a label for every row of the test CSV and write the submission file.
test_dataset = pd.read_csv("data/test.csv")

# Loop-invariant inputs, fetched once instead of once per row.
test_vectorizer = dataset.get_vectorizer()
# NOTE(review): the `+ 1` is kept from the original code; confirm it matches
# the sequence length the dataset used during training.
test_max_length = dataset._max_seq_length + 1

# Rows are unpacked positionally: the first column is the id and the fourth is
# the tweet text; the two middle columns are ignored. A comprehension keeps
# the loop variables local, so the builtin `id` and IPython's `_` are no longer
# rebound at notebook scope.
results = [
    [tweet_id, predict_class(classifier, test_vectorizer, tweet, test_max_length)]
    for tweet_id, _, _, tweet in test_dataset.values
]
submission_df = pd.DataFrame(results, columns=["id", "target"])
submission_df.to_csv("data/simple_rnn1_results2.csv", index=False)

<IPython.core.display.Javascript object>

## RNN Classifier with Pretrained Embeddings and a 128-Unit Hidden State

In [12]:
# Experiment configuration: pretrained GloVe embeddings with a hidden state of
# 128 units.
# NOTE(review): `save_dir` and the file names match the other experiments in
# this notebook, so the runs presumably overwrite each other's files --
# confirm against utils.train_model.
args = Namespace(
    # --- data and output locations ---
    tweets_csv="data/train_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="models/simple_rnn/",
    # --- model size and embedding source ---
    glove_filepath="../../data/glove.6B.100d.txt",
    use_glove=True,
    embedding_size=100,
    rnn_hidden_size=128,
    # --- optimisation ---
    seed=1337,
    learning_rate=0.001,
    dropout_p=0.1,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    # --- runtime switches ---
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    # Prefix both artifact file names with the save directory.
    args.vectorizer_file, args.model_state_file = (
        os.path.join(args.save_dir, file_name)
        for file_name in (args.vectorizer_file, args.model_state_file)
    )
    print(
        "Expanded filepaths: ",
        f"\t{args.vectorizer_file}",
        f"\t{args.model_state_file}",
        sep="\n",
    )

# Fall back to the CPU when no CUDA device is available.
args.cuda = args.cuda and torch.cuda.is_available()

if args.cuda:
    args.device = torch.device("cuda")
else:
    args.device = torch.device("cpu")
print(f"Using CUDA: {args.cuda}")

# Seed the random number generators so the run is repeatable.
utils.set_seed_everywhere(args.seed, args.cuda)

# Prepare the output directory (helper lives in utils).
utils.handle_dirs(args.save_dir)

# Build the dataset from the training CSV, persist its vectorizer to disk and
# keep a handle on it for sizing the model.
dataset = SequenceTweetDataset.load_dataset_and_make_vectorizer(args.tweets_csv)
dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

if args.use_glove:
    # Build an embedding matrix for the vocabulary's tokens from the GloVe
    # file. NOTE(review): this reads the vocabulary's private `_token_to_idx`
    # mapping; row order presumably follows its key order -- confirm in utils.
    words = vectorizer.tweet_vocab._token_to_idx.keys()
    embeddings = utils.make_embedding_matrix(
        glove_filepath=args.glove_filepath, words=words
    )
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

# One output unit: the model emits a single raw logit per tweet, which is what
# BCEWithLogitsLoss consumes.
classifier = TweetSimpleRNNClassifier(
    embedding_size=args.embedding_size,
    num_embeddings=len(vectorizer.tweet_vocab),
    output_dim=1,
    rnn_hidden_size=args.rnn_hidden_size,
    padding_idx=vectorizer.tweet_vocab.mask_index,
    pretrained_embeddings=embeddings,
)
print(classifier)
# Fix: the result used to be bound to the misspelled name `classifer`, which
# only worked because `Module.to` moves parameters in place.
classifier = classifier.to(args.device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate once the monitored quantity has stopped decreasing
# for more than one epoch (the quantity itself is chosen by the training loop
# in utils -- presumably the validation loss; confirm there).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)

# Keyword arguments that the training call and the evaluation call share.
shared_kwargs = dict(
    classifier=classifier,
    loss_func=loss_func,
    dataset=dataset,
    args=args,
)

# Run the training routine from utils; it returns the training state.
train_state = utils.train_model(
    optimizer=optimizer, scheduler=scheduler, **shared_kwargs
)

# Evaluate on the test split, threading the training state through so the
# helper can return an updated copy of it.
train_state = utils.evaluate_test_split(train_state=train_state, **shared_kwargs)

Expanded filepaths: 
	models/simple_rnn/vectorizer.json
	models/simple_rnn/model.pth
Using CUDA: False
Using pre-trained embeddings
TweetSimpleRNNClassifier(
  (emb): Embedding(3111, 100, padding_idx=0)
  (rnn): ElmanRNN(
    (rnn_cell): RNNCell(100, 128)
  )
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)


Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.6868588691804466, Training Accuracy=56.15472560975609
Validation Loss=0.6846114024519919, Validation Accuracy=57.32421875.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.684220289311758, Training Accuracy=56.97408536585365
Validation Loss=0.6833800822496414, Validation Accuracy=57.32421875.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.6830715551608947, Training Accuracy=57.10746951219512
Validation Loss=0.686154693365097, Validation Accuracy=56.8359375.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.6831843634931053, Training Accuracy=57.1455792682927
Validation Loss=0.6875371411442757, Validation Accuracy=56.34765625.
------------------------------------------------------------
-------------

<IPython.core.display.Javascript object>