## Vanilla Sequence Tweet Classifier

- `data/emb_results1.csv` -> 0.42966

In [1]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import SequenceTweetDataset
from classifiers import TweetEmbeddingClassifier



<IPython.core.display.Javascript object>

In [2]:
args = Namespace(
    # Data and Path hyper parameters
    tweets_csv="data/train_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="models/embeddings/",
    # Model hyper parameters
    glove_filepath="../../data/glove.6B.100d.txt",
    use_glove=False,
    embedding_size=100,
    hidden_dim=100,
    num_channels=100,
    # Training hyper parameter
    seed=1337,
    learning_rate=0.001,
    dropout_p=0.1,
    batch_size=128,
    num_epochs=100,
    early_stopping_criteria=5,
    # Runtime option
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)

    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))

# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
print("Using CUDA: {}".format(args.cuda))

# Set seed for reproducibility
utils.set_seed_everywhere(args.seed, args.cuda)

# handle dirs
utils.handle_dirs(args.save_dir)

Expanded filepaths: 
	models/embeddings/document_classification/vectorizer.json
	models/embeddings/document_classification/model.pth
Using CUDA: False


<IPython.core.display.Javascript object>

In [3]:
args.use_glove = True

if args.reload_from_files:
    # training from a checkpoint
    dataset = SequenceTweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file
    )
else:
    # create dataset and vectorizer
    dataset = SequenceTweetDataset.load_dataset_and_make_vectorizer(args.tweets_csv)
    dataset.save_vectorizer(args.vectorizer_file)
vectorizer = dataset.get_vectorizer()

# Use GloVe or randomly initialized embeddings
if args.use_glove:
    words = vectorizer.tweet_vocab._token_to_idx.keys()
    embeddings = utils.make_embedding_matrix(
        glove_filepath=args.glove_filepath, words=words
    )
    print("Using pre-trained embeddings")
else:
    print("Not using pre-trained embeddings")
    embeddings = None

classifier = TweetEmbeddingClassifier(
    embedding_size=args.embedding_size,
    num_embeddings=len(vectorizer.tweet_vocab),
    num_channels=args.num_channels,
    hidden_dim=args.hidden_dim,
    output_dim=1,
    dropout_p=args.dropout_p,
    pretrained_embeddings=embeddings,
    padding_idx=0,
)
print(classifier)

Using pre-trained embeddings
TweetEmbeddingClassifier(
  (emb): Embedding(3111, 100, padding_idx=0)
  (convnet): Sequential(
    (0): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
    (4): Conv1d(100, 100, kernel_size=(3,), stride=(2,))
    (5): ELU(alpha=1.0)
    (6): Conv1d(100, 100, kernel_size=(3,), stride=(1,))
    (7): ELU(alpha=1.0)
  )
  (fc1): Linear(in_features=100, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=1, bias=True)
)


  torch.nn.init.xavier_uniform(embedding_i)


<IPython.core.display.Javascript object>

In [4]:
classifier = classifier.to(args.device)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)
train_state = utils.train_model(
    classifier=classifier,
    loss_func=loss_func,
    optimizer=optimizer,
    scheduler=scheduler,
    dataset=dataset,
    args=args,
)
train_state = utils.evaluate_test_split(
    classifier=classifier,
    dataset=dataset,
    loss_func=loss_func,
    args=args,
    train_state=train_state,
)

Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.6367046709467724, Training Accuracy=61.985518292682926
Validation Loss=0.4669395312666893, Validation Accuracy=79.296875.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.1959298821847613, Training Accuracy=92.98780487804879
Validation Loss=0.6810704581439495, Validation Accuracy=77.1484375.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.17105375866337516, Training Accuracy=94.13109756097562
Validation Loss=0.735496610403061, Validation Accuracy=75.9765625.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.17115213322203335, Training Accuracy=94.16920731707317
Validation Loss=0.7313713729381562, Validation Accuracy=76.171875.
------------------------------------------------------------
-------------

<IPython.core.display.Javascript object>

In [13]:
def predict_class(classifier, vectorizer, tweet, max_length, decision_threshold=0.0):
    vectorized_tweet = torch.tensor(
        vectorizer.vectorize(tweet, vector_length=max_length)
    )
    result = classifier(vectorized_tweet.unsqueeze(0))
    probability_value = F.sigmoid(result).item()
    predicted_index = 1 if probability_value >= decision_threshold else 0
    return vectorizer.target_vocab.lookup_index(predicted_index)

<IPython.core.display.Javascript object>

In [14]:
test_dataset = pd.read_csv("data/test.csv")
results = []
for id, _, _, tweet in test_dataset.values:
    prediction = predict_class(
        classifier, dataset.get_vectorizer(), tweet, dataset._max_seq_length + 1
    )
    results.append([id, prediction])
submission_df6 = pd.DataFrame(results, columns=["id", "target"])
submission_df6.to_csv("data/emb_results1.csv", index=False)



<IPython.core.display.Javascript object>