## Classification using Perceptron

In [1]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import TweetDataset
from classifiers import TweetPerceptronClassifier

# Run configuration for the perceptron experiment. The two file names are
# bare names here and are joined onto save_dir further down.
args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/perceptron",
    vectorizer_file="vectorizer.json",
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

if args.expand_filepaths_to_save_dir:
    # Prefix both artifact file names with the run's save directory.
    args.vectorizer_file, args.model_state_file = (
        os.path.join(args.save_dir, file_name)
        for file_name in (args.vectorizer_file, args.model_state_file)
    )
    print("Expanded filepaths: ")
    for expanded_path in (args.vectorizer_file, args.model_state_file):
        print(f"\t{expanded_path}")

# CUDA stays enabled only when it was requested AND is actually available.
args.cuda = args.cuda and torch.cuda.is_available()
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

# Seeding and directory handling are delegated to the project's utils module.
utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

Expanded filepaths: 
	models/perceptron/vectorizer.json
	models/perceptron/model.pth
Using Cuda: False




<IPython.core.display.Javascript object>

In [2]:
# Build the dataset (with its vectorizer), then the perceptron and the
# objects needed to optimise it.
if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file
    )
else:
    print("Loading dataset & Creating vectorizer")
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Bind the vectorizer on both paths so the classifier below can always be built.
vectorizer = dataset.get_vectorizer()

# One input feature per entry of the tweet vocabulary.
classifier = TweetPerceptronClassifier(num_features=len(vectorizer.tweet_vocab))
print(classifier)
# BCEWithLogitsLoss applies the sigmoid internally, so it expects raw logits.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the metric given to scheduler.step() stops
# decreasing (mode="min", patience=1).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)

Loading dataset & Creating vectorizer
TweetPerceptronClassifier(
  (fc1): Linear(in_features=3108, out_features=1, bias=True)
)


<IPython.core.display.Javascript object>

In [3]:
# Train the classifier through the project's utils.train_model helper, then
# pass the returned train_state to utils.evaluate_test_split. The name
# train_state is rebound to whatever the second call returns.
train_state = utils.train_model(
    classifier, loss_func, optimizer, scheduler, dataset, args
)
train_state = utils.evaluate_test_split(
    classifier, dataset, loss_func, train_state, args
)

Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.681884057638122, Training Accuracy=66.88262195121949
Validation Loss=0.6708768680691719, Validation Accuracy=71.19140625.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.520560126479079, Training Accuracy=81.85975609756099
Validation Loss=0.5529165044426918, Validation Accuracy=78.515625.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.447150328537313, Training Accuracy=84.71798780487804
Validation Loss=0.5127294063568115, Validation Accuracy=79.58984374999999.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.4019845971247045, Training Accuracy=85.78506097560975
Validation Loss=0.48284977674484253, Validation Accuracy=80.17578125.
------------------------------------------------------------
-------

<IPython.core.display.Javascript object>

In [4]:
# Repeat of the `import utils` already done in the first cell.
import utils

# Spot-check the trained classifier on a single hand-written tweet; the
# value returned by utils.predict_class is shown as the cell output.
tweet = "The Campaign: Will Ferrell and Zach Galifianakis commit comic mayhem in this hilarious political farce. 4* http://t.co/tQ3j2qGtZQ'"
utils.predict_class(classifier, dataset.get_vectorizer(), tweet)



0

<IPython.core.display.Javascript object>

### Prepare test dataset results

- Got 0.78823 for perceptron_results.csv submission

In [5]:
# Load the test tweets that the submission predictions are made for.
test_dataset = pd.read_csv("data/test.csv")

<IPython.core.display.Javascript object>

In [7]:
# Show column names, non-null counts and dtypes of the test frame.
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


<IPython.core.display.Javascript object>

In [16]:
# Predict a class for every test tweet and collect [id, prediction] pairs.
# Columns are selected by name, so the loop does not depend on column order
# and no longer shadows the builtin `id`.
# The vectorizer is the same for every tweet, so fetch it once.
vectorizer = dataset.get_vectorizer()
results = [
    [tweet_id, utils.predict_class(classifier, vectorizer, tweet)]
    for tweet_id, tweet in zip(test_dataset["id"], test_dataset["text"])
]
print(results[:10])



[[0, 1], [2, 1], [3, 1], [9, 0], [11, 1], [12, 1], [21, 0], [22, 0], [27, 0], [29, 0]]


<IPython.core.display.Javascript object>

In [18]:
# Turn the [id, prediction] pairs into a two-column frame and preview it.
submission_df = pd.DataFrame(results, columns=["id", "target"])
submission_df.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


<IPython.core.display.Javascript object>

In [19]:
# Write the submission without the DataFrame index column.
submission_df.to_csv("data/perceptron_results.csv", index=False)

<IPython.core.display.Javascript object>

### Train using full training dataset

- 0.73184
- 0.78363 (with scheduler fix)
- 0.77658 (500 epochs)

In [6]:
# Perceptron run on the full dataset (use_full_dataset=True) for 500 epochs.
args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/perceptron",
    vectorizer_file="vectorizer.json",
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=500,
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

if args.expand_filepaths_to_save_dir:
    # Keep both artifact files under save_dir.
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
    print("Expanded filepaths: ")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")


# Fall back to the CPU when CUDA was requested but is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=True
    )
else:
    print("Loading dataset & Creating vectorizer")
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=True
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Bind the vectorizer on both paths so the classifier below can always be built.
vectorizer = dataset.get_vectorizer()

# One input feature per entry of the tweet vocabulary.
classifier = TweetPerceptronClassifier(num_features=len(vectorizer.tweet_vocab))
print(classifier)
# BCEWithLogitsLoss applies the sigmoid internally, so it expects raw logits.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# NOTE(review): the recorded run of this cell logs an empty validation split
# ("split=val: 0it", Validation Loss=0.0 every epoch). If utils.train_model
# drives this scheduler or early stopping from the validation loss, neither
# receives a real signal in this configuration - confirm in utils.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)
train_state = utils.train_model(
    classifier, loss_func, optimizer, scheduler, dataset, args
)
train_state = utils.evaluate_test_split(
    classifier, dataset, loss_func, train_state, args
)

Expanded filepaths: 
	models/perceptron/vectorizer.json
	models/perceptron/model.pth
Using Cuda: False
Loading dataset & Creating vectorizer
TweetPerceptronClassifier(
  (fc1): Linear(in_features=3108, out_features=1, bias=True)
)


Training Routine:   0%|          | 0/500 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val: 0it [00:00, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.681884057638122, Training Accuracy=66.88262195121949
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.5218462318908879, Training Accuracy=81.87881097560977
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.44711853236686894, Training Accuracy=84.56554878048779
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.4016107960445125, Training Accuracy=85.82317073170732
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 40th Epoch Stats---------------
Training Loss=0.37020789341228766, Training Accuracy=

--------------- 370th Epoch Stats---------------
Training Loss=0.23578875457368245, Training Accuracy=91.65396341463413
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 380th Epoch Stats---------------
Training Loss=0.23535394414169034, Training Accuracy=91.67301829268295
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 390th Epoch Stats---------------
Training Loss=0.23633681419419084, Training Accuracy=91.63490853658539
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 400th Epoch Stats---------------
Training Loss=0.2357822969192412, Training Accuracy=91.71112804878052
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 410th Epoch Stats---------------
Training Loss=0.23670431353696963, Training 

<IPython.core.display.Javascript object>

In [7]:
# Predict every test tweet with the current classifier and write the
# (id, target) submission file.
test_dataset = pd.read_csv("data/test.csv")
# The vectorizer is the same for every tweet, so fetch it once.
vectorizer = dataset.get_vectorizer()
# Select columns by name: no dependence on column order, and the builtin
# `id` is no longer shadowed.
results = [
    [tweet_id, utils.predict_class(classifier, vectorizer, tweet)]
    for tweet_id, tweet in zip(test_dataset["id"], test_dataset["text"])
]
submission_df = pd.DataFrame(results, columns=["id", "target"])
submission_df.to_csv("data/perceptron_results3.csv", index=False)



<IPython.core.display.Javascript object>

In [8]:
# Reload four perceptron submission files so their predictions can be compared.
# NOTE(review): perceptron_results1.csv and perceptron_results2.csv are not
# written by any cell shown in this notebook; they must already exist on disk.
submission_df1 = pd.read_csv("data/perceptron_results.csv")
submission_df2 = pd.read_csv("data/perceptron_results1.csv")
submission_df3 = pd.read_csv("data/perceptron_results2.csv")
submission_df4 = pd.read_csv("data/perceptron_results3.csv")

<IPython.core.display.Javascript object>

In [9]:
# Shape of the first submission, followed by the shape of the rows on which
# each listed pair of submissions predicts the same target.
compared_pairs = (
    (submission_df1, submission_df2),
    (submission_df1, submission_df3),
    (submission_df2, submission_df3),
    (submission_df1, submission_df4),
)
(
    submission_df1.shape,
    *(
        left[left["target"] == right["target"]].shape
        for left, right in compared_pairs
    ),
)

((3263, 2), (2795, 2), (3130, 2), (2692, 2), (3065, 2))

<IPython.core.display.Javascript object>

## Classification using MLP

In [1]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import TweetDataset
from classifiers import TweetMLPClassifier

# Run configuration for the MLP experiment. The two file names are bare
# names here and are joined onto save_dir further down.
args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/mlp",
    vectorizer_file="vectorizer.json",
    hidden_dim=300,
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

if args.expand_filepaths_to_save_dir:
    # Prefix both artifact file names with the run's save directory.
    args.vectorizer_file, args.model_state_file = (
        os.path.join(args.save_dir, file_name)
        for file_name in (args.vectorizer_file, args.model_state_file)
    )
    print("Expanded filepaths: ")
    for expanded_path in (args.vectorizer_file, args.model_state_file):
        print(f"\t{expanded_path}")

# CUDA stays enabled only when it was requested AND is actually available.
args.cuda = args.cuda and torch.cuda.is_available()
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

# Seeding and directory handling are delegated to the project's utils module.
utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

Expanded filepaths: 
	models/mlp/vectorizer.json
	models/mlp/model.pth
Using Cuda: False




<IPython.core.display.Javascript object>

In [2]:
# Build the dataset (with its vectorizer), construct the MLP, then train it
# and evaluate it on the test split.
if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file
    )
else:
    print("Loading dataset & Creating vectorizer")
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Bind the vectorizer on both paths so the classifier below can always be built.
vectorizer = dataset.get_vectorizer()

# Input width equals the vocabulary size; a single output unit.
classifier = TweetMLPClassifier(
    input_dim=len(vectorizer.tweet_vocab),
    hidden_dim=args.hidden_dim,
    output_dim=1,
)
print(classifier)
# BCEWithLogitsLoss applies the sigmoid internally, so it expects raw logits.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the metric given to scheduler.step() stops
# decreasing (mode="min", patience=1).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)
train_state = utils.train_model(
    classifier, loss_func, optimizer, scheduler, dataset, args
)
train_state = utils.evaluate_test_split(
    classifier, dataset, loss_func, train_state, args
)

Loading dataset & Creating vectorizer
TweetMLPClassifier(
  (fc1): Linear(in_features=3108, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=1, bias=True)
)


Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.6277405648696712, Training Accuracy=69.68368902439026
Validation Loss=0.5224693901836872, Validation Accuracy=79.98046875.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.16384193791848853, Training Accuracy=93.95960365853658
Validation Loss=0.6141280904412268, Validation Accuracy=76.07421875.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.158365460612425, Training Accuracy=94.28353658536585
Validation Loss=0.6175562217831612, Validation Accuracy=77.05078125.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.15800704098329313, Training Accuracy=94.32164634146342
Validation Loss=0.6482495218515396, Validation Accuracy=75.78125.
------------------------------------------------------------
-----------

<IPython.core.display.Javascript object>

### Prepare Submission CSV

- `data/mlp_results1.csv` -> 0.77045

In [3]:
# Predict every test tweet with the current classifier and write the
# (id, target) submission file.
test_dataset = pd.read_csv("data/test.csv")
# The vectorizer is the same for every tweet, so fetch it once.
vectorizer = dataset.get_vectorizer()
# Select columns by name: no dependence on column order, and the builtin
# `id` is no longer shadowed.
results = [
    [tweet_id, utils.predict_class(classifier, vectorizer, tweet)]
    for tweet_id, tweet in zip(test_dataset["id"], test_dataset["text"])
]
submission_df1 = pd.DataFrame(results, columns=["id", "target"])
submission_df1.to_csv("data/mlp_results1.csv", index=False)



<IPython.core.display.Javascript object>

In [4]:
# Preview the first rows of the MLP submission.
submission_df1.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


<IPython.core.display.Javascript object>

In [6]:
# Count how many test tweets were assigned to each predicted class.
submission_df1["target"].value_counts()

0    2018
1    1245
Name: target, dtype: int64

<IPython.core.display.Javascript object>

### Train Using Full Dataset

- `data/mlp_results2.csv` -> 0.72939
- `data/mlp_results3.csv` -> 0.74195 (Correct Full Train Dataset)
- `data/mlp_results4.csv` -> 0.74992 (Full Dataset & Validation Set)

In [1]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import TweetDataset
from classifiers import TweetMLPClassifier

# MLP run on the full dataset (use_full_dataset=True).
args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/mlp",
    vectorizer_file="vectorizer.json",
    hidden_dim=300,
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)


if args.expand_filepaths_to_save_dir:
    # Keep both artifact files under save_dir.
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
    print("Expanded filepaths: ")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")


# Fall back to the CPU when CUDA was requested but is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

# NOTE(review): in the recorded run of this cell validation accuracy reaches
# ~98% (versus ~76% in the split runs), which suggests the validation rows
# are also trained on when use_full_dataset=True. Treat the validation
# numbers from this cell as not held-out - confirm in TweetDataset.
if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=True
    )
else:
    print("Loading dataset & Creating vectorizer")
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=True
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Bind the vectorizer on both paths so the classifier below can always be built.
vectorizer = dataset.get_vectorizer()

# Input width equals the vocabulary size; a single output unit.
classifier = TweetMLPClassifier(
    input_dim=len(vectorizer.tweet_vocab),
    hidden_dim=args.hidden_dim,
    output_dim=1,
)
print(classifier)
# BCEWithLogitsLoss applies the sigmoid internally, so it expects raw logits.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)
train_state = utils.train_model(
    classifier, loss_func, optimizer, scheduler, dataset, args
)
train_state = utils.evaluate_test_split(
    classifier, dataset, loss_func, train_state, args
)



Expanded filepaths: 
	models/mlp/vectorizer.json
	models/mlp/model.pth
Using Cuda: False
Loading dataset & Creating vectorizer
TweetMLPClassifier(
  (fc1): Linear(in_features=3108, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=1, bias=True)
)


Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/59 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.5873162120075551, Training Accuracy=73.31832627118645
Validation Loss=0.41285038366913795, Validation Accuracy=84.765625.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.15083907633009602, Training Accuracy=94.27966101694918
Validation Loss=0.12037540879100561, Validation Accuracy=96.09375.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.08511996269226076, Training Accuracy=96.61016949152544
Validation Loss=0.07143650902435182, Validation Accuracy=97.75390625.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.069224481486668, Training Accuracy=97.2457627118644
Validation Loss=0.0609111855737865, Validation Accuracy=98.046875.
------------------------------------------------------------
-------------

<IPython.core.display.Javascript object>

In [2]:
# Predict every test tweet with the current classifier and write the
# (id, target) submission file.
test_dataset = pd.read_csv("data/test.csv")
# The vectorizer is the same for every tweet, so fetch it once.
vectorizer = dataset.get_vectorizer()
# Select columns by name: no dependence on column order, and the builtin
# `id` is no longer shadowed.
results = [
    [tweet_id, utils.predict_class(classifier, vectorizer, tweet)]
    for tweet_id, tweet in zip(test_dataset["id"], test_dataset["text"])
]
submission_df4 = pd.DataFrame(results, columns=["id", "target"])
submission_df4.to_csv("data/mlp_results4.csv", index=False)



<IPython.core.display.Javascript object>

### 500 Epochs with split dataset

- `data/mlp_results5.csv` -> 0.77015

In [3]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import TweetDataset
from classifiers import TweetMLPClassifier

# MLP run on the train/val/test split (use_full_dataset=False), 500 epochs.
args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/mlp",
    vectorizer_file="vectorizer.json",
    hidden_dim=300,
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=500,
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)


if args.expand_filepaths_to_save_dir:
    # Keep both artifact files under save_dir.
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
    print("Expanded filepaths: ")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")


# Fall back to the CPU when CUDA was requested but is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=False
    )
else:
    print("Loading dataset & Creating vectorizer")
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=False
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Bind the vectorizer on both paths so the classifier below can always be built.
vectorizer = dataset.get_vectorizer()

# Input width equals the vocabulary size; a single output unit.
classifier = TweetMLPClassifier(
    input_dim=len(vectorizer.tweet_vocab),
    hidden_dim=args.hidden_dim,
    output_dim=1,
)
print(classifier)
# BCEWithLogitsLoss applies the sigmoid internally, so it expects raw logits.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the metric given to scheduler.step() stops
# decreasing (mode="min", patience=1).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)
train_state = utils.train_model(
    classifier, loss_func, optimizer, scheduler, dataset, args
)
train_state = utils.evaluate_test_split(
    classifier, dataset, loss_func, train_state, args
)

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black
Expanded filepaths: 
	models/mlp/vectorizer.json
	models/mlp/model.pth
Using Cuda: False
Loading dataset & Creating vectorizer
TweetMLPClassifier(
  (fc1): Linear(in_features=3108, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=1, bias=True)
)


Training Routine:   0%|          | 0/500 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.6277405648696712, Training Accuracy=69.68368902439026
Validation Loss=0.5224693901836872, Validation Accuracy=79.98046875.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.16384193791848853, Training Accuracy=93.95960365853658
Validation Loss=0.6141280904412268, Validation Accuracy=76.07421875.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.158365460612425, Training Accuracy=94.28353658536585
Validation Loss=0.6175562217831612, Validation Accuracy=77.05078125.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.15800704098329313, Training Accuracy=94.32164634146342
Validation Loss=0.6482495218515396, Validation Accuracy=75.78125.
------------------------------------------------------------
-----------

--------------- 330th Epoch Stats---------------
Training Loss=0.15779515755612675, Training Accuracy=94.34070121951218
Validation Loss=0.6351474225521088, Validation Accuracy=75.87890625000001.
------------------------------------------------------------
--------------- 340th Epoch Stats---------------
Training Loss=0.1573032227958121, Training Accuracy=94.34070121951221
Validation Loss=0.6388144046068192, Validation Accuracy=75.9765625.
------------------------------------------------------------
--------------- 350th Epoch Stats---------------
Training Loss=0.15790210537067279, Training Accuracy=94.32164634146342
Validation Loss=0.6140845566987991, Validation Accuracy=76.953125.
------------------------------------------------------------
--------------- 360th Epoch Stats---------------
Training Loss=0.15820107227418487, Training Accuracy=94.30259146341461
Validation Loss=0.6285732388496399, Validation Accuracy=76.46484375.
-----------------------------------------------------------

<IPython.core.display.Javascript object>

In [4]:
# Predict every test tweet with the current classifier and write the
# (id, target) submission file.
test_dataset = pd.read_csv("data/test.csv")
# The vectorizer is the same for every tweet, so fetch it once.
vectorizer = dataset.get_vectorizer()
# Select columns by name: no dependence on column order, and the builtin
# `id` is no longer shadowed.
results = [
    [tweet_id, utils.predict_class(classifier, vectorizer, tweet)]
    for tweet_id, tweet in zip(test_dataset["id"], test_dataset["text"])
]
submission_df5 = pd.DataFrame(results, columns=["id", "target"])
submission_df5.to_csv("data/mlp_results5.csv", index=False)

<IPython.core.display.Javascript object>

### 3 Layer MLP Classifier

In [1]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import TweetDataset
from classifiers import TweetMLPClassifier, TweetMLPClassifier1

# Three-layer MLP run on the train/val/test split (use_full_dataset=False).
# NOTE(review): save_dir is "models/mlp", the same directory the two-layer
# MLP cells use, so both architectures share the model.pth/vectorizer.json
# paths. If utils.train_model writes args.model_state_file, this run
# overwrites the two-layer checkpoint - confirm whether that is intended.
args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/mlp",
    vectorizer_file="vectorizer.json",
    hidden_dim1=1500,
    hidden_dim2=300,
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)


if args.expand_filepaths_to_save_dir:
    # Keep both artifact files under save_dir.
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
    print("Expanded filepaths: ")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")


# Fall back to the CPU when CUDA was requested but is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=False
    )
else:
    print("Loading dataset & Creating vectorizer")
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=False
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Bind the vectorizer on both paths so the classifier below can always be built.
vectorizer = dataset.get_vectorizer()

# Input width equals the vocabulary size; two hidden layers; one output unit.
classifier = TweetMLPClassifier1(
    input_dim=len(vectorizer.tweet_vocab),
    hidden_dim1=args.hidden_dim1,
    hidden_dim2=args.hidden_dim2,
    output_dim=1,
)
print(classifier)
# BCEWithLogitsLoss applies the sigmoid internally, so it expects raw logits.
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate when the metric given to scheduler.step() stops
# decreasing (mode="min", patience=1).
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)
train_state = utils.train_model(
    classifier, loss_func, optimizer, scheduler, dataset, args
)
train_state = utils.evaluate_test_split(
    classifier, dataset, loss_func, train_state, args
)



Expanded filepaths: 
	models/mlp/vectorizer.json
	models/mlp/model.pth
Using Cuda: False
Loading dataset & Creating vectorizer
TweetMLPClassifier1(
  (fc1): Linear(in_features=3108, out_features=1500, bias=True)
  (fc2): Linear(in_features=1500, out_features=300, bias=True)
  (fc3): Linear(in_features=300, out_features=1, bias=True)
)


Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.5541431540396154, Training Accuracy=72.25609756097563
Validation Loss=0.4648152366280556, Validation Accuracy=79.58984375.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.0504790757124017, Training Accuracy=97.73246951219511
Validation Loss=1.0860364064574244, Validation Accuracy=76.171875.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.0471876059863262, Training Accuracy=97.90396341463415
Validation Loss=1.1171079427003863, Validation Accuracy=76.5625.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.047114420773052584, Training Accuracy=97.9420731707317
Validation Loss=1.1491985395550728, Validation Accuracy=75.87890625.
------------------------------------------------------------
--------------

<IPython.core.display.Javascript object>

In [2]:
# Predict every test tweet with the current classifier and write the
# (id, target) submission file.
test_dataset = pd.read_csv("data/test.csv")
# The vectorizer is the same for every tweet, so fetch it once.
vectorizer = dataset.get_vectorizer()
# Select columns by name: no dependence on column order, and the builtin
# `id` is no longer shadowed.
results = [
    [tweet_id, utils.predict_class(classifier, vectorizer, tweet)]
    for tweet_id, tweet in zip(test_dataset["id"], test_dataset["text"])
]
submission_df6 = pd.DataFrame(results, columns=["id", "target"])
submission_df6.to_csv("data/mlp_results6.csv", index=False)



<IPython.core.display.Javascript object>