## Classification using Perceptron

In [1]:
%load_ext nb_black

import collections
import numpy as np
import pandas as pd
import re
from argparse import Namespace
import os

import torch.optim as optim
import torch
import torch.nn as nn
import torch.nn.functional as F

import utils
from dataset import TweetDataset
from classifiers import TweetPerceptronClassifier

# Experiment configuration for the perceptron run, collected in one Namespace
# so it can be passed to the `utils` helpers as a single object.
args = Namespace(
    # Data and files
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/perceptron",
    vectorizer_file="vectorizer.json",
    # Training
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    # Runtime
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

# Optionally place the vectorizer and model-state files underneath save_dir.
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file, args.model_state_file = (
        os.path.join(args.save_dir, filename)
        for filename in (args.vectorizer_file, args.model_state_file)
    )
    print(
        "Expanded filepaths: ",
        f"\t{args.vectorizer_file}",
        f"\t{args.model_state_file}",
        sep="\n",
    )

# Fall back to the CPU when CUDA was requested but is not available.
args.cuda = args.cuda and torch.cuda.is_available()
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

# Seed the run and prepare the output directory via the project helpers.
utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

Expanded filepaths: 
	models/perceptron/vectorizer.json
	models/perceptron/model.pth
Using Cuda: False




<IPython.core.display.Javascript object>

In [2]:
# Build the dataset and its vectorizer: either reuse a vectorizer saved by an
# earlier run, or create one from the training CSV and persist it.
if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    # Fixed attribute name: this previously read `args.vectrozier_file`, which
    # does not exist on `args` and raised AttributeError on the reload path.
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file
    )
else:
    print("Loading dataset & Creating vectorizer")
    # NOTE(review): the second positional argument is the vectorizer path, while
    # args.frequency_cutoff is not used anywhere in the visible cells - confirm
    # against the TweetDataset.load_dataset_and_make_vectorizer signature.
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Fetch the vectorizer after either branch; it used to be assigned only in the
# `else` branch, so the reload path failed with NameError on the next line.
vectorizer = dataset.get_vectorizer()

# Single linear layer whose input width is the size of the tweet vocabulary.
classifier = TweetPerceptronClassifier(num_features=len(vectorizer.tweet_vocab))
print(classifier)

# BCEWithLogitsLoss applies the sigmoid internally
# (presumably the classifier returns raw logits - verify in classifiers.py).
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
# Halve the learning rate once the monitored metric stops decreasing for more
# than one epoch; the metric itself is supplied wherever scheduler.step is called.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)

Loading dataset & Creating vectorizer
TweetPerceptronClassifier(
  (fc1): Linear(in_features=3108, out_features=1, bias=True)
)


<IPython.core.display.Javascript object>

In [3]:
# Fit the classifier; the returned training state is carried into evaluation.
train_state = utils.train_model(
    classifier,
    loss_func,
    optimizer,
    scheduler,
    dataset,
    args,
)

# Evaluate on the test split, passing along the state returned by training.
train_state = utils.evaluate_test_split(
    classifier,
    dataset,
    loss_func,
    train_state,
    args,
)

Training Routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val:   0%|          | 0/8 [00:00<?, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.681884057638122, Training Accuracy=66.88262195121949
Validation Loss=0.6708768680691719, Validation Accuracy=71.19140625.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.520560126479079, Training Accuracy=81.85975609756099
Validation Loss=0.5529165044426918, Validation Accuracy=78.515625.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.447150328537313, Training Accuracy=84.71798780487804
Validation Loss=0.5127294063568115, Validation Accuracy=79.58984374999999.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.4019845971247045, Training Accuracy=85.78506097560975
Validation Loss=0.48284977674484253, Validation Accuracy=80.17578125.
------------------------------------------------------------
-------

<IPython.core.display.Javascript object>

In [4]:
# Spot-check the classifier on one hand-picked tweet.
# `utils` is already imported in the notebook's first cell, so the redundant
# mid-notebook `import utils` (a no-op for an already-loaded module) is dropped.
tweet = "The Campaign: Will Ferrell and Zach Galifianakis commit comic mayhem in this hilarious political farce. 4* http://t.co/tQ3j2qGtZQ'"
utils.predict_class(classifier, dataset.get_vectorizer(), tweet)



0

<IPython.core.display.Javascript object>

### Prepare test dataset results

- Got 0.78823 for perceptron_results.csv submission

In [5]:
# Load the unlabeled tweets that predictions will be generated for.
test_dataset = pd.read_csv(filepath_or_buffer="data/test.csv")

<IPython.core.display.Javascript object>

In [7]:
# Inspect the test set's column names, dtypes and non-null counts.
test_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


<IPython.core.display.Javascript object>

In [16]:
# Predict a class for every test tweet and collect [id, prediction] pairs.
# Columns are selected by name ("id", "text") instead of unpacking each row of
# `.values` positionally, so the cell keeps working if the CSV gains or reorders
# columns, and the loop variable no longer shadows the builtin `id`.
# Fetched once up front; assumes get_vectorizer() returns the same fitted
# vectorizer on every call - TODO confirm in dataset.py.
tweet_vectorizer = dataset.get_vectorizer()
results = [
    [tweet_id, utils.predict_class(classifier, tweet_vectorizer, tweet_text)]
    # .tolist() yields plain Python ints for the ids.
    for tweet_id, tweet_text in zip(test_dataset["id"].tolist(), test_dataset["text"])
]
print(results[:10])



[[0, 1], [2, 1], [3, 1], [9, 0], [11, 1], [12, 1], [21, 0], [22, 0], [27, 0], [29, 0]]


<IPython.core.display.Javascript object>

In [18]:
# Turn the [id, prediction] pairs into the two-column submission table and
# preview its first rows.
submission_df = pd.DataFrame(data=results, columns=["id", "target"])
submission_df.head(n=5)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


<IPython.core.display.Javascript object>

In [19]:
# Persist the submission; the DataFrame index is not written out.
submission_df.to_csv(path_or_buf="data/perceptron_results.csv", index=False)

<IPython.core.display.Javascript object>

### Train using full training dataset

- 0.73184
- 0.78363 (with scheduler fix)
- 0.77658 (500 epochs)

In [6]:
# Second experiment: same configuration as the first run, but with 500 epochs
# and the dataset loaded with use_full_dataset=True.
args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    tweets_csv="data/train_with_splits.csv",
    save_dir="models/perceptron",
    vectorizer_file="vectorizer.json",
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=500,
    seed=1337,
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

# Optionally place the vectorizer and model-state files underneath save_dir.
if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir, args.vectorizer_file)
    args.model_state_file = os.path.join(args.save_dir, args.model_state_file)
    print("Expanded filepaths: ")
    print(f"\t{args.vectorizer_file}")
    print(f"\t{args.model_state_file}")


# Fall back to the CPU when CUDA was requested but is not available.
if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

print(f"Using Cuda: {args.cuda}")

utils.set_seed_everywhere(args.seed, args.cuda)
utils.handle_dirs(args.save_dir)

if args.reload_from_files:
    print("Loading Dataset & Vectorizer")
    # Fixed attribute name: this previously read `args.vectrozier_file`, which
    # does not exist on `args` and raised AttributeError on the reload path.
    dataset = TweetDataset.load_dataset_and_load_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=True
    )
else:
    print("Loading dataset & Creating vectorizer")
    dataset = TweetDataset.load_dataset_and_make_vectorizer(
        args.tweets_csv, args.vectorizer_file, use_full_dataset=True
    )
    dataset.save_vectorizer(args.vectorizer_file)

# Fetch the vectorizer after either branch; it used to be assigned only in the
# `else` branch, so the reload path failed with NameError on the next line.
vectorizer = dataset.get_vectorizer()

classifier = TweetPerceptronClassifier(num_features=len(vectorizer.tweet_vocab))
print(classifier)
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer, mode="min", factor=0.5, patience=1
)
# NOTE(review): the recorded output of this cell shows zero validation batches
# and "Validation Loss=0.0" every epoch. If the scheduler and early stopping in
# utils.train_model are driven by validation loss, they get no signal in this
# configuration - verify.
train_state = utils.train_model(
    classifier, loss_func, optimizer, scheduler, dataset, args
)
train_state = utils.evaluate_test_split(
    classifier, dataset, loss_func, train_state, args
)

Expanded filepaths: 
	models/perceptron/vectorizer.json
	models/perceptron/model.pth
Using Cuda: False
Loading dataset & Creating vectorizer
TweetPerceptronClassifier(
  (fc1): Linear(in_features=3108, out_features=1, bias=True)
)


Training Routine:   0%|          | 0/500 [00:00<?, ?it/s]

split=train:   0%|          | 0/41 [00:00<?, ?it/s]

split=val: 0it [00:00, ?it/s]

--------------- 0th Epoch Stats---------------
Training Loss=0.681884057638122, Training Accuracy=66.88262195121949
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 10th Epoch Stats---------------
Training Loss=0.5218462318908879, Training Accuracy=81.87881097560977
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 20th Epoch Stats---------------
Training Loss=0.44711853236686894, Training Accuracy=84.56554878048779
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 30th Epoch Stats---------------
Training Loss=0.4016107960445125, Training Accuracy=85.82317073170732
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 40th Epoch Stats---------------
Training Loss=0.37020789341228766, Training Accuracy=

--------------- 370th Epoch Stats---------------
Training Loss=0.23578875457368245, Training Accuracy=91.65396341463413
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 380th Epoch Stats---------------
Training Loss=0.23535394414169034, Training Accuracy=91.67301829268295
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 390th Epoch Stats---------------
Training Loss=0.23633681419419084, Training Accuracy=91.63490853658539
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 400th Epoch Stats---------------
Training Loss=0.2357822969192412, Training Accuracy=91.71112804878052
Validation Loss=0.0, Validation Accuracy=0.0.
------------------------------------------------------------
--------------- 410th Epoch Stats---------------
Training Loss=0.23670431353696963, Training 

<IPython.core.display.Javascript object>

In [7]:
# Score the test tweets with the retrained classifier and write a submission file.
test_dataset = pd.read_csv("data/test.csv")

# Columns are selected by name ("id", "text") instead of unpacking each row of
# `.values` positionally, and the loop variable no longer shadows the builtin `id`.
# Fetched once up front; assumes get_vectorizer() returns the same fitted
# vectorizer on every call - TODO confirm in dataset.py.
tweet_vectorizer = dataset.get_vectorizer()
results = [
    [tweet_id, utils.predict_class(classifier, tweet_vectorizer, tweet_text)]
    # .tolist() yields plain Python ints for the ids.
    for tweet_id, tweet_text in zip(test_dataset["id"].tolist(), test_dataset["text"])
]

# The former mid-cell `submission_df.head()` displayed nothing (only a cell's
# last expression is rendered), so it has been removed.
submission_df = pd.DataFrame(results, columns=["id", "target"])
submission_df.to_csv("data/perceptron_results3.csv", index=False)



<IPython.core.display.Javascript object>

In [8]:
# Reload the saved submission files so their predictions can be compared.
# NOTE(review): perceptron_results1.csv and perceptron_results2.csv are not
# written by any visible cell; presumably they come from earlier runs - confirm
# they exist before running this cell on a fresh checkout.
submission_df1, submission_df2, submission_df3, submission_df4 = map(
    pd.read_csv,
    (
        "data/perceptron_results.csv",
        "data/perceptron_results1.csv",
        "data/perceptron_results2.csv",
        "data/perceptron_results3.csv",
    ),
)

<IPython.core.display.Javascript object>

In [9]:
# Total number of rows, followed by the shape of the rows on which each pair of
# submissions predicts the same target: (1, 2), (1, 3), (2, 3) and (1, 4).
(
    submission_df1.shape,
    *(
        left[left["target"] == right["target"]].shape
        for left, right in (
            (submission_df1, submission_df2),
            (submission_df1, submission_df3),
            (submission_df2, submission_df3),
            (submission_df1, submission_df4),
        )
    ),
)

((3263, 2), (2795, 2), (3130, 2), (2692, 2), (3065, 2))

<IPython.core.display.Javascript object>