## Basic EDA

In [1]:
import os
import pandas as pd
from datasets import Dataset
import numpy as np
import pickle
from transformers import (AutoTokenizer,
                   DataCollatorWithPadding,
                   AutoModelForSequenceClassification,
                   TrainingArguments,
                   Trainer)
import evaluate
import torch
from torch.nn import CrossEntropyLoss


from utils import (read_twitter_file,
                   create_datasets)

%load_ext autoreload
%autoreload 2

In [7]:
## create dataset with 200_000 samples
ds = create_datasets(sub_sampling=20_000)
ds = ds.train_test_split(test_size=.3)

##  Electra For Sequence Classification

In [8]:
ELECTRA_MODEL = 'google/electra-small-discriminator'

tokenizer = AutoTokenizer.from_pretrained(ELECTRA_MODEL, batch_size=16)

train_ds = ds["train"].map(lambda x : tokenizer(x["tweet"], return_tensors="pt", truncation=True, padding='max_length', max_length=512))
test_ds = ds["test"].map(lambda x : tokenizer(x["tweet"], return_tensors="pt", truncation=True, padding='max_length', max_length=512))

train_ds = train_ds.remove_columns(["tweet"])
test_ds = test_ds.remove_columns(["tweet"])


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/14000 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [9]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

model = AutoModelForSequenceClassification.from_pretrained(
    ELECTRA_MODEL, num_labels=2
).to("mps")

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        labels = labels.long().to("mps")
        # forward pass
        outputs = model(input_ids=inputs["input_ids"].squeeze(1).to("mps"), attention_mask=inputs["attention_mask"].squeeze(1).to("mps"))
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits, labels.long())
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="electra_tweet_analysis",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    use_mps_device=True,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1750 [00:00<?, ?it/s]

You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.5626, 'learning_rate': 1.4285714285714287e-05, 'epoch': 0.57}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 0.3971155881881714, 'eval_accuracy': 0.8253333333333334, 'eval_runtime': 65.1573, 'eval_samples_per_second': 92.085, 'eval_steps_per_second': 5.755, 'epoch': 1.0}
{'loss': 0.4271, 'learning_rate': 8.571428571428571e-06, 'epoch': 1.14}
{'loss': 0.3905, 'learning_rate': 2.8571428571428573e-06, 'epoch': 1.71}


  0%|          | 0/375 [00:00<?, ?it/s]

{'eval_loss': 0.38515207171440125, 'eval_accuracy': 0.8313333333333334, 'eval_runtime': 63.7543, 'eval_samples_per_second': 94.111, 'eval_steps_per_second': 5.882, 'epoch': 2.0}
{'train_runtime': 1612.1056, 'train_samples_per_second': 17.369, 'train_steps_per_second': 1.086, 'train_loss': 0.45063802228655137, 'epoch': 2.0}


TrainOutput(global_step=1750, training_loss=0.45063802228655137, metrics={'train_runtime': 1612.1056, 'train_samples_per_second': 17.369, 'train_steps_per_second': 1.086, 'train_loss': 0.45063802228655137, 'epoch': 2.0})

### Evaluation

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(
    "./electra_tweet_analysis/checkpoint-1750", num_labels=2
).to("mps")

In [14]:
with open("data/twitter-datasets/test_data.txt") as test_file:
  test_id, test_tweets = zip(*[(x.split(",")[0], ",".join(x.split(",")[1:])) for x in test_file.read().split("\n")])

In [15]:
test_df = pd.DataFrame({'Id':test_id, 'tweet': test_tweets}).set_index("Id")
test_df.head(5)

Unnamed: 0_level_0,tweet
Id,Unnamed: 1_level_1
1,sea doo pro sea scooter ( sports with the port...
2,<user> shucks well i work all week so now i ca...
3,i cant stay away from bug thats my baby
4,<user> no ma'am ! ! ! lol im perfectly fine an...
5,"whenever i fall asleep watching the tv , i alw..."


In [16]:
test_ds = Dataset.from_pandas(test_df)

In [17]:
test_ds = test_ds.map(lambda x : tokenizer(x["tweet"], truncation=True, padding="max_length", max_length=512, return_tensors="pt"))

Map:   0%|          | 0/10001 [00:00<?, ? examples/s]

In [18]:
test_ds

Dataset({
    features: ['tweet', 'Id', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10001
})

In [21]:
# from tdqm.notebook import tqdm

predictions = []
Ids = []
for i, test_sample in enumerate(test_ds):
  print(f"{i} / 100001", end="\r")
  input_ids = test_sample.get("input_ids")
  Ids.append(test_sample.get("Id"))
  attention_mask = test_sample.get("attention_mask")
  outputs = model(input_ids=torch.tensor(input_ids).squeeze(1).to("mps"), attention_mask=torch.tensor(attention_mask).squeeze(1).to("mps"))
  logits = outputs.get("logits").detach().cpu().numpy()

  predictions.append(logits)




10000 / 100001

In [22]:
predictions = [-1 if pred.argmax() == 0 else 1 for pred in predictions]

In [23]:
pd.DataFrame({"Id": Ids, "Prediction": predictions}).set_index("Id").to_csv("baseline_submit.csv")