### Colab set up

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import os

os.chdir("drive/MyDrive/ML_Project_2")
os.getcwd()

'/content/drive/.shortcut-targets-by-id/1tcdzyCDFmKKzUubHdvRmZ_pmeskKu0W9/ML_Project_2'

In [5]:
!pip install torch==1.13.1+cu116 torchvision==0.14.1+cu116 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
!pip install transformers
!pip install accelerate
!pip install datasets
!pip install seaborn
!pip install pandas
!pip install evaluate


Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu116
Collecting torch==1.13.1+cu116
  Downloading https://download.pytorch.org/whl/cu116/torch-1.13.1%2Bcu116-cp310-cp310-linux_x86_64.whl (1977.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 GB[0m [31m657.3 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.14.1+cu116
  Downloading https://download.pytorch.org/whl/cu116/torchvision-0.14.1%2Bcu116-cp310-cp310-linux_x86_64.whl (24.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.2/24.2 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==0.13.1
  Downloading https://download.pytorch.org/whl/cu116/torchaudio-0.13.1%2Bcu116-cp310-cp310-linux_x86_64.whl (4.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m73.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch, torchvision, torchaudio
  Attempting uninstall: torch
   

In [6]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


## Basic EDA

In [7]:
import os
import pandas as pd
from datasets import Dataset
import numpy as np
import pickle
from transformers import (AutoTokenizer,
                   DataCollatorWithPadding,
                   AutoModelForSequenceClassification,
                   TrainingArguments,
                   Trainer)
import evaluate
import torch
from torch.nn import CrossEntropyLoss


from utils import (read_twitter_file,
                   create_datasets)

%load_ext autoreload
%autoreload 2

In [11]:
## create dataset with 200_000 samples
ds = create_datasets(sub_sampling=200_000)
ds = ds.train_test_split(test_size=.3)

##  Electra For Sequence Classification

In [12]:
ELECTRA_MODEL = 'google/electra-small-discriminator'

tokenizer = AutoTokenizer.from_pretrained(ELECTRA_MODEL, batch_size=16)

train_ds = ds["train"].map(lambda x : tokenizer(x["tweet"], return_tensors="pt", truncation=True, padding='max_length', max_length=512))
test_ds = ds["test"].map(lambda x : tokenizer(x["tweet"], return_tensors="pt", truncation=True, padding='max_length', max_length=512))

train_ds = train_ds.remove_columns(["tweet"])
test_ds = test_ds.remove_columns(["tweet"])


data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/140000 [00:00<?, ? examples/s]

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [None]:
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

model = AutoModelForSequenceClassification.from_pretrained(
    ELECTRA_MODEL, num_labels=2
).to("cuda:0")

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        labels = labels.long().to("cuda:0")
        # forward pass
        outputs = model(input_ids=inputs["input_ids"].squeeze(1).to("cuda:0"), attention_mask=inputs["attention_mask"].squeeze(1).to("cuda:0"))
        logits = outputs.get("logits")
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits, labels.long())
        return (loss, outputs) if return_outputs else loss

training_args = TrainingArguments(
    output_dir="electra_tweet_analysis",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-small-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a ElectraTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


### Evaluation

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    "./baseline_tweet_analysis/checkpoint-8750", num_labels=2
).to("cuda:0")

In [None]:
with open("./twitter-datasets/test_data.txt") as test_file:
  test_id, test_tweets = zip(*[(x.split(",")[0], ",".join(x.split(",")[1:])) for x in test_file.read().split("\n")])

In [None]:
test_df = pd.DataFrame({'Id':test_id, 'tweet': test_tweets}).set_index("Id")
test_df.head(5)

Unnamed: 0_level_0,tweet
Id,Unnamed: 1_level_1
1,sea doo pro sea scooter ( sports with the port...
2,<user> shucks well i work all week so now i ca...
3,i cant stay away from bug thats my baby
4,<user> no ma'am ! ! ! lol im perfectly fine an...
5,"whenever i fall asleep watching the tv , i alw..."


In [None]:
test_ds = Dataset.from_pandas(test_df)

In [None]:
test_ds = test_ds.map(lambda x : tokenizer(x["tweet"], truncation=True, padding="max_length", max_length=512, return_tensors="pt"))

Map:   0%|          | 0/10001 [00:00<?, ? examples/s]

In [None]:
test_ds

Dataset({
    features: ['tweet', 'Id', 'input_ids', 'attention_mask'],
    num_rows: 10001
})

In [None]:
# from tdqm.notebook import tqdm

predictions = []
Ids = []
for i, test_sample in enumerate(test_ds):
  print(f"{i} / 100001", end="\r")
  input_ids = test_sample.get("input_ids")
  Ids.append(test_sample.get("Id"))
  attention_mask = test_sample.get("attention_mask")
  outputs = model(input_ids=torch.tensor(input_ids).squeeze(1).to("cuda:0"), attention_mask=torch.tensor(attention_mask).squeeze(1).to("cuda:0"))
  logits = outputs.get("logits").detach().cpu().numpy()

  predictions.append(logits)






In [None]:
predictions = [-1 if pred.argmax() == 0 else 1 for pred in predictions]

In [None]:
pd.DataFrame({"Id": Ids, "Prediction": predictions}).set_index("Id").to_csv("baseline_submit.csv")