Run the following cell only if you are on Google Colab

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Change depending on organization of Google drive
cd /content/drive/MyDrive/Colab Notebooks/cil/whitewashed/src

In [None]:
pip install wordsegment

Collecting wordsegment
  Downloading wordsegment-1.3.1-py2.py3-none-any.whl.metadata (7.7 kB)
Downloading wordsegment-1.3.1-py2.py3-none-any.whl (4.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/4.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m4.8/4.8 MB[0m [31m169.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m97.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wordsegment
Successfully installed wordsegment-1.3.1


In [None]:
from utils import *
from torch.utils.data import DataLoader, TensorDataset
import torch
from models.bert import *
from transformers import AutoTokenizer, BertTokenizer, BertForSequenceClassification
from preprocessing import *
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer

Set `full = True` (instead of `full = False`) to use the entire dataset; keep `full = False` if the available memory is limited.

In [None]:
# Set to True to train on the entire dataset instead of the small subset
full = False

# Filled by load_tweets (passed in as the last two arguments)
tweets = []
labels = []

# Label convention used throughout the notebook: 0 = positive file, 1 = negative file.
# Both branches must use the same convention, otherwise every sample of the full
# dataset ends up with label 0.
if full:
  load_tweets(TRAIN_POS, 0, tweets, labels)
  load_tweets(TRAIN_NEG, 1, tweets, labels)
else:
  load_tweets(SMALL_TRAIN_POS, 0, tweets, labels)
  load_tweets(SMALL_TRAIN_NEG, 1, tweets, labels)

Run the following cell only if you want to use pre-processed tweets; otherwise, skip it.

In [None]:
# Run the project's preprocessing on the loaded tweets and their labels
tweets, labels = preprocess(tweets, labels)

# Each preprocessed tweet is joined back into one string, with a single space between its parts
separator = " "
tweets = list(map(separator.join, tweets))

In [None]:
# Convert both lists to NumPy arrays before handing them to the split helper
tweets, labels = np.array(tweets), np.array(labels)

# NOTE(review): the meaning of the third argument (1) is defined in split_train_test — confirm there
X_train, X_val, Y_train, Y_val = split_train_test(tweets, labels, 1)

In [None]:
# Dataset of tweets suitable for BERT training
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one tweet per item on access.

    Args:
        tweets: indexable collection of tweet texts.
        labels: indexable collection of labels, aligned with ``tweets``.
        tokenizer: callable invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors='pt')``
            whose result exposes ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: length every tweet is padded / truncated to.
    """

    def __init__(self, tweets, labels, tokenizer, max_length):
        self.tweets, self.labels = tweets, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of items, taken from the tweet collection."""
        return len(self.tweets)

    def __getitem__(self, idx):
        """Return a dict with 'input_ids', 'attention_mask' and 'label' for item ``idx``."""
        text, target = self.tweets[idx], self.labels[idx]
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )
        # squeeze(0) removes the leading dimension of the tokenizer output
        sample = {
            field: encoding[field].squeeze(0)
            for field in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(target)
        return sample


In [None]:
# Not necessary if on Google Colab
# Default to the CPU and switch to the GPU only when CUDA is available
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

Run exactly one of the following two cells, depending on whether you want to use BERTweet (first cell) or BERT (second cell).

In [None]:
# BERTweet option: load the pretrained tokenizer of the "vinai/bertweet-base" checkpoint
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0


In [None]:
# BERT option: load the pretrained tokenizer of the 'google-bert/bert-base-uncased' checkpoint
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
# Training split: wrapped in a TweetDataset and served in shuffled batches of 32
train_dataset = TweetDataset(
    tweets=X_train.tolist(),
    labels=Y_train.tolist(),
    tokenizer=tokenizer,
    max_length=103,
)
train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

# Validation split: same settings, but the order is kept fixed (no shuffling)
val_dataset = TweetDataset(
    tweets=X_val.tolist(),
    labels=Y_val.tolist(),
    tokenizer=tokenizer,
    max_length=103,
)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False)

In [None]:
# Derive the model family from the tokenizer that is currently active instead of
# hard-coding it, so a BERT tokenizer is never paired with the BERTweet model.
# NOTE(review): assumes train_bert(..., bert_tweet=False) selects the plain BERT
# checkpoint — confirm in models/bert.py.
use_bert_tweet = "bertweet" in tokenizer.name_or_path.lower()
model = train_bert(train_dataloader, bert_tweet = use_bert_tweet)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Loss: 0.2703847948839267
Epoch 2/3, Loss: 0.26530471302817266
Epoch 3/3, Loss: 0.12176523220108615


In [None]:
# Evaluate the trained model on the validation dataloader (prints accuracy and a per-class report)
test_bert(val_dataloader, model)

Validation Accuracy: 0.9058
              precision    recall  f1-score   support

           0       0.91      0.90      0.91     10008
           1       0.90      0.91      0.91      9992

    accuracy                           0.91     20000
   macro avg       0.91      0.91      0.91     20000
weighted avg       0.91      0.91      0.91     20000



The following cells create a valid submission file for the contest.

In [None]:
# Fresh containers for the test set; `dummy` only absorbs the label argument of load_tweets
tweets, dummy = [], []

# The value returned by load_tweets is kept as the Id column of the submission
indices = load_tweets(TEST_SET, 0, tweets, dummy)

# The test set has no labels, so TweetDataset gets one placeholder 0 per tweet
test_dataset = TweetDataset(
    tweets=tweets,
    labels=[0] * len(tweets),
    tokenizer=tokenizer,
    max_length=103,
)
# Order must stay fixed so that predictions line up with `indices`
test_dataloader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
# Device the inference batches are moved to: GPU when CUDA is available, CPU otherwise
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Switch the model to evaluation mode before predicting
model.eval()

all_preds = []
# No gradients are needed for inference
with torch.no_grad():
    for batch in test_dataloader:
        model_inputs = {
            'input_ids': batch['input_ids'].to(device),
            'attention_mask': batch['attention_mask'].to(device),
        }
        batch_logits = model(**model_inputs).logits
        # Predicted class = index of the largest logit along the last dimension
        batch_classes = batch_logits.argmax(dim=-1)
        all_preds.extend(batch_classes.cpu().numpy())

# Map class indices to the values written to the submission: class 1 -> -1, class 0 -> 1
labels = [1 if pred <= 0.5 else -1 for pred in all_preds]

In [None]:
import pandas as pd

In [None]:
# Two-column submission table: one row per test tweet
df = pd.DataFrame(dict(Id=indices, Prediction=labels))

# Write the table without the DataFrame index column
df.to_csv("submission.csv", index=False)