In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install opendatasets --quiet

In [3]:
import opendatasets as od

# Fetch the Kaggle dataset into ./nlpgettingstarted (relative to the current
# working directory); the call skips the download when the files already exist.
od.download("https://www.kaggle.com/datasets/misakrug/nlpgettingstarted/")

Skipping, found downloaded files in "./nlpgettingstarted" (use force=True to force download)


In [4]:
# Read the labelled training data from the folder od.download() created under
# the current working directory. A relative path is used so the notebook is not
# tied to Colab's /content working directory.
tweets = pd.read_csv("nlpgettingstarted/train.csv")
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# (rows, columns) of the labelled training frame.
tweets.shape

(7613, 5)

In [6]:
# Read the unlabelled Kaggle test set from the folder od.download() created
# under the current working directory (relative path, so it is not tied to
# Colab's /content working directory).
test_df = pd.read_csv("nlpgettingstarted/test.csv")
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# (rows, columns) of the unlabelled test frame; it has no `target` column.
test_df.shape

(3263, 4)

In [8]:
import re
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

In [9]:
# Show one raw tweet before cleaning, for comparison with the cleaned version.
tweets.at[0,'text']

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [10]:
def preprocess_text(text):
  """Normalise a tweet for tokenisation.

  The text is lower-cased, then every run of characters that are not ASCII
  letters or digits (punctuation, '#', whitespace, non-ASCII symbols, ...)
  is replaced by a single space. Leading/trailing spaces produced by that
  replacement are kept.

  Args:
    text: the raw string to clean.

  Returns:
    The cleaned string.
  """
  lowered = text.lower()
  return re.sub('[^A-Za-z0-9]+', ' ', lowered)

In [11]:
# Clean every training tweet element-wise with preprocess_text.
tweets['text'] = tweets['text'].map(preprocess_text)

In [12]:
# Same tweet after cleaning: lower-cased, with '#' and other punctuation removed.
tweets.at[0,'text']

'our deeds are the reason of this earthquake may allah forgive us all'

In [13]:
# Preview the training frame after text cleaning.
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this earthquake ma...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13 000 people receive wildfires evacuation ord...,1
4,7,,,just got sent this photo from ruby alaska as s...,1


In [14]:
# Apply the same cleaning to the unlabelled test tweets so both sets share one format.
test_df['text'] = test_df['text'].map(preprocess_text)

In [15]:
# Preview the test frame after text cleaning.
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,just happened a terrible car crash
1,2,,,heard about earthquake is different cities sta...
2,3,,,there is a forest fire at spot pond geese are ...
3,9,,,apocalypse lighting spokane wildfires
4,11,,,typhoon soudelor kills 28 in china and taiwan


In [16]:
# Count missing values per column in the training frame.
tweets.isna().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [17]:
# Count missing values per column in the test frame.
test_df.isna().sum()

id             0
keyword       26
location    1105
text           0
dtype: int64

In [18]:
# Drop the `location` column, which the missing-value counts show is largely
# empty. Reassigning with errors='ignore' keeps the cell re-runnable: an
# in-place drop raises KeyError the second time because the column is gone.
tweets = tweets.drop(columns='location', errors='ignore')

In [19]:
# Confirm `location` is no longer among the training columns.
tweets.head()

Unnamed: 0,id,keyword,text,target
0,1,,our deeds are the reason of this earthquake ma...,1
1,4,,forest fire near la ronge sask canada,1
2,5,,all residents asked to shelter in place are be...,1
3,6,,13 000 people receive wildfires evacuation ord...,1
4,7,,just got sent this photo from ruby alaska as s...,1


In [20]:
# Drop `location` from the test frame as well. errors='ignore' keeps the cell
# re-runnable: an in-place drop raises KeyError the second time.
test_df = test_df.drop(columns='location', errors='ignore')

In [21]:
# Confirm `location` is no longer among the test columns.
test_df.head()

Unnamed: 0,id,keyword,text
0,0,,just happened a terrible car crash
1,2,,heard about earthquake is different cities sta...
2,3,,there is a forest fire at spot pond geese are ...
3,9,,apocalypse lighting spokane wildfires
4,11,,typhoon soudelor kills 28 in china and taiwan


In [22]:
# Two-stage split of the labelled tweets with a fixed seed:
#   1) hold out 20% as `test_data`;
#   2) hold out 20% of the remainder as `val_data`.
# This gives roughly 64% train / 16% validation / 20% held-out test.
train_val_data, test_data = train_test_split(
    tweets,
    test_size=0.2,
    random_state=42,
)
train_data, val_data = train_test_split(
    train_val_data,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [24]:
# Load the pretrained `bert-base-uncased` tokenizer and a two-label
# sequence-classification model, then move the model to `device`.
# The classifier head is newly initialised (see the warning in the output), so
# the model must be fine-tuned before its predictions are meaningful.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
def tokenize_texts(texts, max_length=64):
  """Encode an iterable of strings into fixed-length BERT inputs.

  Uses the module-level `tokenizer`. Every text is truncated or padded to
  exactly `max_length` tokens (special tokens included), so the results can
  be stacked into a single tensor.

  Args:
    texts: iterable of strings (for example a pandas Series of tweets).
    max_length: number of tokens per encoded sequence. Defaults to 64.

  Returns:
    A tuple `(input_ids, attention_masks)` of tensors, each with shape
    `(len(texts), max_length)`.
  """
  texts = list(texts)
  if not texts:
    # Nothing to encode: return correctly shaped empty tensors instead of
    # failing on an empty batch.
    empty = torch.empty((0, max_length), dtype=torch.long)
    return empty, empty.clone()

  # One batched call replaces the per-text encode_plus loop, and
  # padding='max_length' replaces the deprecated pad_to_max_length=True.
  encoded = tokenizer(
      texts,
      add_special_tokens=True,
      max_length=max_length,
      truncation=True,
      padding='max_length',
      return_attention_mask=True,
      return_tensors='pt',
  )
  return encoded['input_ids'], encoded['attention_mask']

In [26]:
# Tokenize the three splits carved out of the labelled data.
# Note: `test_data` here is the held-out 20% of train.csv, not Kaggle's
# unlabelled test.csv (which lives in `test_df`).
train_input_ids, train_attention_masks = tokenize_texts(train_data['text'])
val_input_ids, val_attention_masks = tokenize_texts(val_data['text'])
test_input_ids, test_attention_masks = tokenize_texts(test_data['text'])




In [27]:
# Turn each split's `target` column into a label tensor aligned row-for-row
# with the tokenized inputs of the same split.
train_labels = torch.tensor(train_data['target'].to_numpy())
val_labels = torch.tensor(val_data['target'].to_numpy())
test_labels = torch.tensor(test_data['target'].to_numpy())

In [28]:
# Bundle the training tensors into a dataset and serve them in shuffled
# batches of 32 (RandomSampler draws the samples in random order).
# NOTE(review): this rebinds `train_data` from the pandas DataFrame to a
# TensorDataset, so the tokenizing/label cells above cannot be re-run after
# this cell without re-creating the splits first.
train_data = torch.utils.data.TensorDataset(train_input_ids, train_attention_masks, train_labels)
train_sampler = torch.utils.data.RandomSampler(train_data)
train_loader = torch.utils.data.DataLoader(train_data, sampler=train_sampler, batch_size=32)

In [29]:
# Validation batches of 32, served in their original order (SequentialSampler).
# NOTE(review): this rebinds `val_data` from the pandas DataFrame to a
# TensorDataset; re-create the splits before re-running earlier cells.
val_data = torch.utils.data.TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_sampler = torch.utils.data.SequentialSampler(val_data)
val_loader = torch.utils.data.DataLoader(val_data, sampler=val_sampler, batch_size=32)

In [30]:
# Held-out test batches of 32 in original order, so the predictions line up
# with `test_labels`.
# NOTE(review): this rebinds `test_data` from the pandas DataFrame to a
# TensorDataset; re-create the splits before re-running earlier cells.
test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_loader = torch.utils.data.DataLoader(test_data, sampler=test_sampler, batch_size=32)

In [31]:
def train(model, train_loader, optimizer, scheduler):
    """Run one training epoch and return the mean loss per batch.

    Args:
        model: model called as `model(input_ids, attention_mask=..., labels=...)`
            whose first output element is the loss.
        train_loader: iterable of `(input_ids, attention_mask, labels)` batches.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch, after the
            optimizer.

    Returns:
        The sum of the batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for batch in train_loader:
        # Move the whole batch to the globally selected device.
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        loss = model(input_ids, attention_mask=attention_mask, labels=labels)[0]
        running_loss += loss.item()

        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    return running_loss / len(train_loader)

In [32]:
def evaluate(model, eval_loader):
    """Score `model` on a labelled loader without updating any weights.

    Args:
        model: model called as `model(input_ids, attention_mask=..., labels=...)`
            whose output holds the loss at index 0 and the logits at index 1.
        eval_loader: iterable of `(input_ids, attention_mask, labels)` batches.

    Returns:
        A tuple `(mean_loss, logits)`: the mean per-batch loss and a NumPy
        array of all batch logits concatenated along axis 0, in loader order.
    """
    model.eval()
    loss_sum = 0
    logit_chunks = []
    with torch.no_grad():
        for batch in eval_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            batch_loss, batch_logits = outputs[0], outputs[1]
            loss_sum += batch_loss.item()
            # Keep the logits on the host as NumPy so they can be stacked later.
            logit_chunks.append(batch_logits.detach().cpu().numpy())

    mean_loss = loss_sum / len(eval_loader)
    return mean_loss, np.concatenate(logit_chunks, axis=0)

In [33]:
class EarlyStopping:
    """Stop training when the validation loss stops improving.

    Call the instance once per epoch with the validation loss and the model.
    Each time the loss improves, the model's `state_dict` is saved to `path`;
    after `patience` consecutive epochs without improvement, `early_stop` is
    set to True.

    Args:
        patience: number of consecutive non-improving epochs tolerated before
            `early_stop` is set.
        delta: an epoch counts as an improvement only if the negated loss
            reaches at least `best_score + delta`.
        path: file the best checkpoint is written to. Defaults to
            'best_model_4.pt'.
    """

    def __init__(self, patience=5, delta=0.0, path='best_model_4.pt'):
        self.patience = patience
        self.delta = delta
        self.path = path
        self.counter = 0          # consecutive epochs without improvement
        self.best_score = None    # best negated validation loss seen so far
        self.early_stop = False
        # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf

    def __call__(self, val_loss, model):
        """Record this epoch's `val_loss`, checkpointing `model` on improvement."""
        # Work with the negated loss so that "higher is better".
        score = -val_loss

        if self.best_score is None:
            # First epoch: always becomes the baseline checkpoint.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Write `model.state_dict()` to `self.path` and remember `val_loss`."""
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [34]:
import torch.nn as nn

# --- Hyper-parameters -------------------------------------------------------
learning_rate = 1e-5
epochs = 10
batch_size = 32                  # not used in this cell; the loaders already batch by 32
gradient_accumulation_steps = 1
mixup_alpha = 0.1                # not used in this cell
# Drives the EarlyStopping instance below. Set to 5 so the constant matches the
# patience the recorded run was trained with.
early_stopping_patience = 5

# torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
# is passed explicitly because torch's default (0.01) differs from the
# transformers default (0.0).
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8, weight_decay=0.0)
# The scheduler is stepped once per batch inside train(), so its horizon is the
# total number of optimizer steps across all epochs.
total_steps = len(train_loader) * epochs // gradient_accumulation_steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Not used by train()/evaluate(): they read the loss from the model's own output.
criterion = nn.CrossEntropyLoss()

best_val_loss = float('inf')
early_stopping = EarlyStopping(patience=early_stopping_patience, delta=0.0)
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, scheduler)
    val_loss, val_preds = evaluate(model, val_loader)
    print(f'Epoch {epoch + 1}: train_loss = {train_loss:.3f}, val_loss = {val_loss:.3f}')

    # EarlyStopping checkpoints the best weights itself on every improvement.
    early_stopping(val_loss, model)
    if early_stopping.early_stop:
        print("Early stopping")
        break

    # Second copy of the best weights under 'best_model.pt', kept so the cell
    # still produces the same files as before.
    if val_loss < best_val_loss:
        torch.save(model.state_dict(), 'best_model.pt')
        best_val_loss = val_loss



Epoch 1: train_loss = 0.495, val_loss = 0.393
Epoch 2: train_loss = 0.364, val_loss = 0.384
Epoch 3: train_loss = 0.305, val_loss = 0.399
Epoch 4: train_loss = 0.266, val_loss = 0.440
Epoch 5: train_loss = 0.214, val_loss = 0.461
Epoch 6: train_loss = 0.177, val_loss = 0.534
Epoch 7: train_loss = 0.148, val_loss = 0.574
Early stopping


In [35]:
# Rebuild the architecture, then overwrite its weights with the best checkpoint
# written by EarlyStopping. map_location=device lets a checkpoint saved from a
# GPU run load on a CPU-only session as well.
best_model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to(device)
best_model.load_state_dict(torch.load('best_model_4.pt', map_location=device))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [36]:
# Accuracy of the restored best checkpoint on the held-out labelled split.
best_model.eval()
test_loss, test_logits = evaluate(best_model, test_loader)
# The predicted class is the index of the larger of the two logits.
test_preds = test_logits.argmax(axis=1)
test_accuracy = np.mean(test_preds == test_labels.numpy())
print(f'Test accuracy: {test_accuracy:.3f}')

Test accuracy: 0.842


In [39]:
# Tokenize Kaggle's unlabelled test tweets (`test_df`).
# NOTE(review): this rebinds `test_input_ids` / `test_attention_masks`, which
# previously held the held-out split of train.csv.
test_input_ids, test_attention_masks = tokenize_texts(test_df['text'])



In [40]:
# Loader over the unlabelled Kaggle test set, in original order so the
# predictions line up with `test_df['id']`.
# NOTE(review): this rebinds `test_data` / `test_loader`. Batches now hold only
# two tensors (no labels), so evaluate(), which unpacks three, can no longer
# be called on `test_loader` after this cell.
test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_masks)
test_sampler = torch.utils.data.SequentialSampler(test_data)
test_loader = torch.utils.data.DataLoader(test_data, sampler=test_sampler, batch_size=32)

In [41]:
# Predict a label for every tweet in the unlabelled loader.
best_model.eval()

logit_chunks = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = (tensor.to(device) for tensor in batch)
        # No labels are passed here, so index 0 of the output is read as the
        # logits (evaluate() passes labels and reads the logits at index 1).
        batch_logits = best_model(input_ids, attention_mask=attention_mask)[0]
        logit_chunks.append(batch_logits.detach().cpu().numpy())

# Stack the per-batch logits and pick the higher-scoring class for each row.
all_preds = np.concatenate(logit_chunks, axis=0)
pred_labels = all_preds.argmax(axis=1)

In [42]:
# Build the submission table (tweet id plus predicted target) and write it
# without the index column.
sub_df = test_df[['id']].assign(target=pred_labels)
sub_df.to_csv('submission3.csv', index=False)