Kaggle competition: https://www.kaggle.com/competitions/nlp-getting-started/submissions
Public score: 0.84

In [None]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [None]:
# Load the competition test split into a DataFrame.
# NOTE(review): test.csv is read into `df_test` again further down, before the
# frame is first used -- this early load looks redundant; confirm and keep one.
df_test = pd.read_csv('test.csv')

In [None]:
# Load the raw training CSV for ad-hoc inspection.
# NOTE(review): the same file is parsed again inside load_data(); in the visible
# notebook `df` is only used for the dtype check in the next cell.
df = pd.read_csv('train.csv')

In [None]:
# Show the dtype of the label column (rendered as the cell output).
df.target.dtype

dtype('int64')

In [None]:
def load_data(data_file):
    """Read a CSV file and return its ``text`` and ``target`` columns.

    Args:
        data_file: Path (or anything ``pandas.read_csv`` accepts) of a CSV
            that contains ``text`` and ``target`` columns.

    Returns:
        tuple[list, list]: ``(texts, labels)`` as plain Python lists, in file
        order.
    """
    frame = pd.read_csv(data_file)
    texts, labels = (frame[column].tolist() for column in ('text', 'target'))
    return texts, labels

In [None]:
# Parse the training CSV into two parallel lists: texts and their labels.
data_file = "train.csv"
texts, labels = load_data(data_file)

In [None]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of labels, parallel to ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        # Dataset size is defined by the number of texts.
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` as a dict of tensors."""
        encoded = self.tokenizer(
            self.texts[idx],
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # The tokenizer output is collapsed to 1-D so that the DataLoader's
        # default collation stacks examples along a new leading dimension.
        item = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        item['label'] = torch.tensor(self.labels[idx])
        return item

In [None]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by a two-hidden-layer MLP head.

    The encoder's pooled output goes through
    ``dropout -> fc1 -> ReLU -> dropout -> fc2 -> ReLU -> dropout -> classifier``.

    Args:
        bert_model_name: Checkpoint name or path passed to
            ``BertModel.from_pretrained``.
        num_classes: Number of output logits.
        dropout: Drop probability shared by the three dropout layers.
            Defaults to ``0.5``, the value that was previously hard-coded.
    """

    def __init__(self, bert_model_name, num_classes, dropout=0.5):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        hidden_size = self.bert.config.hidden_size
        # Attribute names and creation order are kept as-is: the names are the
        # state_dict keys of saved checkpoints, and the order fixes how the
        # RNG is consumed when the linear layers are initialised.
        self.dropout1 = nn.Dropout(dropout)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.dropout2 = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.dropout3 = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class logits for a batch.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Output of the final linear layer (``num_classes`` logits per example).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        hidden = nn.functional.relu(self.fc1(self.dropout1(pooled_output)))
        hidden = nn.functional.relu(self.fc2(self.dropout2(hidden)))
        logits = self.classifier(self.dropout3(hidden))
        return logits

In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer over the model's parameters.
        scheduler: Learning-rate scheduler; stepped once per batch, right
            after the optimizer.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean cross-entropy loss over the batches processed, or ``0.0``
        if ``data_loader`` yields no batches. Callers that ignore the return
        value behave exactly as before.
    """
    model.train()
    # The loss module is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    running_loss = 0.0
    num_batches = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        running_loss += loss.item()
        num_batches += 1
    # Count batches manually: not every iterable passed here supports len().
    return running_loss / num_batches if num_batches else 0.0

In [None]:
def evaluate(model, data_loader, device):
    """Score the model on a labelled data loader.

    Puts the model in eval mode, predicts the highest-scoring class for every
    example with gradients disabled, and compares against the batch labels.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits.
        data_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        tuple: ``(accuracy_score(...), classification_report(...))`` computed
        over all examples seen.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in data_loader:
            logits = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['attention_mask'].to(device),
            )
            y_pred += logits.argmax(dim=1).cpu().tolist()
            y_true += batch['label'].tolist()
    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)
    return accuracy, report

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Return the predicted class index for a single text.

    Args:
        text: Input string to classify.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            that returns class logits; it is switched to eval mode.
        tokenizer: Callable producing ``'input_ids'`` and ``'attention_mask'``
            tensors for ``text``.
        device: Device the encoded tensors are moved to.
        max_length: Padding/truncation length passed to the tokenizer.

    Returns:
        int: Index of the highest logit.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
    return logits.argmax(dim=1).item()

In [None]:
# Run configuration: everything a reader might want to tune lives here.
bert_model_name = 'bert-base-uncased'        # checkpoint for both tokenizer and encoder
bertweet_model_name = 'vinai/bertweet-base'  # alternative checkpoint; not referenced elsewhere in the visible notebook
num_classes = 2
max_length = 128   # padding/truncation length passed to the tokenizer
batch_size = 16
num_epochs = 4
learning_rate = 1e-5
seed = 42

# Seed PyTorch before the model, the shuffling DataLoader and dropout draw any
# random numbers, so a Restart & Run All repeats the same training run
# (up to nondeterministic GPU kernels).
torch.manual_seed(seed)

In [None]:
# Hold out 20% of the examples for validation; the fixed random_state keeps the
# split identical from run to run.
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

In [None]:
# Tokenizer loaded from the same checkpoint name as the encoder.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Wrap each split in a dataset that tokenizes one example per lookup.
train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = TextClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size)

In [None]:
# Prefer the GPU when one is visible, but fall back to the CPU so the notebook
# still runs (slowly) on machines without CUDA instead of failing on .to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [None]:
# The scheduler is stepped once per batch, so its horizon is the number of
# batches per epoch times the number of epochs.
total_steps = num_epochs * len(train_dataloader)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

# Linear decay of the learning rate to zero over the whole run, no warmup.
# Alternative left disabled by the author:
# get_cosine_with_hard_restarts_schedule_with_warmup with the same arguments.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    # One optimisation pass over the training loader.
    train(model, train_dataloader, optimizer, scheduler, device)
    # Score the held-out split after every epoch.
    val_metrics = evaluate(model, val_dataloader, device)
    accuracy, report = val_metrics
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
Validation Accuracy: 0.8109
              precision    recall  f1-score   support

           0       0.85      0.81      0.83       874
           1       0.76      0.81      0.78       649

    accuracy                           0.81      1523
   macro avg       0.81      0.81      0.81      1523
weighted avg       0.81      0.81      0.81      1523

Epoch 2/4
Validation Accuracy: 0.8299
              precision    recall  f1-score   support

           0       0.86      0.84      0.85       874
           1       0.79      0.81      0.80       649

    accuracy                           0.83      1523
   macro avg       0.83      0.83      0.83      1523
weighted avg       0.83      0.83      0.83      1523

Epoch 3/4
Validation Accuracy: 0.8418
              precision    recall  f1-score   support

           0       0.84      0.90      0.87       874
           1       0.85      0.77      0.81       649

    accuracy                           0.84      1523
   macro avg  

In [None]:
# Persist only the learned weights (the state_dict), not the module object.
# Reloading means constructing a BERTClassifier first and calling load_state_dict.
torch.save(model.state_dict(), "bert_classifier.pth")

In [None]:
# Smoke-test the trained model on a single hand-written input.
test_text = "holocaust"
print(test_text)
sentiment = predict_sentiment(text=test_text, model=model, tokenizer=tokenizer, device=device)
print(f"Predicted sentiment: {sentiment}")

holocaust
Predicted sentiment: 1


In [None]:
# Load the test split; predictions are attached to this frame in the cells below.
df_test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
def predict_target(row):
    """Predict the class for one test-set row from its ``text`` field.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.
    """
    text = row['text']
    return predict_sentiment(text, model, tokenizer, device)


# Row-wise application: one forward pass per test example.
df_test['target'] = df_test.apply(predict_target, axis='columns')

In [None]:
# Distribution of predicted classes across the test set.
df_test['target'].value_counts()

0    1943
1    1320
Name: target, dtype: int64

In [None]:
# Keep only the two submission columns. Selecting them (instead of dropping the
# others in place) makes the cell idempotent: re-running it no longer raises a
# KeyError for columns that were already removed, and a missing 'target'
# column fails loudly here rather than producing a bad submission file.
df_test = df_test[['id', 'target']]

In [None]:
# Sanity-check the frame that is about to be written out as the submission.
df_test.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
df_test.to_csv('submission3.csv', index = False)