In [1]:
import sqlite3
import pandas as pd

In [2]:
# Read the whole `correspondence` table from the SQLite file into a DataFrame.
sql_query = "SELECT * FROM correspondence"
conn = sqlite3.connect("../scripts/data/clean/correspondence.db")
df_corr = pd.read_sql_query(sql=sql_query, con=conn)

# Disabled variant: suffix each external name with its source before extraction.
# df_corr["name_external"] = df_corr["name_external"] + ", " + df_corr["source_external"]

# Plain Python lists of the two name columns, in table row order.
arr_bonsai, arr_external = (
    df_corr[column].tolist() for column in ("name_bonsai", "name_external")
)

# Fine-tuning models

In [3]:
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertModel
import torch
from torch.optim import AdamW
from torch.nn.functional import cosine_similarity
import pandas as pd
from torch import Tensor, device
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [4]:
# Name of the pretrained checkpoint used for the tokenizer.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE: this rebinds `device`, which the import cell also pulls in from torch.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Tokenize every external name in a single call, with padding and truncation
# enabled, and ask for PyTorch tensors back.
inputs = tokenizer(
    arr_external,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
input_ids, attention_masks = inputs["input_ids"], inputs["attention_mask"]

# Encode the categorical Bonsai names (one per row of arr_bonsai) as integer
# class ids and wrap them in a tensor for training.
label_encoder = LabelEncoder()
numerical_labels = label_encoder.fit_transform(arr_bonsai)
labels_tensor = torch.tensor(numerical_labels)


In [56]:
# Hold out 10% of the rows for validation.
# Token ids, labels and attention masks are split in ONE call so that the three
# pieces belonging to a row are guaranteed to land in the same partition.
# Splitting the masks in a separate call only stays aligned as long as the seed
# and test_size of both calls are kept identical by hand.
(
    train_external, validation_external,
    train_bonsai, validation_bonsai,
    train_masks, validation_masks,
) = train_test_split(
    input_ids,
    labels_tensor,
    attention_masks,
    random_state=2018,  # fixed seed: the same rows are held out on every run
    test_size=0.1,
)

In [61]:
# Each dataset item is a (token ids, label, attention mask) triple; the loops
# that consume these loaders unpack batches in exactly this order.
train_data = TensorDataset(train_external, train_bonsai, train_masks)
# Training batches are reshuffled every epoch.
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(validation_external, validation_bonsai, validation_masks)
# Evaluation does not benefit from shuffling: iterate in a fixed order so the
# collected predictions are reproducible and no random numbers are consumed.
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)

In [62]:
# Pretrained BERT encoder with a new classification head that has one output
# per distinct Bonsai name.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(arr_bonsai)))
# Move the model to the device selected earlier (GPU if available, else CPU).
# An unconditional model.cuda() would raise on machines without CUDA, and the
# training loop sends its batches to `device`, so both must agree.
model.to(device)

# Set up the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
# Fine-tune for a fixed number of epochs. Every epoch runs one optimisation
# pass over the training batches, then one inference pass over the validation
# batches.
epochs = 4
for _ in range(epochs):
    # ---- training pass ----
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Move the whole batch to the compute device before unpacking it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels, b_attention_masks = batch

        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        # Labels are passed in, and the loss is read from the returned outputs.
        outputs = model(
            b_input_ids,
            attention_mask=b_attention_masks,
            labels=b_labels,
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- validation pass ----
    model.eval()
    # Both lists are re-created every epoch, so once the outer loop finishes
    # they hold the results of the final epoch only.
    predictions, true_labels = [], []
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels, b_attention_masks = batch
        # Inference only: no labels are passed and gradient tracking is off.
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_attention_masks)

        # Bring raw scores and reference labels back to the CPU as NumPy arrays.
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = b_labels.cpu().numpy()

        predictions.append(logits)
        true_labels.append(label_ids)



In [64]:
# Merge the per-batch results into one logits matrix (one row per sample) and
# one flat label vector.
# vstack/hstack are used instead of np.concatenate so this cell can be re-run:
# the two names are overwritten with arrays here, and stacking an
# already-merged array returns it unchanged, whereas concatenating it again
# would fail or change its shape.
predictions = np.vstack(predictions)
true_labels = np.hstack(true_labels)

# Predicted class = index of the highest logit in each row.
predicted_labels = np.argmax(predictions, axis=1)
# Fraction of validation samples whose predicted class equals the true class.
accuracy = accuracy_score(true_labels, predicted_labels)
accuracy

0.045454545454545456

In [13]:
# Persist the model's state dict (not the model object itself) to disk.
torch.save(
    obj=model.state_dict(),
    f='bert-base-uncased_fine_tuning_state_dict.pt',
)

In [14]:
# Initialize the model with the same checkpoint name and number of output
# classes that were used for fine-tuning, so the saved weights fit.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(arr_bonsai)))

# Load the state dictionary.
# map_location remaps the stored tensors onto the currently selected device, so
# a checkpoint written from a GPU also loads on a CPU-only machine.
model.load_state_dict(
    torch.load('bert-base-uncased_fine_tuning_state_dict.pt', map_location=device)
)

# Make sure to call model.to(device) to move the model to the desired device
# (kept as the last expression, so the cell still displays the model summary).
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,