In [1]:
import os, random, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("CWD:", Path().resolve())

CWD: C:\Users\Nolan\Documents\My dokuments\GoIT\Deep_learning\petfinder_project\kaggle_notebooks


In [3]:
print("Before:", Path().resolve())
os.chdir("..")
print("After :", Path().resolve())

Before: C:\Users\Nolan\Documents\My dokuments\GoIT\Deep_learning\petfinder_project\kaggle_notebooks
After : C:\Users\Nolan\Documents\My dokuments\GoIT\Deep_learning\petfinder_project


In [4]:
COMP_PATH = Path("../petfinder_project/data/raw")
FOLDS_PATH = Path("../petfinder_project/data/artifacts/train_folds.csv") 

train_csv = COMP_PATH / "train.csv"
test_csv = COMP_PATH / "test.csv"

print("train exists:", train_csv.exists())
print("test exists :", test_csv.exists())
print("folds exists:", FOLDS_PATH.exists())

train exists: True
test exists : True
folds exists: True


In [5]:
train_df = pd.read_csv(train_csv)
test_df  = pd.read_csv(test_csv)
folds_df = pd.read_csv(FOLDS_PATH)

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("Folds shape:", folds_df.shape)

Train shape: (6431, 3)
Test shape : (1891, 2)
Folds shape: (6431, 3)


In [56]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6431 entries, 0 to 6430
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   PetID          6431 non-null   object
 1   Description    6426 non-null   object
 2   AdoptionSpeed  6431 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 150.9+ KB


In [57]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1891 entries, 0 to 1890
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PetID        1891 non-null   object
 1   Description  1890 non-null   object
dtypes: object(2)
memory usage: 29.7+ KB


In [58]:
train_df["Description"] = train_df["Description"].fillna("")
test_df["Description"] = test_df["Description"].fillna("")

In [59]:
print(train_df["AdoptionSpeed"].value_counts().sort_index())


AdoptionSpeed
1    1197
2    1773
3    1328
4    2133
Name: count, dtype: int64


In [60]:
print("train dataset:")
print(train_df.head(), "\n")
print("test dataset:")
print(test_df.head(), "\n")
print("folds dataset:")
print(folds_df.head())

train dataset:
       PetID                                        Description  AdoptionSpeed
0  d3b4f29f8  Mayleen and Flo are two lovely adorable sister...              2
1  e9dc82251  A total of 5 beautiful Tabbys available for ad...              2
2  8111f6d4a  Two-and-a-half month old girl. Very manja and ...              2
3  693a90fda  Neil is a healthy and active ~2-month-old fema...              2
4  9d08c85ef  Gray kitten available for adoption in sungai p...              2 

test dataset:
       PetID                                        Description
0  6697a7f62  This cute little puppy is looking for a loving...
1  23b64fe21  These 3 puppies was rescued from a mechanic sh...
2  41e824cbe  Ara needs a forever home! Believe me, he's a r...
3  6c3d7237b  i rescue this homeless dog 2 years ago but my ...
4  97b0b5d92  We found him at a shopping mall at a very clea... 

folds dataset:
       PetID  AdoptionSpeed  fold
0  d3b4f29f8              2     3
1  e9dc82251              

In [61]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [None]:
model_name = "distilbert-base-cased"
num_classes = 4

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: 7b321ad5-ad70-4741-8cee-d0e726c935fb)')' thrown while requesting HEAD https://huggingface.co/distilbert-base-cased/resolve/main/config.json
Retrying in 1s [Retry 1/5].
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
text = train_df.loc[0, "Description"]  
inputs = tokenizer(text)
tokenizer.decode(inputs["input_ids"])

'[CLS] Mayleen and Flo are two lovely adorable sisters. They are very friendly and affectionate, but wary of strangers and make good watchdogs. Mayleen has golden hues on her face, making her a husky look - alike. Flo has a darker face with brown feet, and is the more outgoing and dominat of the two. Looking for good homes. Adopters must vaccinate and spay them. [SEP]'

In [64]:
inputs = tokenizer( 
    text, 
    max_length=200, 
    truncation=True,
    return_overflowing_tokens=True, 
) 
  
for ids in inputs["input_ids"]: 
    print(tokenizer.decode(ids)); print()

[CLS] Mayleen and Flo are two lovely adorable sisters. They are very friendly and affectionate, but wary of strangers and make good watchdogs. Mayleen has golden hues on her face, making her a husky look - alike. Flo has a darker face with brown feet, and is the more outgoing and dominat of the two. Looking for good homes. Adopters must vaccinate and spay them. [SEP]



In [65]:
inputs = tokenizer( 
    text, 
    max_length=200, 
    truncation=True, 
    return_overflowing_tokens=True, 
    return_offsets_mapping=True, 
 ) 
inputs.keys()
print(inputs.keys())
print("num chunks:", len(inputs["input_ids"]))

KeysView({'input_ids': [[101, 1318, 21180, 1105, 143, 2858, 1132, 1160, 9020, 27627, 5919, 119, 1220, 1132, 1304, 4931, 1105, 12721, 2193, 117, 1133, 16970, 1104, 15712, 1105, 1294, 1363, 2824, 14082, 1116, 119, 1318, 21180, 1144, 5404, 177, 10589, 1113, 1123, 1339, 117, 1543, 1123, 170, 24418, 1440, 118, 11609, 119, 143, 2858, 1144, 170, 9934, 1339, 1114, 3058, 1623, 117, 1105, 1110, 1103, 1167, 25194, 1105, 1202, 14503, 1204, 1104, 1103, 1160, 119, 8540, 1111, 1363, 4481, 119, 24930, 4184, 5759, 1538, 191, 7409, 16430, 2193, 1105, 22620, 1183, 1172, 119, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 3), (3, 7), (8, 11), (12, 13), (13, 15), (16, 19), (20, 23), (24, 30), (31, 39), (40, 47), (47, 48), (4

In [66]:
lengths = train_df["Description"].fillna("").astype(str).apply(
    lambda x: len(tokenizer(x, add_special_tokens=True, truncation=False)["input_ids"])
)

print("count:", lengths.shape[0])
print("min  :", lengths.min())
print("p50  :", int(lengths.quantile(0.50)))
print("p90  :", int(lengths.quantile(0.90)))
print("p95  :", int(lengths.quantile(0.95)))
print("p99  :", int(lengths.quantile(0.99)))
print("max  :", lengths.max())


Token indices sequence length is longer than the specified maximum sequence length for this model (1173 > 512). Running this sequence through the model will result in indexing errors


count: 6431
min  : 2
p50  : 66
p90  : 199
p95  : 287
p99  : 557
max  : 1487


In [67]:
class PetFinderDataset(Dataset):
    def __init__(self, texts, labels=None, max_length=287):
        self.texts = texts
        self.labels = labels
        self.max_length = max_length
       
    def __len__(self):
        return len(self.texts)
   
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = tokenizer(
            text,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
       
        item = {key: val.squeeze(0) for key, val in inputs.items()}
       
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
       
        return item

In [69]:
train_df["Description"].isna().sum()

np.int64(0)

In [80]:
batch_size = 16
max_length = 288
n_folds = 5
num_workers = 0 
epochs = 3
lr = 2e-5     
pin_memory = False

In [81]:
master_df = train_df.merge(
    folds_df[["PetID", "fold"]],
    on="PetID",
    how="left"
)
print("shape:", master_df.shape)
print("missing fold:", master_df["fold"].isna().sum())


shape: (6431, 4)
missing fold: 0


In [91]:
master_df["label"] = master_df["AdoptionSpeed"] - 1  # Convert to 0-3
print(master_df["AdoptionSpeed"].min(), master_df["AdoptionSpeed"].max())
print(master_df["label"].min(), master_df["label"].max())


1 4
0 3


In [97]:
for fold in range(n_folds):
    print("№ Fold:", fold)

    train = master_df[master_df["fold"] != fold]
    val = master_df[master_df["fold"] == fold]

    train_texts = train["Description"].values
    train_labels = train["label"].values

    val_texts = val["Description"].values
    val_labels = val["label"].values

    train_dataset = PetFinderDataset(
        texts=train_texts,
        labels=train_labels,
        max_length=max_length
    )

    val_dataset = PetFinderDataset(
        texts=val_texts,
        labels=val_labels,
        max_length=max_length
    )

    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=pin_memory
    )

    print("Train batches:", len(train_loader))
    print(" Val batches:", len(val_loader), "\n")

№ Fold: 0
Train batches: 322
 Val batches: 81 

№ Fold: 1
Train batches: 322
 Val batches: 81 

№ Fold: 2
Train batches: 322
 Val batches: 81 

№ Fold: 3
Train batches: 322
 Val batches: 81 

№ Fold: 4
Train batches: 322
 Val batches: 81 



In [98]:
batch = next(iter(train_loader))

for k, v in batch.items():
    print(k, v.shape, v.dtype)


input_ids torch.Size([16, 288]) torch.int64
attention_mask torch.Size([16, 288]) torch.int64
labels torch.Size([16]) torch.int64


In [99]:
print("labels min/max:", batch["labels"].min().item(), batch["labels"].max().item())
print("unique (first 50):", torch.unique(batch["labels"])[:50])


labels min/max: 0 3
unique (first 50): tensor([0, 1, 2, 3])


In [100]:
model.eval()

with torch.no_grad():
    outputs = model(
        input_ids=batch["input_ids"].to(device),
        attention_mask=batch["attention_mask"].to(device),
        labels=batch["labels"].to(device),
    )

print("loss:", outputs.loss)
print("logits shape:", outputs.logits.shape)


loss: tensor(1.4373)
logits shape: torch.Size([16, 4])


In [101]:
model.train()
optimizer = AdamW(model.parameters(), lr=lr)

batch = next(iter(train_loader))
outputs = model(
    input_ids=batch["input_ids"].to(device),
    attention_mask=batch["attention_mask"].to(device),
    labels=batch["labels"].to(device),
)
loss = outputs.loss

optimizer.zero_grad()
loss.backward()
optimizer.step()

print("train step loss:", loss.item())

train step loss: 1.456268548965454


In [None]:
def evaluate_qwk(model, val_loader, device):
    model.eval()
    all_preds = []
    all_labels = []