<a href="https://colab.research.google.com/github/nh273/WRaaS/blob/main/notebooks/Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 43.0 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 47.4 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninsta

In [2]:
!pip install datasets

Collecting datasets
  Downloading datasets-1.11.0-py3-none-any.whl (264 kB)
[K     |████████████████████████████████| 264 kB 4.9 MB/s 
Collecting tqdm>=4.42
  Downloading tqdm-4.62.0-py2.py3-none-any.whl (76 kB)
[K     |████████████████████████████████| 76 kB 3.8 MB/s 
Collecting fsspec>=2021.05.0
  Downloading fsspec-2021.7.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 9.2 MB/s 
Collecting xxhash
  Downloading xxhash-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 10.5 MB/s 
Installing collected packages: tqdm, xxhash, fsspec, datasets
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.41.1
    Uninstalling tqdm-4.41.1:
      Successfully uninstalled tqdm-4.41.1
Successfully installed datasets-1.11.0 fsspec-2021.7.0 tqdm-4.62.0 xxhash-2.0.2


In [3]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset and model paths used
# later in this notebook live under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [4]:
import os

import numpy as np
import pandas as pd

import torch
from datasets import load_metric
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import (DistilBertTokenizerFast,
                          DistilBertForSequenceClassification, Trainer, TrainingArguments)
from transformers.trainer_utils import get_last_checkpoint


In [5]:
# Root of the project folder on the mounted Drive.
drive_path = '/content/drive/MyDrive/WRaaS'

# Read the raw reviews CSV from the project's raw/ folder.
raw_csv_path = drive_path + '/raw/winemag-data-130k-v2.csv'
df = pd.read_csv(raw_csv_path, index_col=None)

def _filter_classes_with_too_few_samples(df, label_col, threshold):
    count = df[label_col].value_counts()
    classes_to_keep = count[count > threshold].index
    print(f"Filtering to {len(classes_to_keep)} classes")
    return df[df[label_col].isin(classes_to_keep)]

def _prep_data(df, target_col, feature_col, class_threshold, encoder=None):
    """Reduce a raw frame to a de-duplicated (label, text) table with integer labels.

    Steps, in order: drop classes via `_filter_classes_with_too_few_samples`,
    keep only the target and feature columns, drop rows with missing values and
    exact duplicate rows, then replace the target column with integer codes.

    Args:
        df: Input DataFrame; it is not modified.
        target_col: Name of the label column.
        feature_col: Name of the feature (text) column.
        class_threshold: Passed to `_filter_classes_with_too_few_samples`; a
            class needs strictly more rows than this (counted before the
            NaN/duplicate removal) to be kept.
        encoder: Optional object with a `fit_transform` method (e.g. a
            `LabelEncoder`). Pass one in to keep a handle on the fitted
            mapping so integer predictions can be decoded back to label
            names. A fresh `LabelEncoder` is created when omitted.

    Returns:
        A new DataFrame with columns [target_col, feature_col] in which
        target_col holds the integer codes produced by the encoder.
    """
    if encoder is None:
        encoder = LabelEncoder()
    filtered = _filter_classes_with_too_few_samples(df, target_col, class_threshold)
    # Explicit copy: the chain ends in a filtering operation, and writing a
    # column into such a result can trigger pandas' SettingWithCopyWarning.
    prepared = filtered[[target_col, feature_col]].dropna().drop_duplicates().copy()
    prepared[target_col] = encoder.fit_transform(prepared[target_col])
    return prepared

# Keep the fitted encoder around: label_encoder.classes_[i] is the variety name
# for class id i, which is needed to turn model predictions back into names.
label_encoder = LabelEncoder()
df = _prep_data(df, 'variety', 'description', 500, encoder=label_encoder)


Filtering to 40 classes


In [6]:
# Fixed seed so the train/validation/test membership is identical on every run.
# This matters beyond reproducibility: training below resumes from saved
# checkpoints, so a different random split on a later run would let rows the
# model already trained on land in the validation or test set.
SPLIT_SEED = 42

# First carve off 20% as the test set, then 20% of the remainder as validation.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['description'].tolist(), df['variety'].tolist(), test_size=.2,
    random_state=SPLIT_SEED)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=.2, random_state=SPLIT_SEED)

In [7]:
# Load the fast DistilBERT tokenizer for the pretrained checkpoint named below.
pretrained_tokenizer_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_tokenizer_name)


Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

In [8]:
def _tokenize_split(texts):
    """Run the shared tokenizer over one split with truncation and padding enabled."""
    return tokenizer(texts, truncation=True, padding=True)

# Each split is tokenized independently, in train / validation / test order.
train_encodings = _tokenize_split(train_texts)
val_encodings = _tokenize_split(val_texts)
test_encodings = _tokenize_split(test_texts)

In [9]:
class WineDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` must expose `.items()` yielding (field name, per-example
    values) pairs, and `labels` must be an indexable sequence; example `i` is
    built from position `i` of every field and of `labels`.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is defined by the labels sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the target under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a WineDataset, in
# train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    WineDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [10]:
# Accuracy metric object shared by every evaluation call below.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level accuracy metric.

    Args:
        eval_pred: A two-item iterable of (logits, labels); the predicted
            class for each row is the argmax of the logits over the last axis.

    Returns:
        Whatever `metric.compute` returns for those predictions and labels.
    """
    scores, references = eval_pred
    return metric.compute(
        predictions=np.argmax(scores, axis=-1),
        references=references,
    )

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

In [11]:
# Size the classification head from the data instead of hard-coding it: the
# 'variety' column holds integer-encoded class ids at this point, so the number
# of distinct values is the number of classes that survived filtering.
num_labels = df['variety'].nunique()

training_args = TrainingArguments(
    output_dir=drive_path + '/models',  # checkpoints go next to the data on Drive
    num_train_epochs=3,
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluate on the validation set after every epoch
)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=num_labels)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Resume from the newest checkpoint in output_dir when one exists, otherwise
# start from scratch. Passing resume_from_checkpoint=True unconditionally makes
# the very first run fail because there is nothing to resume from yet.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

  0%|          | 0/3736 [00:00<?, ?it/s]

Epoch,Training Loss,Validation Loss,Accuracy
3,0.7897,0.651977,0.799092


***** Running Evaluation *****
  Num examples = 16525
  Batch size = 64


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=12396, training_loss=0.022375260737296957, metrics={'train_runtime': 10410.1111, 'train_samples_per_second': 19.048, 'train_steps_per_second': 1.191, 'total_flos': 9669541631486880.0, 'train_loss': 0.022375260737296957, 'epoch': 3.0})

In [12]:
# Score the held-out test split with the existing trainer. Passing
# eval_dataset overrides the validation set for this call only, so there is no
# need to build a second Trainer (and rebind `trainer`) just to swap datasets.
trainer.evaluate(eval_dataset=test_dataset)

***** Running Evaluation *****
  Num examples = 20656
  Batch size = 64


{'eval_accuracy': 0.7936192873741286,
 'eval_loss': 0.6803699731826782,
 'eval_runtime': 5419.3098,
 'eval_samples_per_second': 3.812,
 'eval_steps_per_second': 0.06}