In [1]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
)

from datasets import load_dataset
from tqdm.autonotebook import tqdm
import torch
import numpy as np
import pandas as pd
import os
import evaluate

In [2]:
os.environ['WANDB_DISABLED'] = 'true'

In [3]:
bnb_config = BitsAndBytesConfig(
		load_in_4bit=True,
		bnb_4bit_quant_type='nf4',
		bnb_4bit_compute_dtype=torch.float16,
		bnb_4bit_use_double_quant=False,
	)

In [4]:
model_name='microsoft/phi-2'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"":0},
    trust_remote_code=True,
    num_labels=3,
    low_cpu_mem_usage=True
)

model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model.config.pad_token_id = tokenizer.pad_token_id

In [6]:
def select_samples(dataset, step):
    dataset = dataset.select(range(0, len(dataset), step))
    return dataset

dataset = load_dataset('stanfordnlp/snli')
dataset['train'] = select_samples(dataset['train'], 550)
dataset['validation'] = select_samples(dataset['validation'], 100)
dataset['test'] = select_samples(dataset['test'], 100)

### Training

In [7]:
def get_inference(sample):
	premise, hypothesis, label = sample['premise'], sample['hypothesis'], sample['label']
	inputs = tokenizer(premise, hypothesis, return_tensors='pt', padding='max_length', truncation=True, max_length=128).to(model.device)
	with torch.no_grad():
		outputs = model(**inputs)
		logits = outputs.logits
		predictions = torch.argmax(logits, dim=1).item()
            
	return predictions, label

get_inference(dataset['train'][5])

(2, 0)

In [8]:
metric = evaluate.load("accuracy")

def compute_accuracy(dataset):
	predictions = []
	labels = []
	for sample in tqdm(dataset):
		prediction, label = get_inference(sample)
		predictions.append(prediction)
		labels.append(label)
	

	return metric.compute(predictions=predictions, references=labels)

In [11]:
compute_accuracy(dataset['train'])

  0%|          | 0/1001 [00:00<?, ?it/s]

{'accuracy': 0.3276723276723277}

In [9]:
compute_accuracy(dataset['validation'])

  0%|          | 0/100 [00:00<?, ?it/s]

{'accuracy': 0.31}

In [10]:
compute_accuracy(dataset['test'])

  0%|          | 0/100 [00:00<?, ?it/s]

{'accuracy': 0.37}

In [9]:
predictions = []
labels = []

for sample in tqdm(dataset['test']):
	prediction, label = get_inference(sample)
	predictions.append(prediction)
	labels.append(label)

# save a csv file with the predictions and index
df = pd.DataFrame({'predictions': predictions, 'labels': labels})
df.to_csv('train_predictions.csv')

  0%|          | 0/100 [00:00<?, ?it/s]