In [1]:
from transformers import  AutoModelForSequenceClassification, AutoTokenizer
# Checkpoint name shared by the tokenizer and the classification model
checkpoint = "distilbert-base-cased"

# Tokenizer that belongs to the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Pretrained encoder with a two-label sequence-classification head on top
classifier = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
# Read the training spreadsheet into a DataFrame (openpyxl parses the workbook)
import pandas as pd

file_path = "path-to-train_dataset"
df = pd.read_excel(io=file_path, engine="openpyxl")

In [15]:
# Replace the values of the "class" column with integer codes from a LabelEncoder
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
encoded_classes = label_encoder.fit_transform(df["class"])
df["class"] = encoded_classes

In [16]:
# 80/20 train/eval split, stratified on the class column, with a fixed seed
from sklearn.model_selection import train_test_split

df_train, df_eval = train_test_split(
    df,
    stratify=df["class"],
    train_size=0.8,
    random_state=42,
)

In [17]:
# Wrap both pandas splits as Hugging Face datasets under the keys "train" / "eval"
from datasets import Dataset, DatasetDict

raw_datasets = DatasetDict(
    {
        split_name: Dataset.from_pandas(split_frame)
        for split_name, split_frame in (("train", df_train), ("eval", df_eval))
    }
)

In [18]:
# Sanity check: print the dataset container, the train split's feature schema
# and the first training example.
print("Dataset Dict:\n", raw_datasets)
print("\n\nTrain's features:\n", raw_datasets["train"].features)
print("\n\nFirst row of Train:\n", raw_datasets["train"][0])

Dataset Dict:
 DatasetDict({
    train: Dataset({
        features: ['text', 'class', '__index_level_0__'],
        num_rows: 93349
    })
    eval: Dataset({
        features: ['text', 'class', '__index_level_0__'],
        num_rows: 23338
    })
})


Train's features:
 {'text': Value(dtype='string', id=None), 'class': Value(dtype='int64', id=None), '__index_level_0__': Value(dtype='int64', id=None)}


First row of Train:
 {'text': "Feeling done with life rnNo friends, no family that cares about me, nothing just emptiness. No happiness, no smiles just fake emotions. Thinking about ending it all soon. I've tried once before and it didn't work. Can't even kill myself right  I can't do anything right.", 'class': 1, '__index_level_0__': 89440}


In [19]:
# Coerce every "text" value to str, one example at a time
raw_datasets = raw_datasets.map(
    lambda example: {"text": str(example["text"])},
    batched=False,
)

# Tokenize in batches with truncation enabled; no padding is requested here
tokenized_datasets = raw_datasets.map(
    lambda batch: tokenizer(batch["text"], truncation=True),
    batched=True,
)

Map:   0%|          | 0/93349 [00:00<?, ? examples/s]

Map:   0%|          | 0/23338 [00:00<?, ? examples/s]

Map:   0%|          | 0/93349 [00:00<?, ? examples/s]

Map:   0%|          | 0/23338 [00:00<?, ? examples/s]

In [20]:
# Show both splits after tokenization, including the feature names and row counts
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['text', 'class', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 93349
    })
    eval: Dataset({
        features: ['text', 'class', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 23338
    })
})


In [21]:
# Show the first tokenized training example (all columns of row 0)
print(tokenized_datasets["train"][0])

{'text': "Feeling done with life rnNo friends, no family that cares about me, nothing just emptiness. No happiness, no smiles just fake emotions. Thinking about ending it all soon. I've tried once before and it didn't work. Can't even kill myself right  I can't do anything right.", 'class': 1, '__index_level_0__': 89440, 'input_ids': [101, 13085, 1694, 1114, 1297, 187, 1179, 2249, 1186, 2053, 117, 1185, 1266, 1115, 16903, 1164, 1143, 117, 1720, 1198, 27781, 119, 1302, 9266, 117, 1185, 8402, 1198, 8406, 6288, 119, 16204, 1164, 3830, 1122, 1155, 1770, 119, 146, 112, 1396, 1793, 1517, 1196, 1105, 1122, 1238, 112, 189, 1250, 119, 2825, 112, 189, 1256, 2311, 1991, 1268, 146, 1169, 112, 189, 1202, 1625, 1268, 119, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [22]:
# Drop the raw text and the pandas index column, then expose the target column
# under the name "labels"
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text", "__index_level_0__"])
    .rename_column("class", "labels")
)
print(tokenized_datasets)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 93349
    })
    eval: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 23338
    })
})


In [23]:
# Switch off tokenizer parallelism through its environment variable
import os

os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [11]:
# Directory used as the Trainer's output/checkpoint location
checkpoint_dir = os.path.join("/data1/ma2", "checkpoints")

# Create the directory if it is missing and report which case applied
if os.path.exists(checkpoint_dir):
    print(f"Checkpoint file already exists: {checkpoint_dir}")
else:
    os.makedirs(checkpoint_dir)
    print(f"Checkpoint file is created: {checkpoint_dir}")

In [24]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np
import evaluate

# Collator handed to the Trainer; it pads the examples of a batch when the batch
# is assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training configuration: evaluate and checkpoint once per epoch, keep at most
# three checkpoints, and reload the checkpoint with the best accuracy at the end
training_args = TrainingArguments(
    output_dir=checkpoint_dir,
    report_to="none",
    num_train_epochs=5,
    weight_decay=5e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Metric callback used by the Trainer during evaluation
def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: Pair that unpacks into ``(logits, labels)``.

    Returns:
        Whatever the metric's ``compute`` returns for the arg-max predictions
        against ``labels``.
    """
    glue_mrpc = evaluate.load("glue", "mrpc")  # F1 and Accuracy
    logits, labels = eval_preds
    predicted_classes = np.argmax(logits, axis=-1)
    return glue_mrpc.compute(predictions=predicted_classes, references=labels)

# Loss function: no custom loss is wired in.
# - `Trainer.__init__` has no `compute_loss` keyword, so passing one raises a
#   TypeError, and `nn` was never imported in this notebook.
# - The sequence-classification model already returns a cross-entropy loss when
#   the batch contains integer `labels`, which is what the Trainer optimizes.

# Define trainer
trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop early once the monitored metric (metric_for_best_model in
    # training_args) has failed to improve for two evaluations in a row
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [25]:
# Run fine-tuning with the Trainer built in the previous cell; the epoch count,
# evaluation/save schedule and early stopping come from its configuration.
trainer.train()

Epoch,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 10.02 GB, other allocations: 8.11 GB, max allowed: 18.13 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
# Cross-validation: 5-fold stratified CV that fine-tunes a fresh model per fold
# and averages training loss, evaluation loss and evaluation accuracy.
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Load the data
X = df['text']
y = df['class']

# Cross-validation settings
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_losses = []
eval_losses = []
eval_accuracies = []

for train_index, val_index in kf.split(X, y):
    # kf.split yields positional indices, so select rows by position (.iloc)
    # instead of relying on the index labels happening to match positions
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Tokenize the data. Text is cast to str so the tokenizer only sees strings.
    # No padding here: the data collator pads each batch, which avoids padding
    # the whole fold to its longest sequence.
    train_encodings = tokenizer(X_train.astype(str).tolist(), truncation=True)
    val_encodings = tokenizer(X_val.astype(str).tolist(), truncation=True)

    # Create datasets from plain Python lists
    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': y_train.tolist(),
    })
    val_dataset = Dataset.from_dict({
        'input_ids': val_encodings['input_ids'],
        'attention_mask': val_encodings['attention_mask'],
        'labels': y_val.tolist(),
    })

    # Start every fold from the pretrained checkpoint. Reusing one model across
    # folds would keep training it on rows that later serve as validation data,
    # which leaks information and inflates the scores.
    fold_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    # Trainer and training
    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    train_output = trainer.train()

    # Record the results. The value returned by train() carries the training
    # loss; the last log_history entry is the training summary and has no
    # 'loss' key.
    train_losses.append(train_output.training_loss)
    eval_result = trainer.evaluate()
    eval_losses.append(eval_result['eval_loss'])
    eval_accuracies.append(eval_result['eval_accuracy'])

# Print the Cross-validation results
print(f"Cross-validation Train Loss: {np.mean(train_losses)}")
print(f"Cross-validation Eval Loss: {np.mean(eval_losses)}")
print(f"Cross-validation Eval Accuracy: {np.mean(eval_accuracies)}")