In [None]:
from transformers import  AutoModelForSequenceClassification, AutoTokenizer
# Pretrained DistilBERT checkpoint shared by the tokenizer and the classifier
checkpoint = "distilbert-base-cased"

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Sequence-classification model configured for two labels
classifier = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

In [None]:
# Load dataset
import pandas as pd

# Location of the Excel file that holds the training data (placeholder value)
file_path = "path-to-train_dataset"

# Parse the file with the openpyxl engine into a DataFrame
df = pd.read_excel(io=file_path, engine="openpyxl")

In [None]:
# Encode the labels
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the "class" column, then overwrite that column with the
# encoded values; `label_encoder` is kept so the mapping stays available.
label_encoder = LabelEncoder().fit(df["class"])
df["class"] = label_encoder.transform(df["class"])

In [None]:
# Split dataset as training and evaluation sets
from sklearn.model_selection import train_test_split

# 80 % of the rows go to training; the split is stratified on the encoded
# "class" column and seeded so that it is reproducible.
df_train, df_eval = train_test_split(
    df,
    train_size=0.8,
    random_state=42,
    stratify=df["class"],
)

In [None]:
# Create Hugging Face datasets
from datasets import Dataset, DatasetDict

# Wrap each pandas split in a Dataset and collect both under named splits
split_frames = {"train": df_train, "eval": df_eval}
raw_datasets = DatasetDict(
    {split_name: Dataset.from_pandas(frame) for split_name, frame in split_frames.items()}
)

In [None]:
# Show the split layout, then the train split's features and its first row
train_preview = raw_datasets["train"]
print("Dataset Dict:\n", raw_datasets)
print("\n\nTrain's features:\n", train_preview.features)
print("\n\nFirst row of Train:\n", train_preview[0])

In [None]:
# Make sure the text is string and tokenize the datasets
def _text_as_str(example):
    """Return the example's 'text' value coerced to ``str``."""
    return {'text': str(example['text'])}


def _tokenize(batch):
    """Tokenize a batch of 'text' values with truncation enabled."""
    return tokenizer(batch['text'], truncation=True)


# First pass runs per example, second pass runs on batches
raw_datasets = raw_datasets.map(_text_as_str, batched=False)
tokenized_datasets = raw_datasets.map(_tokenize, batched=True)

In [None]:
# Show the tokenized DatasetDict (splits and their columns)
print(tokenized_datasets)

In [None]:
# Show the first tokenized example of the train split
print(tokenized_datasets["train"][0])

In [None]:
# Remove unnecessary columns if there is any and rename the class column as labels.
# Only columns that actually exist are dropped, and the rename is skipped when it
# has already been applied, so this cell can be re-run without raising.
# Assumes every split has the same columns as "train" (both come from the same frame).
present_columns = tokenized_datasets["train"].column_names
columns_to_drop = [
    name for name in ("text", "__index_level_0__") if name in present_columns
]
if columns_to_drop:
    tokenized_datasets = tokenized_datasets.remove_columns(columns_to_drop)
if "class" in present_columns:
    tokenized_datasets = tokenized_datasets.rename_column("class", "labels")
print(tokenized_datasets)

In [None]:
# Disable parallelism for tokenizers
import os

# Set the flag in this process's environment
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
# Define checkpoint path
checkpoint_dir = os.path.join("/data1/ma2", "checkpoints")

# Create the directory (including parents) only when it is missing, and report
# which of the two cases applied.
if os.path.exists(checkpoint_dir):
    print(f"Checkpoint file already exists: {checkpoint_dir}")
else:
    os.makedirs(checkpoint_dir)
    print(f"Checkpoint file is created: {checkpoint_dir}")

In [None]:
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
import numpy as np
import evaluate

# Set up training arguments and trainer

# Collator built from the tokenizer; it pads the examples of each batch
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training args
training_args = TrainingArguments(
    # Where checkpoints are written, and how many are kept
    output_dir=checkpoint_dir,
    save_strategy="epoch",
    save_total_limit=3,
    # Optimisation schedule
    num_train_epochs=5,
    weight_decay=5e-4,
    # Evaluate once per epoch and restore the best checkpoint by accuracy
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # No external experiment tracker
    report_to="none",
)

# Define metrics for evaluation
# The metric object is loaded lazily on the first call and then reused, instead
# of being loaded again on every evaluation pass.
_glue_mrpc_metric = None


def compute_metrics(eval_preds):
    """Compute the GLUE/MRPC metrics (F1 and accuracy) for one evaluation pass.

    Args:
        eval_preds: a ``(logits, labels)`` pair; the predicted class of each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        Whatever the loaded metric's ``compute`` returns for the predictions
        and reference labels.
    """
    global _glue_mrpc_metric
    if _glue_mrpc_metric is None:
        _glue_mrpc_metric = evaluate.load("glue", "mrpc")  # F1 and Accuracy
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _glue_mrpc_metric.compute(predictions=predictions, references=labels)

# Loss function
# No custom loss is passed to the Trainer: its constructor has no `compute_loss`
# keyword, and `nn` is not imported in this notebook. The classification model
# returns its own loss when a batch contains "labels" (cross-entropy for
# single-label classification), and the Trainer uses that loss.

# Define trainer
trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["eval"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Stop when the monitored metric has not improved for 2 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [None]:
# Start training
# Runs the fine-tuning loop of the trainer defined above; the value returned by
# train() is the cell's last expression, so the notebook displays it.
trainer.train()

In [None]:
# Cross-validation
from sklearn.model_selection import StratifiedKFold
import numpy as np

# Load the data. The text is cast to str, as in the main pipeline, so the
# tokenizer never receives non-string cells.
X = df['text'].astype(str)
y = df['class']

# Cross-validation settings
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_losses = []
eval_losses = []
eval_accuracies = []

for train_index, val_index in kf.split(X, y):
    # kf.split yields positional indices, so select rows with .iloc
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Tokenize the data (each split is padded to its own longest sequence)
    train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
    val_encodings = tokenizer(X_val.tolist(), truncation=True, padding=True)

    # Create datasets from plain Python lists
    train_dataset = Dataset.from_dict({
        'input_ids': train_encodings['input_ids'],
        'attention_mask': train_encodings['attention_mask'],
        'labels': y_train.tolist(),
    })
    val_dataset = Dataset.from_dict({
        'input_ids': val_encodings['input_ids'],
        'attention_mask': val_encodings['attention_mask'],
        'labels': y_val.tolist(),
    })

    # Start every fold from the pretrained checkpoint. Reusing `classifier`
    # would carry over weights already fitted on rows that belong to this
    # fold's validation split (from the earlier training run and from previous
    # folds), which would inflate the validation scores.
    fold_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    # Trainer and training. A fold-local name is used so the notebook-level
    # `trainer` is not overwritten.
    # NOTE(review): all folds share training_args.output_dir, so their
    # checkpoints land in the same directory -- confirm this is acceptable.
    fold_trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
    )

    train_output = fold_trainer.train()

    # Record the results. The last log_history entry is the end-of-training
    # summary, which has no 'loss' key, so the loss is read from the value
    # returned by train() instead.
    train_losses.append(train_output.training_loss)
    eval_result = fold_trainer.evaluate()
    eval_losses.append(eval_result['eval_loss'])
    eval_accuracies.append(eval_result['eval_accuracy'])

# Print the Cross-validation results
print(f"Cross-validation Train Loss: {np.mean(train_losses)}")
print(f"Cross-validation Eval Loss: {np.mean(eval_losses)}")
print(f"Cross-validation Eval Accuracy: {np.mean(eval_accuracies)}")