In [1]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support
from datasets import Dataset

# Load dataset
file_path = '/kaggle/input/made-up-dataset/train_made_up_dataset.csv'
df = pd.read_csv(file_path)
# Drop the unnecessary column and encode the labels
df = df[['Code', 'Smell']]
label_encoder = LabelEncoder()
df['Smell'] = label_encoder.fit_transform(df['Smell'])

# Split the dataset into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Prepare the datasets for Hugging Face's Dataset format
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

# Tokenizer and model setup
model_name = 'microsoft/codebert-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

# Tokenization function
def tokenize_function(examples):
    encoding = tokenizer(examples['Code'], truncation=True, padding='max_length', max_length=512)
    encoding['labels'] = examples['Smell']
    return encoding

# Tokenize datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

# Define metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='macro')
    return {'f1_macro': f1}

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro'
)

# Trainer 0e92fed1c003f0fdabc054357fbb76beb18ff2dc
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the best model
trainer.save_model('./best_model')


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/307 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112701011112829, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,1.214828,0.805325
2,No log,0.498364,0.941477
3,No log,0.21008,0.946969
4,No log,0.127997,0.978182
5,No log,0.161948,0.946581
6,No log,0.157388,0.942051
7,No log,0.103234,0.965855
8,No log,0.125799,0.942051
9,No log,0.129473,0.942051
10,No log,0.129023,0.942051


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


In [3]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, matthews_corrcoef
from datasets import Dataset


test_df = pd.read_csv("/kaggle/input/made-up-dataset/test_made_up_dataset.csv")
test_df.dropna(inplace=True)
# Prepare the test dataset for Hugging Face's Dataset format
test_dataset = Dataset.from_pandas(test_df)
# Ensure the 'smell' column exists in the test DataFrame
if 'Smell' not in test_df.columns:
    raise ValueError("The test DataFrame does not contain the 'smell' column, which is required for evaluation.")

# Tokenization function with labels included
def tokenize_function(examples):
    encoding = tokenizer(examples['Code'], truncation=True, padding='max_length', max_length=512)
    encoding['labels'] = examples['Smell']  # Assign labels during tokenization
    return encoding

# Tokenize the test dataset
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/96 [00:00<?, ? examples/s]

In [4]:

# Re-run the evaluation process
predictions = trainer.predict(tokenized_test_dataset)
preds = predictions.predictions.argmax(axis=-1)
test_df['Smell'] = label_encoder.transform(test_df['Smell'])

labels = test_df['Smell'].values

# Validate if labels are still None
if labels is None:
    print("Labels are still None, indicating an issue during tokenization.")
else:
    # Calculate metrics if labels are found
    accuracy = accuracy_score(labels, preds)
    precision, recall, f1_macro, _ = precision_recall_fscore_support(labels, preds, average='macro')
    mcc = matthews_corrcoef(labels, preds)

    # Print the results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Macro: {f1_macro:.4f}")
    print(f"MCC: {mcc:.4f}")



  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Accuracy: 0.9271
Precision: 0.9377
Recall: 0.9271
F1 Macro: 0.9267
MCC: 0.9148
