In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np

# Load the combined narrative data
data_dir = '/content/drive/MyDrive/Colab Notebooks/Berkeley/266/FinalProject/Data/'
original_data = os.path.join(data_dir, 'sentences_with_labels.csv')

df = pd.read_csv(original_data)

# Fix Nan to ""
df = df.fillna("")

display(df.head())

print(df.shape)

Unnamed: 0,sentence1,sentence2,sentence3,sentence4,sentence5,mistral_sentence5,llama_sentence5,label
0,Frank had been drinking beer.,"He got a call from his girlfriend, asking wher...",Frank suddenly realized he had a date that night.,"Since Frank was already a bit drunk, he could ...",Frank spent the rest of the night drinking mor...,Frank called an Uber and was able to make it t...,"He called a taxi to pick him up, but it took a...",0
1,Dave was in the Bahamas on vacation.,He decided to go snorkeling on his second day.,"While snorkeling, he saw a cave up ahead.","He went into the cave, and he was terrified wh...","Dave swam away as fast as he could, but the sh...","Dave quickly swam back to the surface, scared ...",He swam back to the surface and quickly paddle...,1
2,Sunny enjoyed going to the beach.,"As she stepped out of her car, she realized sh...",It was quite sunny and she forgot her sunglasses.,Sunny got back into her car and heading toward...,Sunny found some sunglasses and headed back to...,Sunny bought a new pair of sunglasses at the m...,She went to the mall to buy a new pair of sung...,0
3,Sally was happy when her widowed mom found a n...,She discovered her siblings didn't feel the same.,Sally flew to visit her mom and her mom's new ...,"Although her mom was obviously in love, he was...",Sally went home and wondered about her parents...,"Although Sally struggled at first, she eventua...","Her mom's new husband seemed kind, but Sally n...",1
4,Dan hit his golf ball and watched it go.,The ball bounced on the grass and into the san...,Dan pretended that his ball actually landed on...,His friends were not paying attention so they ...,Dan snuck a ball on the green and made his put...,Dan's friends eventually noticed that his ball...,"His friends were impressed by his fake shot, b...",0


(40000, 8)


In [3]:
!pip install transformers datasets -q

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch

In [5]:
# Load your dataset
df = df[['sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5',
         'mistral_sentence5', 'llama_sentence5', 'label']]

# Combine sentences into single inputs
def prepare_inputs(row):
    """
    Combine the context sentences with each possible ending into a format suitable for BERT.
    """
    mistral_input = f"{row['sentence1']} {row['sentence2']} {row['sentence3']} {row['sentence4']} {row['mistral_sentence5']}"
    llama_input = f"{row['sentence1']} {row['sentence2']} {row['sentence3']} {row['sentence4']} {row['llama_sentence5']}"
    return mistral_input, llama_input

df['mistral_input'], df['llama_input'] = zip(*df.apply(prepare_inputs, axis=1))

# Split dataset into train and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [6]:
# Tokenizer and preprocessing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
def tokenize_function(examples):
    return tokenizer(
        examples['mistral_input'],
        examples['llama_input'],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors='pt'
    )

# Convert pandas DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['mistral_input', 'llama_input', 'label']])
val_dataset = Dataset.from_pandas(val_df[['mistral_input', 'llama_input', 'label']])

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# Remove unnecessary columns
train_dataset = train_dataset.remove_columns(['mistral_input', 'llama_input'])
val_dataset = val_dataset.remove_columns(['mistral_input', 'llama_input'])

# Set format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

In [8]:
# Model
def get_model():
  return BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
#model = get_model()

In [9]:
def get_trainer(
    model,
    train_dataset,
    val_dataset,
    tokenizer,
    learning_rate=2e-5,
    batch_size=8,
    weight_decay=0.01,
    num_train_epochs=5
):
    training_args = TrainingArguments(
        output_dir='./results',
        eval_strategy='epoch',
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        save_strategy='epoch',
        logging_dir='./logs',
        logging_steps=10,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
        push_to_hub=False,
        report_to=["none"],  # Disable W&B logging
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = torch.argmax(torch.tensor(logits), dim=-1)
        accuracy = (predictions == labels).float().mean().item()
        return {'accuracy': accuracy}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    return trainer


In [10]:
results = {'LearningRate': [], 'BatchSize': [], 'Loss': [], 'Accuracy': []}
learning_rate = [1e-5, 2e-5]
batch_size = [8, 16, 32]

for lr in learning_rate:
    for bs in batch_size:
        print(f"Testing: LR={lr}, BS={bs}")

        # Initialize a new model for each configuration
        model = get_model()

        # Create the trainer with the specified hyperparameters
        trainer = get_trainer(
            model=model,
            train_dataset=train_dataset,
            val_dataset=val_dataset,
            tokenizer=tokenizer,
            learning_rate=lr,
            batch_size=bs
        )

        # Train the model
        trainer.train()

        # Evaluate the model
        eval_results = trainer.evaluate()

        # Log results
        print(f"Validation Results for LR={lr}, BS={bs}: {eval_results}")
        results['LearningRate'].append(lr)
        results['BatchSize'].append(bs)
        results['Loss'].append(eval_results.get('eval_loss', None))
        results['Accuracy'].append(eval_results.get('eval_accuracy', None))


Testing: LR=1e-05, BS=8


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6641,0.645657,0.616375
2,0.5119,0.643939,0.632625
3,0.4967,0.715385,0.622875
4,0.4169,0.892362,0.625
5,0.2711,1.186266,0.622875


Validation Results for LR=1e-05, BS=8: {'eval_loss': 0.6439393162727356, 'eval_accuracy': 0.6326249837875366, 'eval_runtime': 23.1353, 'eval_samples_per_second': 345.792, 'eval_steps_per_second': 43.224, 'epoch': 5.0}
Testing: LR=1e-05, BS=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6629,0.657769,0.607375
2,0.5924,0.635025,0.62275
3,0.5457,0.666918,0.620875
4,0.4723,0.734091,0.6165
5,0.3626,0.820798,0.619875


Validation Results for LR=1e-05, BS=16: {'eval_loss': 0.6350246667861938, 'eval_accuracy': 0.6227499842643738, 'eval_runtime': 20.5129, 'eval_samples_per_second': 389.999, 'eval_steps_per_second': 24.375, 'epoch': 5.0}
Testing: LR=1e-05, BS=32


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6911,0.667105,0.5985
2,0.6111,0.632543,0.615875
3,0.6101,0.64251,0.6265
4,0.5503,0.674217,0.622375
5,0.4635,0.703608,0.620875


Validation Results for LR=1e-05, BS=32: {'eval_loss': 0.6425099968910217, 'eval_accuracy': 0.6265000104904175, 'eval_runtime': 18.6359, 'eval_samples_per_second': 429.28, 'eval_steps_per_second': 13.415, 'epoch': 5.0}
Testing: LR=2e-05, BS=8


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7012,0.655713,0.606875
2,0.5597,0.642044,0.627375
3,0.5751,0.713769,0.61725
4,0.417,1.094307,0.61625
5,0.1596,1.840397,0.614375


Validation Results for LR=2e-05, BS=8: {'eval_loss': 0.6420437097549438, 'eval_accuracy': 0.6273750066757202, 'eval_runtime': 23.1053, 'eval_samples_per_second': 346.242, 'eval_steps_per_second': 43.28, 'epoch': 5.0}
Testing: LR=2e-05, BS=16


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6314,0.668508,0.61075
2,0.5926,0.646293,0.6215
3,0.4749,0.748215,0.621625
4,0.398,0.910495,0.62175
5,0.2344,1.256693,0.62075


Validation Results for LR=2e-05, BS=16: {'eval_loss': 0.910495400428772, 'eval_accuracy': 0.621749997138977, 'eval_runtime': 20.5371, 'eval_samples_per_second': 389.538, 'eval_steps_per_second': 24.346, 'epoch': 5.0}
Testing: LR=2e-05, BS=32


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6783,0.671556,0.577
2,0.6239,0.639952,0.6155
3,0.5428,0.682718,0.623375
4,0.4939,0.784537,0.62175
5,0.3033,0.903538,0.61975


Validation Results for LR=2e-05, BS=32: {'eval_loss': 0.6827182769775391, 'eval_accuracy': 0.6233749985694885, 'eval_runtime': 18.6102, 'eval_samples_per_second': 429.871, 'eval_steps_per_second': 13.433, 'epoch': 5.0}


In [11]:
results_df = pd.DataFrame(results)
results_df.to_csv(os.path.join(data_dir, 'results.csv'), index=False)
display(results_df)

Unnamed: 0,LearningRate,BatchSize,Loss,Accuracy
0,1e-05,8,0.643939,0.632625
1,1e-05,16,0.635025,0.62275
2,1e-05,32,0.64251,0.6265
3,2e-05,8,0.642044,0.627375
4,2e-05,16,0.910495,0.62175
5,2e-05,32,0.682718,0.623375


In [12]:
from google.colab import runtime
runtime.unassign()

In [None]:
# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the model
model.save_pretrained('./best_model')
tokenizer.save_pretrained('./best_model')

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6569,0.657578,0.60525
2,0.6441,0.651822,0.61625
3,0.4515,0.798858,0.612375
4,0.5281,1.205326,0.62275
5,0.2387,1.933847,0.618375


('./best_model/tokenizer_config.json',
 './best_model/special_tokens_map.json',
 './best_model/vocab.txt',
 './best_model/added_tokens.json')

In [None]:
# Evaluate the model
eval_results = trainer.evaluate()

# Print the evaluation results explicitly
print(f"Validation Results: {eval_results}")

Validation Results: {'eval_loss': 1.2053260803222656, 'eval_accuracy': 0.6227499842643738, 'eval_runtime': 87.6227, 'eval_samples_per_second': 91.301, 'eval_steps_per_second': 11.413, 'epoch': 5.0}


In [None]:
from sklearn.metrics import classification_report

predictions = trainer.predict(val_dataset)

logits = predictions.predictions
predicted_classes = np.argmax(logits, axis=1)
true_classes = predictions.label_ids

report = classification_report(true_classes, predicted_classes, target_names=['Mistral', 'Llama'])
print(report)

              precision    recall  f1-score   support

     Mistral       0.63      0.65      0.64      4152
       Llama       0.61      0.59      0.60      3848

    accuracy                           0.62      8000
   macro avg       0.62      0.62      0.62      8000
weighted avg       0.62      0.62      0.62      8000



In [None]:
## Save the model and tokenizer to Hugging Face Hub
model.push_to_hub(f"MatthewFrank/w266FinalProject")
tokenizer.push_to_hub(f"MatthewFrank/w266FinalProject")

In [None]:
from google.colab import runtime
runtime.unassign()