In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback, TextClassificationPipeline, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
import evaluate as ev
from datasets import DatasetDict, Dataset
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [3]:
# Seed passed to TrainingArguments later in the notebook.
SEED = 42

# Input CSV locations (relative to the notebook) and the directory set aside for plots.
plots_path = '../plots'
training_data_path = '../quora-question-data/train.csv'
test_data_path = '../quora-question-data/test.csv'

# Read both CSV files into DataFrames.
train_data = pd.read_csv(filepath_or_buffer=training_data_path)
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

In [5]:
# Normalise both question columns to plain strings. Missing questions are
# replaced by "" first: a bare astype(str) would either turn them into the
# literal text "nan" (counted as a one-word question and later tokenized as
# real text) or, with a string dtype that keeps missing values, leave NaN
# behind and break the word count below.
train_data["question1"] = train_data["question1"].fillna("").astype(str)
train_data["question2"] = train_data["question2"].fillna("").astype(str)

# Number of whitespace-separated words in each question (0 for an empty one).
train_data["len_q1"] = train_data["question1"].str.split().str.len()
train_data["len_q2"] = train_data["question2"].str.split().str.len()

# Absolute difference between the two word counts of a pair.
train_data["length_difference"] = (train_data["len_q1"] - train_data["len_q2"]).abs()

In [9]:
# Rename the target column from "is_duplicate" to "label".
# NOTE(review): presumably the name the Trainer / collator look for - confirm.
train_data = train_data.rename(columns=dict(is_duplicate='label'))

In [10]:
# 80/10/10 split: hold out 20% of the rows, then halve that hold-out into
# validation and test. The derived length columns are dropped first so they
# do not end up in the datasets.
# random_state uses the notebook-level SEED instead of a second hard-coded 42,
# so changing SEED changes every seeded step consistently.
train_df, val_test_df = train_test_split(
    train_data.drop(columns=["len_q1", "len_q2", "length_difference"]),
    test_size=0.2,
    random_state=SEED,
)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, random_state=SEED)

# Convert DataFrames to datasets
# NOTE(review): the shuffled pandas index is presumably carried along as an
# extra column by from_pandas; pass preserve_index=False if it is unwanted.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Group the three splits in one DatasetDict
dataset = DatasetDict(
    {
        "train": train_dataset,
        "validation": val_dataset,
        "test": test_dataset,
    }
)

# Tokenizer from a pretrained model
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [24]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Function to tokenize data
def tokenize_dataset(data, max_length=20):
    """Tokenize question pairs with the module-level ``tokenizer``.

    Args:
        data: Mapping with "question1" and "question2" entries; both are
            passed to the tokenizer together as a text pair.
        max_length: Forwarded to the tokenizer as ``max_length``; with
            ``truncation=True`` longer inputs are cut down to it. Defaults to
            20, the value that used to be hard-coded here.

    Returns:
        The tokenizer's output for the pair, unchanged.

    No padding is requested here; batches are padded dynamically by the data
    collator instead.
    """
    return tokenizer(
        data["question1"],
        data["question2"],
        max_length=max_length,
        truncation=True,
    )

# Collator used to pad batches dynamically (tokenization itself adds no padding).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Apply tokenize_dataset to every split of the DatasetDict, in batches.
tokenized_dataset = dataset.map(function=tokenize_dataset, batched=True)

Map:   0%|          | 0/323432 [00:00<?, ? examples/s]

Map:   0%|          | 0/40429 [00:00<?, ? examples/s]

Map:   0%|          | 0/40429 [00:00<?, ? examples/s]

In [34]:
# Pretrained checkpoint with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
)

# Training configuration for a single fine-tuning epoch.
training_args = TrainingArguments(
    # Where checkpoints are written, and the seed for the run.
    output_dir="./finetune_bert/",
    seed=SEED,
    # Optimisation settings.
    learning_rate=5e-6,
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and save once per epoch, then reload the best checkpoint.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
# Function to compute the metric
def compute_metrics(eval_pred):
    """Compute classification accuracy from evaluation predictions.

    Accuracy is calculated directly with numpy, so no metric script has to be
    looked up or loaded every time an evaluation pass runs.

    Args:
        eval_pred: Iterable that unpacks into ``(logits, labels)``. ``logits``
            holds one score per class along its last axis; if it is a tuple,
            its first element is taken as the logits.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of rows
        whose highest-scoring class equals the label, as a Python float.
    """
    logits, labels = eval_pred
    # Tolerate a tuple of model outputs by keeping only its first entry.
    if isinstance(logits, tuple):
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

In [36]:
# Assemble the Trainer: model and arguments, the tokenized train/validation
# splits, dynamic padding, the accuracy metric and an early-stopping callback
# with a patience of one evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)],
)

# Run fine-tuning; as the cell's last expression, the result is displayed.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5282,0.527452,0.834104


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=80858, training_loss=0.5219093433527228, metrics={'train_runtime': 6419.8021, 'train_samples_per_second': 50.38, 'train_steps_per_second': 12.595, 'total_flos': 3324114240091440.0, 'train_loss': 0.5219093433527228, 'epoch': 1.0})

In [37]:
# Evaluate on the held-out test split and keep the returned metrics; before,
# the result was discarded because it was not the cell's last expression.
test_metrics = trainer.evaluate(tokenized_dataset['test'])

# Save tokenizer
tokenizer.save_pretrained('./finetune_bert/')

# Save model
trainer.save_model('./finetune_bert/')

# Last expression of the cell, so the notebook displays the test metrics.
test_metrics