In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset, model and output paths
# used in the cells below all live under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

In [None]:


df = pd.read_csv('/content/drive/My Drive/Project/Anno_Chicago.csv')

# Rows missing either the sentence or its label cannot be tokenized or used
# as a training target, so exclude them before sampling.
df_labelled = df.dropna(subset=['Sentence', 'Definition?'])

# Small test subset: at most 100 rows (fewer if the file is smaller), drawn
# with a fixed seed so that re-running the notebook selects the same sentences.
# The seed value matches the one used for the train/validation split.
small_test = df_labelled.sample(n=min(100, len(df_labelled)), random_state=42)

# Define your sentences and labels
sentences = small_test['Sentence'].astype(str).tolist()

# Labels (1 for definition, 0 for negative); cast to int so the values are
# plain class indices.
labels = small_test['Definition?'].astype(int).tolist()

# Name of the pre-trained Sentence-BERT checkpoint. Nothing is loaded here:
# this name is not used by the other visible cells, which load the model from
# a local directory instead.
model_name = "sentence-transformers/paraphrase-mpnet-base-v2"  # Optimal model for long English sentences




In [None]:
# Directory holding the downloaded pre-trained model files
model_directory = "/content/drive/My Drive/Project/Pre-trainedSbert"

# Sequence-classification model with a 2-way output (binary classification)
model = AutoModelForSequenceClassification.from_pretrained(
    model_directory,
    num_labels=2,
)

# Tokenizer read from the same directory
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Tokenize every sentence in one call: padded, truncated, returned as
# PyTorch tensors
tokenized_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Pull out the two tensors the model consumes
input_ids, attention_mask = tokenized_input['input_ids'], tokenized_input['attention_mask']

# 80/20 train/validation split; ids, labels and masks are split in a single
# call so the three stay row-aligned
(
    train_inputs, val_inputs,
    train_labels, val_labels,
    train_mask, val_mask,
) = train_test_split(input_ids, labels, attention_mask, test_size=0.2, random_state=42)





Some weights of MPNetForSequenceClassification were not initialized from the model checkpoint at /content/drive/My Drive/Project/Pre-trainedSbert and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define training arguments
training_args = TrainingArguments(
    # Directory where checkpoints and logs will be saved
    output_dir='/content/drive/My Drive/Project/Trained_models/Chicago_Trained/output',
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    num_train_epochs=4,
    logging_dir='/content/drive/My Drive/Project/Trained_models/Chicago_Trained/logs',
    # Evaluate on the validation set at the end of every epoch.
    evaluation_strategy="epoch",
    # Log at the same cadence. With the default step-based logging interval
    # this short run never reaches a logging step, which is why the
    # per-epoch table reported "No log" for the training loss.
    logging_strategy="epoch",
)

class CustomDataset(Dataset):
    """Map-style dataset bundling token ids, attention masks and labels.

    Args:
        input_ids: Indexable collection of token-id sequences, one per example.
        attention_mask: Indexable collection of attention masks, parallel to
            ``input_ids``.
        labels: Indexable collection of labels, parallel to ``input_ids``.
    """

    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids, self.attention_mask, self.labels = input_ids, attention_mask, labels

    def __len__(self):
        # The example count is taken from ``input_ids`` alone.
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with keys 'input_ids', 'attention_mask', 'labels'."""
        fields = ('input_ids', 'attention_mask', 'labels')
        columns = (self.input_ids, self.attention_mask, self.labels)
        return {key: column[idx] for key, column in zip(fields, columns)}

# Wrap each split's token ids, attention masks and labels in a dataset
train_dataset = CustomDataset(input_ids=train_inputs, attention_mask=train_mask, labels=train_labels)
val_dataset = CustomDataset(input_ids=val_inputs, attention_mask=val_mask, labels=val_labels)


In [None]:
# Define the compute_metrics function to include accuracy, precision, recall, and F1-score
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Prediction object passed in by the ``Trainer``.
            ``pred.predictions`` holds the per-class scores (one row per
            example) and ``pred.label_ids`` the ground-truth labels for
            those same examples.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    # Take the labels that accompany the predictions instead of the global
    # ``val_labels``: the two only coincide when the evaluated dataset is
    # exactly the validation split, so any other dataset would be scored
    # against the wrong targets.
    labels = np.asarray(pred.label_ids)
    # Predicted class = index of the highest score in each row.
    preds = np.argmax(pred.predictions, axis=1)
    accuracy = np.mean(preds == labels)
    precision = precision_score(labels, preds)
    recall = recall_score(labels, preds)
    f1 = f1_score(labels, preds)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }



In [None]:
# Build the Trainer from the model, its training arguments, the
# train/validation datasets and the metrics callback
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)




In [None]:
# Train the model: run the training loop configured on `trainer`.
# trainer.train() is kept as the cell's last expression so that its return
# value is displayed as the cell output.
print("Training started...")
trainer.train()






Training started...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.516369,0.9,0.857143,1.0,0.923077
2,No log,0.522317,0.7,0.875,0.583333,0.7
3,No log,0.54883,0.75,1.0,0.583333,0.736842
4,No log,0.570991,0.7,0.875,0.583333,0.7


TrainOutput(global_step=108, training_loss=0.43135914979157625, metrics={'train_runtime': 577.8978, 'train_samples_per_second': 0.554, 'train_steps_per_second': 0.187, 'total_flos': 17924440646400.0, 'train_loss': 0.43135914979157625, 'epoch': 4.0})

In [None]:
# Evaluate the model on the validation dataset.
# `results` is the dict returned by trainer.evaluate(); it is printed below.
print("Evaluation started...")
results = trainer.evaluate(eval_dataset=val_dataset)
print("Evaluation completed!")

print("Evaluation results:", results)




Evaluation started...


Evaluation completed!
Evaluation results: {'eval_loss': 0.5709911584854126, 'eval_accuracy': 0.7, 'eval_precision': 0.875, 'eval_recall': 0.5833333333333334, 'eval_f1': 0.7000000000000001, 'eval_runtime': 16.9091, 'eval_samples_per_second': 1.183, 'eval_steps_per_second': 0.414, 'epoch': 4.0}


In [None]:
FileName = 'sbert_classification_model'
model_save_path = '/content/drive/My Drive/Project/' + FileName
# Save the model
trainer.save_model(model_save_path)
# The Trainer was created without a tokenizer, so save_model() does not write
# one. Store the tokenizer in the same directory so the saved model can be
# reloaded for inference from this path alone. The trailing semicolon
# suppresses the returned file list in the cell output.
tokenizer.save_pretrained(model_save_path);