<a href="https://colab.research.google.com/github/mahnoor-khalid9/Advance-Data-Mining/blob/main/SFT_GradQA_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw problems from the JSON export
df = pd.read_json("/content/train.json")

# Model input: the problem statement, unchanged
df['input_text'] = df['Problem']

# Model target: one "Label: value" line per field, joined with newlines.
# Each pair is (label shown in the text, source column in df).
target_fields = [
    ('Rationale', 'Rationale'),
    ('Options', 'options'),
    ('Correct', 'correct'),
    ('Annotated Formula', 'annotated_formula'),
    ('Linear Formula', 'linear_formula'),
    ('Category', 'category'),
]
target_lines = [label + ': ' + df[column].astype(str) for label, column in target_fields]
target_text = target_lines[0]
for line in target_lines[1:]:
    target_text = target_text + '\n' + line
df['target_text'] = target_text

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
train_df, val_df = train_test_split(df, random_state=42, test_size=0.2)

# Quick sanity report on the split and a peek at the text columns
print("Dataset loaded and split successfully.")
print(f"Training set size: {len(train_df)} rows")
print(f"Validation set size: {len(val_df)} rows")
print("First 5 rows of input_text and target_text from train_df:")
preview_columns = ['input_text', 'target_text']
print(train_df[preview_columns].head())

Dataset loaded and split successfully.
Training set size: 23869 rows
Validation set size: 5968 rows
First 5 rows of input_text and target_text from train_df:
                                              input_text  \
17594  there are 500 employees in a room . 99 % are m...   
1291   what is the smallest positive integer that lea...   
10319  john can complete a given task in 20 days . ja...   
29789  a salt manufacturing company produced a total ...   
15405  the perimeter of one square is 48 cm and that ...   

                                             target_text  
17594  Rationale: "there are 495 managers and 5 other...  
1291   Rationale: remainder of 1 when divided by 2 : ...  
10319  Rationale: "john 6 * 20 = 120 jane 10 * 12 = 1...  
29789  Rationale: total production of salt by the com...  
15405  Rationale: "4 a = 48 4 a = 28 a = 12 a = 7 a 2...  


In [6]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Use the GPU when PyTorch can see one, otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint to fine-tune
model_name = 't5-small'

# Tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pre-trained seq2seq model, moved onto the selected device
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

print(f"Tokenizer and Model '{model_name}' loaded successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Tokenizer and Model 't5-small' loaded successfully.


In [7]:
from transformers import TrainingArguments, DataCollatorForSeq2Seq

# 2. Define the training arguments
# Gathered in one dict so every hyperparameter can be read at a glance.
training_config = {
    'output_dir': './results',
    'logging_dir': './logs',
    'logging_steps': 100,
    'learning_rate': 2e-5,
    'weight_decay': 0.01,
    'num_train_epochs': 3,
    'per_device_train_batch_size': 8,
    'per_device_eval_batch_size': 8,
    # Evaluate and checkpoint once per epoch so the best checkpoint
    # (judged by eval_loss) can be reloaded when training ends.
    'eval_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'eval_loss',
    # Turn off experiment-tracker reporting (e.g. Weights & Biases)
    'report_to': "none",
}
training_args = TrainingArguments(**training_config)

# 3. Initialize DataCollatorForSeq2Seq (batches examples for the seq2seq model)
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

print("Training arguments and data collator defined successfully.")

Training arguments and data collator defined successfully.


In [8]:
from transformers import Trainer
from datasets import Dataset

# 1. Define the tokenization function
def tokenize_function(examples, max_input_length=512, max_target_length=512):
    """Tokenize a batch of problems and target strings for seq2seq training.

    Args:
        examples: Batch mapping with 'input_text' and 'target_text' entries
            (lists of strings when called via ``Dataset.map(batched=True)``).
        max_input_length: Inputs longer than this many tokens are truncated.
        max_target_length: Targets longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the inputs, with the target token ids stored
        under the 'labels' key.
    """
    # Deliberately no padding here. Padding the targets to a fixed length puts
    # pad token ids into 'labels', where the loss treats them as real tokens to
    # predict. The DataCollatorForSeq2Seq given to the Trainer pads each batch
    # to its longest example instead and fills label padding with -100, which
    # the loss ignores. It also avoids running every batch at full length.
    model_inputs = tokenizer(
        examples['input_text'],
        max_length=max_input_length,
        truncation=True,
    )

    # Tokenize target texts (same tokenizer; truncated but left unpadded)
    labels = tokenizer(
        examples['target_text'],
        max_length=max_target_length,
        truncation=True,
    )

    # Assign tokenized labels to the 'labels' key
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

print("Tokenization function defined.")


Tokenization function defined.


In [9]:
from transformers import Trainer
from datasets import Dataset

# Wrap both pandas splits as Hugging Face Dataset objects
train_dataset, val_dataset = [Dataset.from_pandas(frame) for frame in (train_df, val_df)]

# Run the batched tokenization over each split
tokenized_train_dataset, tokenized_val_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
]

# Assemble the Trainer from the pieces prepared in the earlier cells
trainer_kwargs = {
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_train_dataset,
    'eval_dataset': tokenized_val_dataset,
    # The tokenizer is passed under the 'processing_class' keyword
    'processing_class': tokenizer,
    'data_collator': data_collator,
}
trainer = Trainer(**trainer_kwargs)

print("Tokenized datasets created and Trainer initialized successfully.")

Map:   0%|          | 0/23869 [00:00<?, ? examples/s]

Map:   0%|          | 0/5968 [00:00<?, ? examples/s]

Tokenized datasets created and Trainer initialized successfully.


In [None]:
# Start fine-tuning with the Trainer built in the previous cell. As the cell's
# last expression, the value returned by train() is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Write the fine-tuned model and the tokenizer into the same directory
# NOTE(review): this path is relative to the working directory, not under
# /content/drive; confirm that saving outside the mounted Drive is intended.
final_model_dir = "./final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)