In [1]:
# Install required libraries
!pip install transformers datasets

# Mount Google Drive (if using Drive for file storage)
from google.colab import drive
drive.mount('/content/drive')

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [9]:
import torch
import json

# Load the JSON data
with open('/content/drive/MyDrive/intents.json', 'r') as file:
    data = json.load(file)

# Convert to a list of dialogues
train_data = []
for intent in data['intents']:
    for pattern in intent['patterns']:
        train_data.append({
            'input': pattern,
            'output': intent['responses'][0]  # Choose the first response or modify as needed
        })

# Save the formatted data
with open('formatted_data.txt', 'w') as f:
    for item in train_data:
        f.write(f"{item['input']}\t{item['output']}\n")

# Import necessary libraries from Hugging Face
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import torch

# Check GPU availability
print(torch.cuda.get_device_name(0))  # Should print "Tesla T4"
print(torch.cuda.is_available())  # Should return True if GPU is available

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the formatted dataset
data_files = {"train": "formatted_data.txt"}
dataset = load_dataset("text", data_files=data_files)

# Preprocess the data
def preprocess_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'labels': inputs['input_ids']}

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Split dataset into training and evaluation sets
train_size = int(0.9 * len(encoded_dataset['train']))
eval_size = len(encoded_dataset['train']) - train_size

train_dataset = encoded_dataset['train'].select(range(train_size))
eval_dataset = encoded_dataset['train'].select(range(train_size, train_size + eval_size))

# Load the model
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))  # Ensure the model's embedding layer matches the tokenizer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,  # Log every 10 steps
    save_steps=10,     # Save checkpoint every 10 steps
)

# Define a custom Trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('./chatbot_model')
tokenizer.save_pretrained('./chatbot_model')

# Load the model for inference
from transformers import pipeline

# Load the model and tokenizer for inference
chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)  # Use GPU

# Generate a response with customized parameters
response = chatbot(
    "Hi there! How can I help you?",
    max_length=50,            # Maximum length of the generated text
    num_return_sequences=1,  # Number of sequences to return
    temperature=0.7,         # Sampling temperature (0.7 is a good default)
    top_k=50                 # Limits the sampling to top-k most probable tokens
)

print(response)

# Optional: Push to Hugging Face Model Hub (requires authentication)
# model.push_to_hub("your-hub-repo-name")
# tokenizer.push_to_hub("your-hub-repo-name")

Tesla T4
True


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.0887,1.657382
2,0.2638,0.342141
3,0.195,0.220397


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Hi there! How can I help you??????????????????????????????????????????'}]


In [15]:
import torch
import json

# Load the JSON data
with open('/content/drive/MyDrive/intents.json', 'r') as file:
    data = json.load(file)

# Convert to a list of dialogues
train_data = []
for intent in data['intents']:
    for pattern in intent['patterns']:
        train_data.append({
            'input': pattern,
            'output': intent['responses'][0]  # Choose the first response or modify as needed
        })

# Save the formatted data
with open('/content/formatted_data.txt', 'w') as f:
    for item in train_data:
        f.write(f"{item['input']}\t{item['output']}\n")

# Import necessary libraries from Hugging Face
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer

# Check GPU availability
print(torch.cuda.get_device_name(0))  # Should print "Tesla T4"
print(torch.cuda.is_available())  # Should return True if GPU is available

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Add padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Load the formatted dataset
data_files = {"train": "/content/formatted_data.txt"}
dataset = load_dataset("text", data_files=data_files)

# Preprocess the data
def preprocess_function(examples):
    inputs = tokenizer(examples['text'], truncation=True, padding="max_length", max_length=128)
    return {'input_ids': inputs['input_ids'], 'attention_mask': inputs['attention_mask'], 'labels': inputs['input_ids']}

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Split dataset into training and evaluation sets
train_size = int(0.9 * len(encoded_dataset['train']))
eval_size = len(encoded_dataset['train']) - train_size

train_dataset = encoded_dataset['train'].select(range(train_size))
eval_dataset = encoded_dataset['train'].select(range(train_size, train_size + eval_size))

# Load the model
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))  # Ensure the model's embedding layer matches the tokenizer

# Define training arguments
training_args = TrainingArguments(
    output_dir="/content/fine_tuned_chatbot_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='/content/fine_tuned_chatbot_model/logs',
    logging_steps=10,  # Log every 10 steps
    save_steps=10,     # Save checkpoint every 10 steps
)

# Define a custom Trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        outputs = model(**inputs)
        logits = outputs.get("logits")
        labels = inputs.get("labels")
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Train the model
trainer.train()

# Save the model and tokenizer
model.save_pretrained('/content/fine_tuned_chatbot_model')
tokenizer.save_pretrained('/content/fine_tuned_chatbot_model')

# Load the model for inference
from transformers import pipeline

# Load the model and tokenizer for inference
chatbot = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0)  # Use GPU

# Generate a response with customized parameters
response = chatbot(
    "Hi there! How can I help you?",
    max_length=50,            # Maximum length of the generated text
    num_return_sequences=1,  # Number of sequences to return
    temperature=0.7,         # Sampling temperature (0.7 is a good default)
    top_k=50                 # Limits the sampling to top-k most probable tokens
)

print(response)

# Optional: Push to Hugging Face Model Hub (requires authentication)
# model.push_to_hub("your-hub-repo-name


Tesla T4
True


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/287 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.0887,1.657382
2,0.2638,0.342141
3,0.195,0.220397


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[{'generated_text': 'Hi there! How can I help you??????????????????????????????????????????'}]
