Uninstall conflicting packages

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Uninstall conflicting packages
!pip3 uninstall pyarrow -y
!pip3 uninstall requests -y
!pip install datasets


# Install specific versions
!pip3 install pyarrow==14.0.1
!pip3 install requests==2.31.0

# Reinstall dependencies
!pip3 install cudf-cu12
!pip3 install ibis-framework

# Verify installation
import pyarrow
import requests

print(f"pyarrow version: {pyarrow.__version__}")
print(f"requests version: {requests.__version__}")

In [None]:
!pip3 install accelerate -U

In [None]:
# Install the transformers library if not already installed
!pip install transformers

# Import the necessary library
from huggingface_hub import login

# Login using the token
login(token="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")

Data Preprocessing

In [None]:
import os
import re
from transformers import GPT2Tokenizer

def preprocess_data(input_file, output_file, tokenizer_name="gpt2"):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    with open(output_file, 'w', encoding='utf-8') as f:
        for line in lines:
            # Strip leading/trailing whitespace
            line = line.strip()

            # Additional preprocessing steps
            # Example: Convert all text to lowercase
            line = line.lower()

            # Example: Replace specific characters or patterns
            line = re.sub(r'\W', ' ', line)  # Replace non-word characters with space

            # Tokenize the line
            token_ids = tokenizer.encode(line, add_special_tokens=False)
            # Convert token IDs back to tokens
            tokenized_line = tokenizer.convert_ids_to_tokens(token_ids)
            # Convert tokens to text and remove special tokens
            processed_line = " ".join(tokenized_line).replace('Ġ', '').replace('Ċ', '').replace('�', '').strip()
            processed_line = re.sub(r'[^\w\s]', '', processed_line)
            processed_line = re.sub(r'\bgu vi\b','guvi',processed_line)
            # Remove extra spaces
            processed_line = re.sub(r'\s+', ' ', processed_line)

            # Write the processed line to the output file
            f.write(processed_line + "\n")

# Example usage:
input_file = "/content/Guvi_Dataset.txt"  # Make sure this path is correct
output_file = "processed_company_data.txt"
preprocess_data(input_file, output_file)


Fine Tuneing the Pretrained Model

In [None]:
pip install --upgrade datasets

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Create dataset
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = load_dataset('text', data_files={'train': file_path})
    tokenized_dataset = dataset.map(
        lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=block_size),
        batched=True
    )
    return tokenized_dataset['train']

train_dataset = load_dataset("/path/to/processed_data.txt", tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    save_steps=1000,  # Save checkpoints more frequently for safety
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=200,
)

# Initialize data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("/content/drive/MyDrive/fine_tuned_model12345")
tokenizer.save_pretrained("/content/drive/MyDrive/fine_tuned_model12345")

TypeError: load_dataset() got an unexpected keyword argument 'data_files'

Test the Model

In [16]:
#!pip install transformers

from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

# Load the fine-tuned model and tokenizer
model_name_or_path = "/content/drive/MyDrive/fine_tuned_model12345"  # Use the directory where you saved the model
model = GPT2LMHeadModel.from_pretrained(model_name_or_path)

token_name_or_path = "/content/drive/MyDrive/fine_tuned_model12345"  # Use the directory where you saved the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(token_name_or_path)

# Set the pad_token to eos_token if it's not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the text generation function
def generate_text(model, tokenizer, seed_text, max_length=100, temperature=1.0, num_return_sequences=1):
    # Tokenize the input text with padding
    inputs = tokenizer(seed_text, return_tensors='pt', padding=True, truncation=True)

    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=num_return_sequences,
            do_sample=True,
            top_k=50,
            top_p=0.01,
            pad_token_id=tokenizer.eos_token_id  # Ensure padding token is set to eos_token_id
        )

    # Decode the generated text
    generated_texts = []
    for i in range(num_return_sequences):
        generated_text = tokenizer.decode(output[i], skip_special_tokens=True)
        generated_texts.append(generated_text)

    return generated_texts

# Test the model
seed_text = input("Enter the text: ")
generated_texts = generate_text(model, tokenizer, seed_text, max_length=200, temperature=0.9, num_return_sequences=1)

for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")


Enter the text: Who is guvi ceo
Generated Text 1:
Who is guvi ceo the best online learning platform for beginners

guvi offers a comprehensive online learning platform with a vast library of courses and resources it is a leader in online learning with a reputation for providing a comprehensive learning experience for its users

guvi offers a comprehensive online learning platform with a vast library of courses and resources it is a leader in online learning with a reputation for providing a comprehensive learning experience for its users

guvi offers a comprehensive online learning platform with a vast library of courses and resources it is a leader in online learning with a reputation for providing a comprehensive learning experience for its users

guvi offers a comprehensive online learning platform with a vast library of courses and resources it is a leader in online learning with a reputation for providing a comprehensive learning experience for its users

guvi offers a comprehensi