In [7]:
import pandas as pd

# Load the CSV data
csv_file = "concap.csv"  # Replace with your actual CSV file path
data = pd.read_csv(csv_file)

# Convert to text format
def convert_to_text(data):
    text_data = ""
    for _, row in data.iterrows():
        text_data += f"Country: {row['CountryName']}\n"
        text_data += f"Capital: {row['CapitalName']}\n"
        text_data += f"Country Code: {row['CountryCode']}\n"
        text_data += f"Continent: {row['ContinentName']}\n"
        text_data += "\n"  # Separate entries with a newline
    return text_data

# Save to a text file
text_data = convert_to_text(data)
with open("custom_dataset.txt", "w", encoding='utf-8') as f:
    f.write(text_data)


In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments


In [9]:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [10]:
def prepare_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
    )
    return dataset

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


In [24]:
train_dataset = prepare_dataset("custom_dataset.txt", tokenizer)

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)



In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

In [26]:
trainer.train()

100%|██████████| 210/210 [09:44<00:00,  2.78s/it]

{'train_runtime': 584.1659, 'train_samples_per_second': 0.702, 'train_steps_per_second': 0.359, 'train_loss': 0.4204004196893601, 'epoch': 10.0}





TrainOutput(global_step=210, training_loss=0.4204004196893601, metrics={'train_runtime': 584.1659, 'train_samples_per_second': 0.702, 'train_steps_per_second': 0.359, 'total_flos': 26782433280000.0, 'train_loss': 0.4204004196893601, 'epoch': 10.0})

In [27]:
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

('./gpt2-finetuned\\tokenizer_config.json',
 './gpt2-finetuned\\special_tokens_map.json',
 './gpt2-finetuned\\vocab.json',
 './gpt2-finetuned\\merges.txt',
 './gpt2-finetuned\\added_tokens.json')

In [28]:
model_path = "./gpt2-finetuned"
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

def generate_paragraph(prompt_text, model, tokenizer, max_length=200):
    inputs = tokenizer.encode(prompt_text, return_tensors="pt")
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, no_repeat_ngram_size=2, early_stopping=True)
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

In [29]:
prompt = "what is the capital of Morocco Country?"
generated_paragraph = generate_paragraph(prompt, model, tokenizer)
print(generated_paragraph)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


what is the capital of Morocco Country?

Morocco
Capital: Rabat
Country Code: MW
Continent: Africa
: Morocco
Regional: North Africa Mainland
Month: May
Cap: King Edward Point
City: Moroni
State Code
Land: Niger
Domestic: Nuku'alofa
Province: Niamey
Mainland: Victoria
Region: Australia
Nauru
 Capital: Port Moresby
County: Manus
 Country Code : OM
CONTACT:
For enquiries: John Hutton
Phone: +61 2 689 709
Email: john@hutton.co.nz
Website: www.hottentot.com
SOURCE: Hottentedot
Related Links:
