In [None]:
pip install pyarrow==10.0.1 datasets

Collecting pyarrow==10.0.1
  Downloading pyarrow-10.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
INFO: pip is looking at multiple versions of datasets to determine which version is compatible with other requirements. This could take a while.
  Downloading datasets-2.19.2-py3-none-any.whl.metadata (19 kB)
  Downloading datasets-2.19.1-py3-none-any.whl.metadata (19 kB)
  Downloading datasets-2.19.0-py3-none-any.whl.metadata (19 kB)
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
  Downloading datasets-2.17.1-py3-none-any.whl.metadata (20 kB)
  Downloading datasets-2.17.0-py3-none-any.whl.metadata (20 kB)
  Downloading datasets-2.16.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2

In [None]:
!pip install transformers pandas torch

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import torch
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, TrainerCallback
from datasets import Dataset

# Pick the compute device: CUDA when present, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# The same checkpoint name is used for both the tokenizer and the model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)

# Freeze every parameter whose name mentions neither of the two blocks below
# (blocks 10 and 11, assuming GPT-2 has 12 layers)
trainable_markers = ("h.10", "h.11")
for param_name, weights in model.named_parameters():
    if not any(marker in param_name for marker in trainable_markers):
        weights.requires_grad = False

# Load and preprocess the dataset
def load_and_preprocess_dataset(file_path):
    """Read a CSV file and return a `Dataset` with a single `text` column.

    The `abstract` column of the CSV becomes the `text` column. Rows whose
    abstract is missing or contains only whitespace are dropped, because a
    null or empty entry cannot be tokenized later on.

    Args:
        file_path: Path of the CSV file to read; it must have an `abstract`
            column.

    Returns:
        A `Dataset` whose only column is `text`.

    Raises:
        KeyError: If the CSV has no `abstract` column.
    """
    df = pd.read_csv(file_path)
    if 'abstract' not in df.columns:
        raise KeyError(f"'abstract' column not found in {file_path!r}")

    abstracts = df['abstract']
    # Keep the original text untouched; the stripped copy is only used to spot blanks.
    usable = abstracts.notna() & abstracts.astype(str).str.strip().ne("")
    text_df = abstracts[usable].rename('text').to_frame().reset_index(drop=True)

    # preserve_index=False keeps the pandas index from becoming an extra column.
    dataset = Dataset.from_pandas(text_df, preserve_index=False)
    return dataset

dataset_path = 'cleaned_NYT_2016_to_2022-updated.csv'
dataset = load_and_preprocess_dataset(dataset_path)

# Split the dataset: 90% train / 10% evaluation.
# A fixed seed keeps the partition identical across kernel restarts, so
# validation losses from different runs are comparable.
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Collator for causal language modeling (mlm=False disables masked-LM masking)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the `text` entries of a batch, truncating to 512 tokens.

    No padding is applied here: the language-modeling data collator given to
    the Trainer pads each batch to its own longest sequence, which avoids
    running every short abstract through the model at the full 512 tokens.

    Args:
        examples: Batch mapping that contains a `text` list of strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    eval_strategy="steps",  # Use eval_strategy instead of evaluation_strategy
    # Without an explicit eval_steps the evaluation interval follows
    # logging_steps, i.e. a full pass over the evaluation set every 10 steps
    # (about 150 s each in the recorded run). Evaluate far less often.
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=10,
    per_device_train_batch_size=16,  # Adjust based on your GPU's memory
    gradient_accumulation_steps=1,  # No gradient accumulation to maximize GPU memory usage
    num_train_epochs=3,  # Fine-tune for more epochs since only a few layers are updated
    weight_decay=0.01,
    logging_first_step=True,
    logging_strategy="steps",
    report_to="none",
    fp16=torch.cuda.is_available(),  # Enable mixed precision if on GPU
)

# Define a custom callback to monitor progress
class CustomCallback(TrainerCallback):
    """Trainer callback that prints every log record it receives."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print `logs`; do nothing when no log record was supplied."""
        if logs is None:
            return
        print(logs)

# Build the Trainer from the model, arguments, datasets and collator defined above
progress_printer = CustomCallback()  # prints each log record as it arrives
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[progress_printer],
)

# Run the fine-tuning loop
trainer.train()


Map:   0%|          | 0/69815 [00:00<?, ? examples/s]

Map:   0%|          | 0/7758 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
10,4.4991,4.080737
20,4.522,4.00352
30,4.4302,3.956403
40,4.2576,3.924631
50,4.2667,3.896185
60,4.2056,3.874353
70,4.2795,3.856035
80,4.0838,3.841216
90,4.1452,3.828338
100,4.216,3.81676


{'loss': 4.8162, 'grad_norm': 1.22317636013031, 'learning_rate': 4.9996180873816074e-05, 'epoch': 0.00022914757103574703}
{'loss': 4.4991, 'grad_norm': 1.1494336128234863, 'learning_rate': 4.996180873816071e-05, 'epoch': 0.00229147571035747}
{'eval_loss': 4.0807366371154785, 'eval_runtime': 150.0016, 'eval_samples_per_second': 51.719, 'eval_steps_per_second': 6.467, 'epoch': 0.00229147571035747}
{'loss': 4.522, 'grad_norm': 1.028442621231079, 'learning_rate': 4.992361747632142e-05, 'epoch': 0.00458295142071494}
{'eval_loss': 4.003520488739014, 'eval_runtime': 150.0077, 'eval_samples_per_second': 51.717, 'eval_steps_per_second': 6.466, 'epoch': 0.00458295142071494}
{'loss': 4.4302, 'grad_norm': 1.0079808235168457, 'learning_rate': 4.9885426214482125e-05, 'epoch': 0.0068744271310724105}
{'eval_loss': 3.9564032554626465, 'eval_runtime': 150.416, 'eval_samples_per_second': 51.577, 'eval_steps_per_second': 6.449, 'epoch': 0.0068744271310724105}
{'loss': 4.2576, 'grad_norm': 0.99095201492309

KeyboardInterrupt: 

In [None]:
# Once training is done, write the model and the tokenizer into one directory
save_dir = "./gptNYT-2016-2022"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)


('./gptNYT-2016-2022/tokenizer_config.json',
 './gptNYT-2016-2022/special_tokens_map.json',
 './gptNYT-2016-2022/vocab.json',
 './gptNYT-2016-2022/merges.txt',
 './gptNYT-2016-2022/added_tokens.json')

In [None]:
import shutil

# Directory that holds the saved model and tokenizer files
model_directory = "./gptNYT-2016-2022"

# Where the zip archive should end up
output_filename = "./gptNYT-2016-2022.zip"

# make_archive expects the archive name without its extension
archive_base = output_filename.replace('.zip', '')
shutil.make_archive(archive_base, 'zip', root_dir=model_directory)


'/content/gptNYT-2016-2022.zip'

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Define the device here as well, so this cell does not depend on the training
# cell having been run in the same kernel session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
pretrained_model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

# Load the fine-tuned model and tokenizer.
# This is the directory the notebook saves the fine-tuned model and tokenizer to.
fine_tuned_model_path = "./gptNYT-2016-2022"
fine_tuned_model = GPT2LMHeadModel.from_pretrained(fine_tuned_model_path).to(device)
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_model_path)

# Function to generate text with attention mask
def generate_text(model, tokenizer, prompt, max_length=100):
    """Generate a continuation of `prompt` and return it as a string.

    Args:
        model: Language model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer matching `model`; used to encode the prompt and
            to decode the generated ids.
        prompt: Text the generation starts from.
        max_length: Value forwarded to `generate` as `max_length`.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    # The tokenizer returns input ids together with a matching integer
    # attention mask. Moving both to the model's own device removes the
    # dependency on a global `device` variable.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,  # Set pad token ID to eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Both models receive the same prompt so their outputs can be compared directly
pretrained_prompt = finetuned_prompt = "In 2021"

# Pre-trained model
pretrained_output = generate_text(pretrained_model, tokenizer, pretrained_prompt)
print("Pre-trained Model Output:", pretrained_output, sep="\n")

# Fine-tuned model
finetuned_output = generate_text(fine_tuned_model, fine_tuned_tokenizer, finetuned_prompt)
print("\nFine-tuned Model Output:", finetuned_output, sep="\n")


Pre-trained Model Output:
In 2021, the government will have to decide whether to extend the existing contract with the company.

The government has already announced that it will not renew the contract with the company.

The government has also announced that it will not renew the contract with the company.

The government has also announced that it will not renew the contract with the company.

The government has also announced that it will not renew the contract with the company.

The government has also announced that it

Fine-tuned Model Output:
In 2021, the company will begin to sell its first-generation electric vehicles, which will be powered by a battery pack. The company will also offer a range of other products, including a range of smart home devices. The company is also developing a new product that will allow users to control their own home. The company is also developing a new product that will allow users to control their own home. The company is also developing a new pr