# Connecting and Uploading

In [None]:
# Load the previously prepared 5-author dataset.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load this file when it comes from a trusted source.
import pandas as pd
dataset_5A = pd.read_pickle("Dataset (5 Authors).pkl")

In [None]:
# Tagging: wrap every sentence as "<label> sentence <end>" so that the numeric
# author label becomes a textual prefix of each sample.
dataset_5A['Merged'] = (
    "<" + dataset_5A['Label_(number)'].astype(str) + "> "
    + dataset_5A['Sentence']
    + " <end>"
)

display(dataset_5A.head(3))
print()

# Number of samples per author, followed by the overall total.
num_samples = dataset_5A['Author'].value_counts()
display(num_samples)
print("\nSum = ", num_samples.sum())

In [None]:
# Example: print one raw sentence (the entry at index 10 of the 'Sentence' column;
# assumes the default integer index — TODO confirm)
print(dataset_5A["Sentence"][10])

# Library Installation

In [None]:
# Pinned requests release (an alternative pin is left commented out)
#!pip install requests==2.31.0
!pip install requests==2.32.3

In [None]:
# Pinned pyarrow; a later cell prints its version and checks for pa.PyExtensionType
#!pip install pyarrow==14.0.1
!pip install -U pyarrow==19.0.0

In [None]:
# Pinned datasets release (the trailing shell comment records an alternative version range)
!pip install datasets==2.10.0 #"datasets>=2.19,<3.0"

In [None]:
# NOTE(review): transformers is not pinned, so the installed version depends on install time
!pip install transformers

In [None]:
!pip install evaluate torch

In [None]:
# evaluate is already installed by the cell above; this upgrades it to the latest release
!pip install --upgrade evaluate

In [None]:
!pip install accelerate

In [None]:
# Pinned numpy release
!pip install numpy==2.0.2

In [None]:
# torchinfo provides the summary() call used before training
!pip install torchinfo

In [None]:
!pip install wandb

# Generator (GPT-3)

## ---- Configuration:

In [None]:
# Keep only the tagged text column for language-model fine-tuning
dataset_text = dataset_5A[['Merged']]

In [None]:
# Rename the column to 'text', the key that tokenize_function reads
dataset_text = dataset_text.rename(columns={'Merged': 'text'})

In [None]:
# Author tags that prefix every training sample:
# <0> = Charles Dickens
# <1> = Jane Austen
# <2> = Mark Twain
# <3> = Louisa May Alcott
# <4> = Herman Melville

In [None]:
# Display the prepared frame
dataset_text

In [None]:
# Example: the first tagged sample
dataset_text["text"][0]

In [None]:
# Total number of samples
len(dataset_text)

In [None]:
#pip install transformers

In [None]:
#pip install evaluate torch

In [None]:
#pip install --upgrade pyarrow evaluate

In [None]:
#pip install accelerate

In [None]:
#!pip install datasets==2.10.0

In [None]:
# Sanity-check the installed pyarrow build
import pyarrow as pa
print(pa.__version__)  #pip install "pyarrow<21"
# NOTE(review): presumably the pinned datasets release needs pa.PyExtensionType — confirm
print(hasattr(pa, "PyExtensionType"))

In [None]:
import torch
import pandas as pd
from datasets import Dataset as HFDataset

In [None]:
#pip install numpy==1.26.4

In [None]:
from transformers import GPT2Tokenizer, GPTNeoForCausalLM, Trainer, TrainingArguments

In [None]:
# Load the pre-trained tokenizer (the model itself is loaded in the next cell)
model_name = 'EleutherAI/gpt-neo-1.3B'
# Alternative checkpoint, left disabled:
#model_name = 'EleutherAI/gpt-neo-125m'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [None]:
# Load the base model, make sure the tokenizer owns a dedicated padding token,
# then size the embedding matrix to the (possibly extended) vocabulary.
model = GPTNeoForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model.resize_token_embeddings(len(tokenizer))

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize texts for causal-LM fine-tuning and attach training labels.

    Args:
        examples: Mapping with a 'text' entry. With ``Dataset.map(..., batched=True)``
            this is a list of strings; a single string is handled as well.

    Returns:
        The tokenizer encoding (padded / truncated to 128 tokens) with an added
        'labels' entry. Labels equal the input ids, except that padded
        positions (attention mask 0) are replaced by -100 so the loss ignores
        them instead of training the model to predict padding.
    """
    encoding = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

    input_ids = encoding['input_ids']
    attention = encoding['attention_mask']

    def _mask_padding(ids, mask):
        # -100 is the label value skipped by the cross-entropy loss.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], int):
        # Un-batched call: a single flat list of token ids.
        encoding['labels'] = _mask_padding(input_ids, attention)
    else:
        encoding['labels'] = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)]
    return encoding

In [None]:
import pandas as pd

# Convert the pandas DataFrame (single 'text' column) to a Hugging Face Dataset
# so that it can be tokenized with .map()
hf_dataset_5A = HFDataset.from_pandas(dataset_text)

In [None]:
# Display the dataset summary
hf_dataset_5A

## ---- Training:

In [None]:
# Apply the tokenize function to the dataset
# (batched=True hands tokenize_function a batch of texts per call)
tokenized_datasets = hf_dataset_5A.map(tokenize_function, batched=True)

In [None]:
# Split dataset into train and eval (80 / 20).
# The rows are shuffled before splitting: a plain head/tail slice keeps the
# original row order, so if the rows are grouped by author the trailing
# author(s) would land only in the eval set and never be trained on.
# The fixed seed keeps the split reproducible across runs.
_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = _split["train"]
eval_dataset = _split["test"]
train_size = len(train_dataset)  # kept for cells that still read train_size

In [None]:
from datetime import datetime

# Run identity and root folder for every artefact of this run.
# NOTE(review): base_dir is an absolute path here, while the LoRA section uses
# the relative 'stylometry' — confirm which one is intended.
base_dir = '/stylometry'
run_name = "writing_style_v1"
current_date = datetime.now().strftime("%Y.%m.%d")

# All outputs share one model folder; each path carries the run date as suffix.
drive_model_path, drive_tokenizer_path, output_dir, logging_dir = (
    f'{base_dir}/fine_tuned_model_gptNeo_1.3B/{kind}_{current_date}'
    for kind in ('saved_model', 'saved_tokenizer', 'results', 'logs')
)

In [None]:
# Echo the run name and every output location for a quick visual check
print(run_name,":")
print(drive_model_path, drive_tokenizer_path, output_dir, logging_dir, sep="\n")

In [None]:
#pip install wandb

In [None]:
import wandb

In [None]:
#!WANDB_START_METHOD=thread

In [None]:
#!WANDB_HTTP_TIMEOUT=300

In [None]:
# Initialize the wandb session (the project is named after the run)
# NOTE(review): this cell runs before the `wandb login` cell below — confirm the intended order.
wandb.init(project=run_name, entity="niu")

In [None]:
# Authenticate the wandb command-line client
!wandb login

In [None]:
# Define training arguments
# NOTE(review): no evaluation strategy is configured here although an eval
# dataset is handed to the Trainer — confirm that this is intended.
training_args = TrainingArguments(
    # --- run identity and output locations ---
    run_name=run_name,
    output_dir=output_dir,
    logging_dir=logging_dir,
    report_to="wandb",                 # report metrics to wandb
    # --- optimisation schedule ---
    num_train_epochs=3,                # number of training epochs
    per_device_train_batch_size=16,    # batch size for training
    per_device_eval_batch_size=16,     # batch size for evaluation
    warmup_steps=1000,                 # warmup steps for the learning rate scheduler
    weight_decay=0.01,                 # strength of weight decay
    # --- checkpointing ---
    save_strategy="steps",             # save based on steps (other option: "epoch")
    save_steps=2000,                   # save a checkpoint every 2000 steps
    save_total_limit=10,               # keep at most 10 checkpoints
)

In [None]:
# Build the Trainer that fine-tunes `model` on the train split; the eval split
# is attached alongside it.
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=eval_dataset)

In [None]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from torchinfo import summary

# Prefer the GPU when one is present, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# A short dummy sentence gives torchinfo concrete input ids on the model's device
input_text = "Hello, this is a test input for model summary."
input_ids = tokenizer(input_text, return_tensors='pt')["input_ids"].to(device)

# Layer-by-layer summary of the model
summary(model, input_data=input_ids)

In [None]:
# Fine-tune the model
trainer.train()

In [None]:
# Saving ===================================================
# Write the model and the tokenizer to their date-stamped run folders
model.save_pretrained(drive_model_path)
tokenizer.save_pretrained(drive_tokenizer_path)

print(f"Model saved to: {drive_model_path}")
print(f"Tokenizer saved to: {drive_tokenizer_path}")

In [None]:
# Fallback: if the tokenizer has no padding token at this point, reuse the EOS token as padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

## ---- Loading:

In [None]:
# Artefact locations of the run that is loaded for generation (run stamp 2025.01.03).
# NOTE(review): unlike the training cell, these paths carry no base_dir prefix — confirm
# they resolve from the current working directory.
drive_model_path, drive_tokenizer_path, output_dir, logging_dir = (
    f'fine_tuned_model_gptNeo_1.3B/{kind}_2025.01.03'
    for kind in ('saved_model', 'saved_tokenizer', 'results', 'logs')
)

In [None]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer

# Load the model and tokenizer from the saved run folders set in the previous cell
model = GPTNeoForCausalLM.from_pretrained(drive_model_path)
tokenizer = GPT2Tokenizer.from_pretrained(drive_tokenizer_path)

In [None]:
# Generate text
def generate_text(input_text, model, tokenizer, max_length=50, num_return_sequences=1):
    """Generate continuations of ``input_text`` with ``model``.

    Args:
        input_text: Prompt string (e.g. an author tag followed by a start word).
        model: Object exposing ``generate``. When it has a ``device`` attribute,
            the prompt tensors are moved to that device first.
        tokenizer: Object providing ``encode``, ``decode`` and ``eos_token_id``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: Forwarded to ``model.generate``.

    Returns:
        List with one decoded string (special tokens removed) per generated sequence.
    """
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    # A single un-padded prompt: every position is a real token.
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)

    # Keep the prompt on the same device as the model; without this the call
    # fails as soon as the model has been moved to the GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)

    outputs = model.generate(inputs,
                             max_length=max_length,
                             num_return_sequences=num_return_sequences,
                             pad_token_id=tokenizer.eos_token_id,
                             attention_mask=attention_mask)
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

In [None]:
# Author tags used as prompt prefixes; the number is the dataset label:
#   0 = Charles Dickens, 1 = Jane Austen, 2 = Mark Twain,
#   3 = Louisa May Alcott, 4 = Herman Melville
tages = [f"<{label}> " for label in range(5)]

In [None]:
# Example usage: generate one continuation of the same start word for every author tag

start_with = "Hi"

for tag in tages:
    # The prompt is the author tag directly followed by the start word
    input_text = tag + start_with
    generated_texts = generate_text(input_text, model, tokenizer)
    # Only one sequence is requested by default, so show the first entry
    print(generated_texts[0])

In [None]:
import random  # NOTE(review): not referenced by the visible cells

# Group the rows by their numeric author label
grouped = dataset_5A.groupby('Label_(number)')

def sample_and_extract(group, n=1000, random_state=None):
  """Sample sentences from one author group and return their first words.

  Args:
    group: DataFrame of a single group with a 'Sentence' column. ``group.name``
      (set by ``groupby(...).apply``) is used as the label.
    n: Maximum number of sentences to sample. A group with fewer usable
      sentences contributes all of them instead of raising.
    random_state: Optional seed forwarded to ``DataFrame.sample`` so the
      selection can be reproduced. ``None`` keeps the sampling random.

  Returns:
    DataFrame with the columns 'First_Word' and 'Label', indexed like the
    sampled rows.
  """
  label = group.name  # read first: frames derived by filtering do not carry .name
  # Entries that are not strings or contain no word have no first word to extract.
  has_word = group['Sentence'].apply(lambda x: isinstance(x, str) and bool(x.split())).astype(bool)
  usable = group[has_word]
  sampled_sentences = usable.sample(n=min(n, len(usable)), random_state=random_state)
  first_words = sampled_sentences['Sentence'].apply(lambda x: x.split()[0])  # Extract first word
  # Create a new DataFrame from the extracted words and the group label
  result_df = pd.DataFrame({'First_Word': first_words, 'Label': label})
  return result_df

# Run the sampler once per author group, then flatten the grouped index and
# discard the two index-derived columns so only First_Word / Label remain.
randome_start_words_gpt3 = (
    grouped.apply(sample_and_extract, include_groups=False)
           .reset_index()
           .drop(columns=['Label_(number)', 'level_1'])
)

In [None]:
randome_start_words_gpt3

In [None]:
import torch
from tqdm.notebook import tqdm
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# A decoder-only model continues from the last position of every row, so a
# batch of unequal-length prompts has to be padded on the left. With right
# padding the shorter prompts would be continued from padding tokens.
tokenizer.padding_side = "left"

# One prompt per (start word, author tag) pair: every sampled word is combined with every tag
prompts = [f"{tag}{s.split()[0]}"
           for s in randome_start_words_gpt3["First_Word"].astype(str)
           for tag in tages]

batch_size = 64        # reported as tested OK on an A100
max_new_tokens = 64

generated = []
# float16 autocast is only meaningful on CUDA; it is switched off on a CPU-only machine.
with torch.inference_mode(), torch.amp.autocast("cuda", dtype=torch.float16, enabled=(device.type == "cuda")):
    for i in tqdm(range(0, len(prompts), batch_size), desc="Generating (batched)", unit="batch"):
        batch = prompts[i:i+batch_size]
        # Tokenize the batch and move the tensors to the model's device
        enc = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        out = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
        texts = tokenizer.batch_decode(out, skip_special_tokens=True)
        generated.extend(texts)

generated_texts_gpt3 = pd.DataFrame({"Text": generated})


In [None]:
# Display the generated texts
generated_texts_gpt3

In [None]:
# Print the first 50 generated texts in full
for text in generated_texts_gpt3["Text"].head(50):
    print(text)

In [None]:
# Save to CSV (without index column)
generated_texts_gpt3.to_csv("generated_texts_gpt3.csv", index=False)

#  Generator (GPT-3 + LoRA)

## ---- Configuration:

In [None]:
import torch
from transformers import GPT2Tokenizer, GPTNeoForCausalLM, Trainer, TrainingArguments
from datasets import Dataset as HFDataset
from datetime import datetime
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import pandas

In [None]:
# Number of samples per author, followed by the overall total:

num_samples = dataset_5A['Author'].value_counts()

display(num_samples)

print("\nSum = ", num_samples.sum())



In [None]:
# Prepare dataset for fine-tuning: keep the tagged column and rename it to 'text'
dataset_text = dataset_5A[['Merged']].rename(columns={'Merged': 'text'})

In [None]:
# Load the pre-trained tokenizer (the model itself is loaded in the next cell)
model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [None]:
# Load the base model, make sure the tokenizer owns a dedicated padding token,
# then size the embedding matrix to the (possibly extended) vocabulary.
model = GPTNeoForCausalLM.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model.resize_token_embeddings(len(tokenizer))

In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize texts for causal-LM fine-tuning and attach training labels.

    Args:
        examples: Mapping with a 'text' entry. With ``Dataset.map(..., batched=True)``
            this is a list of strings; a single string is handled as well.

    Returns:
        The tokenizer encoding (padded / truncated to 128 tokens) with an added
        'labels' entry. Labels equal the input ids, except that padded
        positions (attention mask 0) are replaced by -100 so the loss ignores
        them instead of training the model to predict padding.
    """
    encoding = tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

    input_ids = encoding['input_ids']
    attention = encoding['attention_mask']

    def _mask_padding(ids, mask):
        # -100 is the label value skipped by the cross-entropy loss.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    if input_ids and isinstance(input_ids[0], int):
        # Un-batched call: a single flat list of token ids.
        encoding['labels'] = _mask_padding(input_ids, attention)
    else:
        encoding['labels'] = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)]
    return encoding

In [None]:
# Convert the pandas DataFrame (single 'text' column) to a Hugging Face Dataset
hf_dataset_5A = HFDataset.from_pandas(dataset_text)

In [None]:
# Apply tokenization (batched=True hands tokenize_function a batch of texts per call)
tokenized_datasets = hf_dataset_5A.map(tokenize_function, batched=True)

## ---- LoRA Configuration:

In [None]:
# Define LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                # rank of the update matrices
    lora_alpha=32,      # alpha parameter for LoRA scaling
    lora_dropout=0.1,   # dropout probability for the LoRA layers
    bias="none",        # bias parameters are not trained
    # Adapters target the four attention projections only
    target_modules=[
        "attn.attention.q_proj",
        "attn.attention.v_proj",
        "attn.attention.k_proj",
        "attn.attention.out_proj",
    ],
)

In [None]:
# Prepare the model with LoRA adapters
# NOTE(review): prepare_model_for_kbit_training targets k-bit (quantized) models, while the
# model above is loaded without any quantization arguments — confirm this call is still wanted.
model = prepare_model_for_kbit_training(model)
# Wrap the base model so that the LoRA adapters defined in lora_config are attached
model = get_peft_model(model, lora_config)

In [None]:
# Print trainable parameters info
model.print_trainable_parameters()

## ---- Training:

In [None]:
# Split dataset into train and eval (80 / 20).
# The rows are shuffled before splitting: a plain head/tail slice keeps the
# original row order, so if the rows are grouped by author the trailing
# author(s) would land only in the eval set and never be trained on.
# The fixed seed keeps the split reproducible across runs.
_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = _split["train"]
eval_dataset = _split["test"]
train_size = len(train_dataset)  # kept for cells that still read train_size

In [None]:
# Run identity and root folder for every artefact of this LoRA run
base_dir = 'stylometry' # Local directory for saving
run_name = "writing_style_lora_v1"
current_date = datetime.now().strftime("%Y.%m.%d")

# Date-stamped output locations, all inside the LoRA model folder under base_dir
lora_model_path, drive_tokenizer_path, output_dir, logging_dir = (
    f'{base_dir}/fine_tuned_model_gptNeo_1.3B_lora/{kind}_{current_date}'
    for kind in ('saved_model', 'saved_tokenizer', 'results', 'logs')
)


In [None]:
# Echo the run name and every output location for a quick visual check
print(run_name,":")
print(lora_model_path, drive_tokenizer_path, output_dir, logging_dir, sep="\n")

In [None]:
#pip install wandb

In [None]:
import wandb

In [None]:
#!WANDB_START_METHOD=thread

In [None]:
#!WANDB_HTTP_TIMEOUT=300

In [None]:
# Initialize the wandb session (the project is named after the run)
# NOTE(review): the TrainingArguments cell of this section sets report_to="none", so this
# session is presumably unused by the Trainer; it also runs before `wandb login` — confirm.
wandb.init(project=run_name, entity="niu")

In [None]:
# Authenticate the wandb command-line client
!wandb login

In [None]:

# Define training arguments - reduced batch size for memory efficiency
training_args = TrainingArguments(
    # --- run identity and output locations ---
    run_name=run_name,
    output_dir=output_dir,
    logging_dir=logging_dir,
    report_to="none",                 # no metrics reporting integration
    # --- optimisation schedule ---
    num_train_epochs=3,               # same as the full fine-tuning run
    per_device_train_batch_size=8,    # reduced for memory efficiency
    per_device_eval_batch_size=8,
    warmup_steps=1000,
    weight_decay=0.01,
    fp16=False,
    # --- checkpointing ---
    save_strategy="steps",
    save_steps=2000,
    save_total_limit=5,
    # --- evaluation ---
    eval_strategy="steps",
    eval_steps=5000,
)

In [None]:
# Build the Trainer for the LoRA-wrapped model with the train and eval splits
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_dataset, eval_dataset=eval_dataset)

In [None]:
import torch
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from torchinfo import summary

# Prefer the GPU when one is present, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# A short dummy sentence gives torchinfo concrete input ids on the model's device
input_text = "Hello, this is a test input for model summary."
input_ids = tokenizer(input_text, return_tensors='pt')["input_ids"].to(device)

# Layer-by-layer summary of the model
summary(model, input_data=input_ids)

In [None]:
# Fine-tune the model
trainer.train()

In [None]:
# Saving ===================================================
# Write the LoRA-wrapped model and the tokenizer to their date-stamped run folders
model.save_pretrained(lora_model_path)
tokenizer.save_pretrained(drive_tokenizer_path)

print(f"Model saved to: {lora_model_path}")
print(f"Tokenizer saved to: {drive_tokenizer_path}")

## ---- Loading:

In [None]:
# Artefact locations of the LoRA run that is loaded for generation (run stamp 2025.08.09).
# NOTE(review): unlike the training cell, these paths carry no base_dir prefix — confirm
# they resolve from the current working directory.
lora_model_path, drive_tokenizer_path, output_dir, logging_dir = (
    f'fine_tuned_model_gptNeo_1.3B_lora/{kind}_2025.08.09'
    for kind in ('saved_model', 'saved_tokenizer', 'results', 'logs')
)

In [None]:
#output_dir           = ''
#logging_dir          = ''

In [None]:
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from peft import PeftModel, PeftConfig

# Load the model and tokenizer
# The saved adapter config records which base checkpoint the adapter belongs to
peft_config = PeftConfig.from_pretrained(lora_model_path)
base_model = GPTNeoForCausalLM.from_pretrained(peft_config.base_model_name_or_path)
tokenizer = GPT2Tokenizer.from_pretrained(drive_tokenizer_path)
# Match the embedding size to the saved tokenizer before the adapter is attached
base_model.resize_token_embeddings(len(tokenizer))
model = PeftModel.from_pretrained(base_model, lora_model_path)

In [None]:
# Generate text
def generate_text(input_text, model, tokenizer, max_length=50, num_return_sequences=1):
    """Generate continuations of ``input_text`` with ``model``.

    Args:
        input_text: Prompt string (e.g. an author tag followed by a start word).
        model: Object exposing ``generate``. When it has a ``device`` attribute,
            the prompt tensors are moved to that device first.
        tokenizer: Object providing ``encode``, ``decode`` and ``eos_token_id``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_return_sequences: Forwarded to ``model.generate``.

    Returns:
        List with one decoded string (special tokens removed) per generated sequence.
    """
    inputs = tokenizer.encode(input_text, return_tensors='pt')
    # A single un-padded prompt: every position is a real token.
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)

    # Keep the prompt on the same device as the model; without this the call
    # fails as soon as the model has been moved to the GPU.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = inputs.to(device)
        attention_mask = attention_mask.to(device)

    outputs = model.generate(inputs,
                             max_length=max_length,
                             num_return_sequences=num_return_sequences,
                             pad_token_id=tokenizer.eos_token_id,
                             attention_mask=attention_mask)
    generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    return generated_texts

In [None]:
# Author tags used as prompt prefixes; the number is the dataset label:
#   0 = Charles Dickens, 1 = Jane Austen, 2 = Mark Twain,
#   3 = Louisa May Alcott, 4 = Herman Melville
tages = [f"<{label}> " for label in range(5)]

In [None]:
# Example usage: generate one continuation of the same start word for every author tag

start_with = "Hi"

for tag in tages:
    # The prompt is the author tag directly followed by the start word
    input_text = tag + start_with
    generated_texts = generate_text(input_text, model, tokenizer)
    # Only one sequence is requested by default, so show the first entry
    print(generated_texts[0])

In [None]:
# The "first word sampling" step is omitted here so that "randome_start_words_gpt3" stays identical to the one used for GPT-3 FFT.

In [None]:
randome_start_words_gpt3

In [None]:
import torch
from tqdm.notebook import tqdm
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# A decoder-only model continues from the last position of every row, so a
# batch of unequal-length prompts has to be padded on the left. With right
# padding the shorter prompts would be continued from padding tokens.
tokenizer.padding_side = "left"

# One prompt per (start word, author tag) pair: every sampled word is combined with every tag
prompts = [f"{tag}{s.split()[0]}"
           for s in randome_start_words_gpt3["First_Word"].astype(str)
           for tag in tages]

batch_size = 64        # reported as tested OK on an A100
max_new_tokens = 64

generated = []
# float16 autocast is only meaningful on CUDA; it is switched off on a CPU-only machine.
with torch.inference_mode(), torch.amp.autocast("cuda", dtype=torch.float16, enabled=(device.type == "cuda")):
    for i in tqdm(range(0, len(prompts), batch_size), desc="Generating (batched)", unit="batch"):
        batch = prompts[i:i+batch_size]
        # Tokenize the batch and move the tensors to the model's device
        enc = tokenizer(batch, padding=True, truncation=True, return_tensors="pt").to(device)
        out = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
        texts = tokenizer.batch_decode(out, skip_special_tokens=True)
        generated.extend(texts)

generated_texts_gpt3_lora = pd.DataFrame({"Text": generated})


In [None]:
# Display the generated texts
generated_texts_gpt3_lora

In [None]:
# Print the first 50 generated texts in full
for text in generated_texts_gpt3_lora["Text"].head(50):
    print(text)

In [None]:
# Save to CSV (without index column)
generated_texts_gpt3_lora.to_csv("generated_texts_gpt3_lora.csv", index=False)

In [None]:
# NOTE(review): these widget packages are installed at the very end, after tqdm.notebook has
# already been used by the generation cells — consider moving this to the installation section.
!pip install notebook ipywidgets==8.1.2

In [None]:
# Enable the ipywidgets notebook extension
!jupyter nbextension enable --py widgetsnbextension