In [None]:
from transformers import pipeline

# Smoke test: build a sentiment pipeline without naming a checkpoint and print its verdict.
smoke_test_pipe = pipeline('sentiment-analysis')
print(smoke_test_pipe('good day, good life.'))

In [None]:
### sentiment-analysis
# No checkpoint is passed here, only the task name.
classifier = pipeline(task='sentiment-analysis')

In [None]:
# Keep the input in a named variable so the call itself stays short.
course_review = "The course was really great, the mentor explained every concept in detail"
res = classifier(course_review)
res

In [None]:
### text-generation
# The checkpoint is named explicitly for this task.
generator = pipeline(
    task='text-generation',
    model='distilgpt2',
)


In [None]:
# The prompt deliberately has NO trailing space. GPT-2-style BPE tokens carry
# their own leading space, so a prompt ending in " " leaves a dangling
# whitespace token and the model tends to continue with junk characters
# (see the recorded output in the next cell).
prompt = "once upon a time there was a king"

res = generator(
    prompt,
    max_length=100,          # upper bound on total length (prompt included)
    num_return_sequences=2,  # ask for two continuations
)

res

In [None]:
"""{'generated_text': 'once upon a time there was a king ㅠㅠㅠㅠㅠㅠㅠㅠㅠ㙠ㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠㅠ�'},
 {'generated_text': "once upon a time there was a king \u0bfe\u0bfe\u0b81\u0bfe\u0bfe.\n‹ I have not even heard much of the history of what is happening in India. How shall we ever be informed of such an event, if any?\nI don't know, how far away we can be before the world's eye is set, but it will be on us. What does this mean? I think that it is because in a nation governed"}]"""

In [None]:
### zero-shot-classification
# Rebinds `classifier`: from this cell on it is the zero-shot pipeline.
classifier = pipeline(task='zero-shot-classification')

In [None]:
# Labels the sentence is scored against.
topic_labels = ['Education', 'politics', 'Engineering']

res = classifier(
    "this is course about python list comprehension",
    candidate_labels=topic_labels,
)

In [None]:
res

### Tokenizer / Model

In [None]:
from transformers import  pipeline
from transformers import  AutoTokenizer, AutoModelForSequenceClassification

In [None]:
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Hand the already-loaded objects to the pipeline instead of a checkpoint name.
classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)



In [None]:
# A negated negative ("not terrible") used as a probe sentence.
restaurant_review = 'The restaurant is not terrible'
res = classifier(restaurant_review)
res

In [None]:
sq = "Tokenizers split input text into smaller units called tokens. These tokens can be words, subwords, or even characters."

# Call the tokenizer directly on the sentence and display the encoding it returns.
toks = tokenizer(text=sq)
toks

In [None]:
# Turn the ids back into text to inspect the round trip.
input_ids = toks['input_ids']
tokenizer.decode(input_ids)

#### PyTorch

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

In [None]:
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

# Load the two halves of the checkpoint in one statement.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModelForSequenceClassification.from_pretrained(model_name),
)


In [None]:
# Sentiment pipeline built from the model and tokenizer loaded in the previous cell.
classifier = pipeline(
    task='sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

In [None]:
# Five sample texts of different lengths, used by the pipeline and batch cells below.
x_train = [
    "Tokenizers split input text into smaller units called tokens. These tokens can be words, subwords, or even characters.",
    "The goal is to convert raw text into a format that machine learning models can process. Since models typically work with numerical data, tokenizers play a crucial role in this conversion.",
    "Incredible Chinese, Japanese, and Sushi dishes. The ambiance is modern and chic.",
    "Can get crowded during peak hours.",
    "Elegant Indian cuisine with a colonial touch.",
]

In [None]:
#### Normal flow
# Pass the whole list to the pipeline in one call and display what comes back.
batch_results = classifier(x_train)
batch_results

In [None]:
#### with batches

# Options collected in one place: pad, truncate at 512 tokens, return "pt" tensors.
tokenize_kwargs = dict(padding=True, truncation=True, max_length=512, return_tensors="pt")
batch = tokenizer(x_train, **tokenize_kwargs)
batch

In [None]:
#### inference in pytorch

# Forward pass with gradient tracking switched off; each intermediate is printed.
with torch.no_grad():
    outputs = model(**batch)
    print(outputs)

    # Softmax over dim 1 of the logits.
    predictions = outputs.logits.softmax(dim=1)
    print(predictions)

    # Index of the largest probability in each row.
    labels = predictions.argmax(dim=1)
    print(labels)

#### Save / Load Tokenizer & Model

In [None]:
### saving
# Tokenizer and model are both written to the same directory.
save_dir = './tokenzr'
for artifact in (tokenizer, model):
    artifact.save_pretrained(save_dir)

In [None]:
### loading
# Restore both objects from `save_dir`; `model` is rebound to the reloaded copy.
tok, model = (
    AutoTokenizer.from_pretrained(save_dir),
    AutoModelForSequenceClassification.from_pretrained(save_dir),
)


### **FineTune**

##### *Prepare dataset*

In [None]:
from datasets import load_dataset, load_from_disk
import os

# First run: download and store a local copy. Later runs: read the local copy.
if not os.path.exists('./yelp_review_full'):
    dataset = load_dataset("yelp_review_full")
    dataset.save_to_disk("./yelp_review_full")
else:
    print("loading from disk : ./yelp_review_full")
    dataset = load_from_disk('./yelp_review_full')

# Peek at a single training example.
dataset["train"][100]

In [None]:
# Compare the type of a whole split with the type returned by slicing it.
train_split = dataset["train"]
type(train_split), type(train_split[0:10])

In [None]:
dataset["train"][1:10]

##### *Tokenization*
A tokenizer is needed to process the text, together with a padding and truncation strategy to handle variable sequence lengths.

In [None]:
from transformers import AutoTokenizer

`padding='max_length':`
The tokenizer pads the tokenized text with zeros (i.e., [PAD] tokens) so that all sequences in the batch have the same length.
If a sequence is shorter than max_length after the [CLS] and [SEP] tokens are added, padding is appended until it reaches max_length.
For example, with max_length=10 the tokenized text might look like: [101, 2026, 2171, 2003, 11754, 102, 0, 0, 0, 0], where 101 is the [CLS] token, 102 is the [SEP] token, and the trailing 0s are [PAD] tokens.



`truncation=True:`
With truncation=True, sequences longer than max_length are cut down to exactly max_length.
Together with padding, this ensures that all input sequences have consistent lengths, which is crucial for tasks like classification.

In [None]:

# Rebinds `tokenizer` to the cased BERT checkpoint used in this section.
bert_checkpoint = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is called with ``padding="max_length"`` and ``truncation=True``
    and its result is returned unchanged.
    """
    texts = examples["text"]
    encode_options = {"padding": "max_length", "truncation": True}
    return tokenizer(texts, **encode_options)


# Tokenize every split in one pass with Dataset.map (batched), caching the result on disk
# so later runs can skip the work.

if not os.path.exists('./tokenized_datasets'):
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    tokenized_datasets.save_to_disk("./tokenized_datasets")
else:
    print("loading from disk : ./tokenized_datasets")
    tokenized_datasets = load_from_disk('./tokenized_datasets')



In [None]:
### create a smaller subset of the full dataset to fine-tune on to reduce the time it takes:

# Same recipe for both splits: shuffle with seed 42, keep the first 1000 rows.
small_train_dataset, small_eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42).select(range(1000))
    for split_name in ("train", "test")
)

### *Train with PyTorch Trainer*

##### fine-tuning for sequence classification task

In [None]:
from transformers import AutoModelForSequenceClassification


### Start by loading your model and specify the number of expected labels
### There are 5 labels

# First run: load from the checkpoint name and keep a local copy. Later runs: load the local copy.
if not os.path.exists('./google-bert_bert-base-cased'):
    model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5)
    model.save_pretrained('./google-bert_bert-base-cased')
else:
    model = AutoModelForSequenceClassification.from_pretrained('./google-bert_bert-base-cased', num_labels=5)

#####  *Training hyperparameters*

Next, we create a `TrainingArguments` object, which holds all the training hyperparameters.
Here we use the default training hyperparameters.


In [None]:
### Specify where to save the checkpoints from your training:

from transformers import TrainingArguments

# Only the output directory is set; nothing else is overridden.
training_args = TrainingArguments(
    output_dir="./test_trainer",
)

#### *Evaluate*<br>
`Trainer` does not automatically evaluate model performance during training. We need to pass `Trainer` a function that computes and reports metrics.<br>
The `Evaluate` library provides a simple accuracy function that you can load with `evaluate.load()`.

In [None]:
import evaluate
import numpy as np

# Load the metric by name; `compute_metrics` below reads this module-level object.
metric = evaluate.load("accuracy")

Call compute on metric to calculate the accuracy of your predictions. Before passing your predictions to compute, you need to convert the logits to predictions (remember all 🤗 Transformers models return logits)

In [None]:
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` is unpacked as ``(logits, labels)``. The index of the largest
    value along the last axis of ``logits`` is taken as the predicted class,
    and the result of ``metric.compute`` is returned as-is.
    """
    raw_scores, gold_labels = eval_pred
    predicted_classes = np.asarray(raw_scores).argmax(axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

To monitor the evaluation metrics during fine-tuning, specify the `evaluation_strategy` parameter in your training arguments to report the evaluation metric at the end of each epoch:

In [None]:
from transformers import TrainingArguments, Trainer

# Replaces the earlier `training_args`, this time with evaluation at every epoch.
# NOTE(review): newer transformers releases appear to name this keyword
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="test_trainer",
    evaluation_strategy="epoch",
)

#### *Trainer*
Create a `Trainer` object with the model, training arguments, training and test datasets, and evaluation function

In [None]:
# Everything the Trainer needs, gathered in one mapping and then unpacked.
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_config)

In [None]:
# Then fine-tune the model by calling train().
# As the last expression in the cell, the value returned by train() is displayed
# (a sample of that output is pasted as a string two cells below).
trainer.train()

In [None]:
# Write the fine-tuned model to its own directory.
finetuned_dir = './bert_base_cased_finetuned'
trainer.save_model(finetuned_dir)

In [None]:
"""
{'eval_loss': 1.0237584114074707, 'eval_accuracy': 0.578, 'eval_runtime': 19.2998, 'eval_samples_per_second': 51.814, 'eval_steps_per_second': 6.477, 'epoch': 3.0}
{'train_runtime': 211.9045, 'train_samples_per_second': 14.157, 'train_steps_per_second': 1.77, 'train_loss': 1.033949951171875, 'epoch': 3.0}

TrainOutput(global_step=375, training_loss=1.033949951171875, metrics={'train_runtime': 211.9045, 'train_samples_per_second': 14.157, 'train_steps_per_second': 1.77, 'total_flos': 789354427392000.0, 'train_loss': 1.033949951171875, 'epoch': 3.0})"""

### Finetuning

In [None]:
from transformers import Trainer, TrainingArguments, AutoTokenizer #, GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_dataset, load_from_disk
import os
from transformers import TextDataset, DataCollatorForLanguageModeling



# Load pre-trained GPT-2 model and tokenizer
# model_name = "gpt2"
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPT2LMHeadModel.from_pretrained(model_name)


from llmsherpa.readers import LayoutPDFReader

llmsherpa_api_url = "https://readers.llmsherpa.com/api/document/developer/parseDocument?renderFormat=all"

# When enabled, the OCR query parameter is appended to the endpoint URL.
do_ocr = True
if do_ocr:
    llmsherpa_api_url += "&applyOcr=yes"

pdf_reader = LayoutPDFReader(llmsherpa_api_url)

def extract_text_from_pdf(file_path):
    """Return the text of every chunk of a PDF, concatenated with no separator.

    Args:
        file_path: Forwarded unchanged to ``pdf_reader.read_pdf``.
            NOTE(review): presumably a filesystem path or URL string rather
            than an open file object — confirm against the reader's docs.

    Returns:
        str: ``chunk.to_text()`` for each chunk of the parsed document, joined
        in iteration order; an empty string when there are no chunks.
    """
    doc_obj = pdf_reader.read_pdf(file_path)
    # str.join builds the result in one pass instead of repeated `+=` on a growing string.
    return ''.join(chunk.to_text() for chunk in doc_obj.chunks())
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# One checkpoint name shared by the tokenizer and the seq2seq model.
flan_checkpoint = "google/flan-t5-base"
tokenizer_flan = AutoTokenizer.from_pretrained(flan_checkpoint)
model_flan = AutoModelForSeq2SeqLM.from_pretrained(flan_checkpoint)


# Process PDF data from books and user manuals
pdf_data_path = "./test_file"

# `extract_text_from_pdf` takes a *path* and forwards it to the reader, so the
# path is passed directly. The previous version opened each file and passed the
# binary file handle instead.
# sorted() makes the concatenation order deterministic (os.listdir order is arbitrary),
# and lower() also picks up files named *.PDF.
pdf_texts = [
    extract_text_from_pdf(os.path.join(pdf_data_path, filename))
    for filename in sorted(os.listdir(pdf_data_path))
    if filename.lower().endswith(".pdf")
]
text_data = "".join(pdf_texts)

# Local import keeps this cell self-contained; `datasets` is already used by this notebook.
from datasets import Dataset

# Fail early with a clear message instead of handing the Trainer an empty dataset.
if not text_data.strip():
    raise ValueError(f"No text was extracted from the PDFs in {pdf_data_path!r}")

# Tokenize the text data
# Plain Python lists (no return_tensors) so the ids can be sliced into blocks below.
tokenized_text = tokenizer_flan(text_data, add_special_tokens=False)

# Split the single long id sequence into fixed-size training examples.
# (TextDataset expects `tokenizer` and `file_path`; passing the encoding positionally
# while also passing `tokenizer=` supplied the `tokenizer` argument twice.)
block_size = 512
token_ids = tokenized_text["input_ids"]
dataset = Dataset.from_dict(
    {"input_ids": [token_ids[start:start + block_size] for start in range(0, len(token_ids), block_size)]}
)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
)

# Define Trainer for unsupervised fine-tuning
# - model_flan: the model loaded above for `tokenizer_flan` (the bare `model` name
#   still points at the BERT classifier from the earlier section).
# - mlm=False: the collator's default is masked-LM, which needs a mask token.
#   With mlm=False the labels are a copy of the inputs.
#   NOTE(review): this is a plain reconstruction objective, not T5-style span
#   corruption — confirm it is the objective you want.
trainer = Trainer(
    model=model_flan,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer_flan, mlm=False),
    train_dataset=dataset,
)

# Perform unsupervised fine-tuning
trainer.train()



