In [None]:
!pip install transformers[torch]
!pip install accelerate -U



In [None]:
!pip install -q PyMuPDF nltk

In [None]:
!pip install -q transformers torch datasets

In [None]:
# !pip install accelerate==0.21.0

In [None]:
!pip install --upgrade transformers accelerate



In [None]:
import fitz
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The document is opened with ``fitz.open`` and each page's ``get_text()``
    result is concatenated in page order, with no separator added between pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return ''.join(page_texts)

def clean_text(text):
    """Replace each run of digits and/or non-word characters with a single space.

    Word characters (letters and underscore) are kept; digits, punctuation and
    whitespace runs each collapse to one ``" "``.
    """
    noise_pattern = re.compile("(\\d|\\W)+")
    return noise_pattern.sub(" ", text)

def preprocess_text(text):
    """Split ``text`` into sentences and return them cleaned.

    Sentence splitting runs on the raw text *before* cleaning: ``clean_text``
    removes all punctuation, so cleaning first would leave ``sent_tokenize``
    with no sentence boundaries to find.

    Args:
        text: Raw text (e.g. extracted from a PDF).

    Returns:
        A list of cleaned sentences with surrounding whitespace stripped.
        Sentences that are empty after cleaning are dropped.
    """
    sentences = []
    for raw_sentence in sent_tokenize(text):
        # Strip the leading/trailing space left where punctuation was replaced.
        cleaned_sentence = clean_text(raw_sentence).strip()
        if cleaned_sentence:
            sentences.append(cleaned_sentence)
    return sentences

def preprocess_pdfs(pdf_paths):
    """Extract and preprocess every PDF in ``pdf_paths``.

    Returns a single flat list holding the sentences of all documents, in the
    order the paths were given.
    """
    return [
        sentence
        for path in pdf_paths
        for sentence in preprocess_text(extract_text_from_pdf(path))
    ]

# PDFs to turn into training text.
pdf_paths = ["/content/drive/MyDrive/pdf_documents/Undergraduate Student Handbook 2022-2023.pdf"]
preprocessed_data = preprocess_pdfs(pdf_paths)

# Persist the result as plain text, one sentence per line.
with open('preprocessed_data.txt', 'w', encoding='utf-8') as f:
    f.writelines("%s\n" % line for line in preprocessed_data)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load model directly
import os
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_ID = "mistralai/Mistral-7B-v0.1"

# SECURITY: never commit an access token to a notebook. The token that used to
# be hardcoded here has been exposed and should be revoked in the Hugging Face
# account settings. Provide a new one through the HF_TOKEN environment variable
# (or a Colab secret of the same name). When it is unset, `token` is None and
# the Hugging Face libraries fall back to their own credential lookup.
token = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=token)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=token)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
from transformers import TextDataset, DataCollatorForLanguageModeling

def load_dataset(train_path, tokenizer, block_size=128):
    """Build a ``TextDataset`` over the text file at ``train_path``.

    ``tokenizer`` and ``block_size`` (default 128) are forwarded unchanged to
    the ``TextDataset`` constructor.
    """
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": train_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

def load_data_collator(tokenizer, mlm=False):
    """Create a ``DataCollatorForLanguageModeling`` for ``tokenizer``.

    ``mlm`` is passed straight through to the collator and defaults to False.
    """
    collator = DataCollatorForLanguageModeling(mlm=mlm, tokenizer=tokenizer)
    return collator

# Use the same relative path the preprocessing cell writes to
# ('preprocessed_data.txt'), so the file is found whatever the current working
# directory is, not only when it happens to be /content.
train_path = 'preprocessed_data.txt'
block_size = 128

# Tokenized training blocks and the batch collator passed to the Trainer.
dataset = load_dataset(train_path, tokenizer, block_size)
data_collator = load_data_collator(tokenizer)




In [None]:
!pip install accelerate -U



In [None]:
from transformers import Trainer, TrainingArguments

# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error on a ~16 GiB GPU. Full fine-tuning of a 7B-parameter model presumably
# does not fit on such a device; a smaller model or a parameter-efficient
# method is likely needed — confirm before relying on this cell.
training_args = TrainingArguments(
    output_dir="./model_finetuned",  # Directory for checkpoints and the fine-tuned model
    overwrite_output_dir=True,
    num_train_epochs=3,  # Adjust epochs according to your dataset size and needs
    per_device_train_batch_size=4,  # Adjust based on your GPU memory
    save_strategy="steps",
    save_steps=10_000,  # Save checkpoint every 10,000 steps
    save_total_limit=2,  # Only keep the last 2 checkpoints
    logging_dir='./logs',  # Directory for training logs
    logging_steps=100,  # Log metrics every 100 steps
    # No evaluation dataset is defined in this notebook, so step-based
    # evaluation and load_best_model_at_end (which needs evaluation metrics)
    # are intentionally not enabled.
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Actually run fine-tuning; without this call the files saved below would just
# be the unmodified base model.
trainer.train()

model.save_pretrained("./model_finetuned")
tokenizer.save_pretrained("./model_finetuned")


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 12.38 MiB is free. Process 107935 has 15.76 GiB memory in use. Of the allocated memory 15.46 GiB is allocated by PyTorch, and 1.46 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Directory the training cell saved the model and tokenizer to.
model_name_or_path = "./model_finetuned"

# Reload both artifacts from disk; the two loads are independent of each other.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)


In [None]:
def ask_question(question, model, tokenizer):
    """Generate a text continuation for ``question`` with a causal language model.

    Args:
        question: Prompt string to feed to the model.
        model: Model exposing ``generate`` and a ``device`` attribute.
        tokenizer: Tokenizer matching ``model``. It is not modified.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        removed.
    """
    # Calling the tokenizer (instead of ``encode``) also yields the attention
    # mask, which ``generate`` needs to tell real tokens from padding.
    encoded = tokenizer(question, return_tensors='pt')
    device = model.device
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    # Tokenizers without a pad token fall back to EOS. The id is passed to
    # ``generate`` directly rather than by mutating the caller's tokenizer,
    # which would not reach ``generate`` anyway.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=50,
        num_return_sequences=1,
        do_sample=True,  # temperature only takes effect when sampling is enabled
        temperature=0.7,
        pad_token_id=pad_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [None]:
# Sample prompt sent to the reloaded model.
question = "What is the fee structure of lums"
answer = ask_question(question, model, tokenizer)

# Show the prompt and the generated text, one labelled line each.
for label, value in (("Question:", question), ("Answer:", answer)):
    print(label, value)