In [1]:
import pdfplumber
import re
import spacy
import sqlite3
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling

# Function to extract text from PDF
def extract_text_from_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path``.

    Args:
        file_path: Path (or file-like object) accepted by ``pdfplumber.open``.

    Returns:
        A single string with the text of each page that produced any text,
        joined by newlines. Returns ``""`` when no page produced text.
    """
    with pdfplumber.open(file_path) as pdf:
        page_texts = [page.extract_text() for page in pdf.pages]
    # A page without extractable text can come back as None or "" depending
    # on the pdfplumber release; skip those instead of crashing on `str + None`.
    # Joining with a newline keeps the last word of one page from fusing with
    # the first word of the next.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Function to clean extracted text
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    ``\\s`` already covers newlines, tabs and carriage returns, so one
    substitution handles both repeated spaces and line breaks. Leading and
    trailing whitespace is reduced to one space, not stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        ``text`` with each whitespace run replaced by ``' '``.
    """
    return re.sub(r'\s+', ' ', text)

# Function to chunk text
def chunk_text(text, chunk_size=100):
    """Lazily yield ``text`` in pieces of at most ``chunk_size`` words.

    The text is split on whitespace; each yielded piece is the next
    ``chunk_size`` words re-joined with single spaces. The final piece may
    be shorter, and empty or whitespace-only text yields nothing.
    """
    tokens = text.split()
    window_starts = range(0, len(tokens), chunk_size)
    yield from (
        ' '.join(tokens[start:start + chunk_size])
        for start in window_starts
    )

# Function to process and store financial data using spaCy
def process_and_store_data(text, db_name="financial_data.db"):
    """Extract named entities from ``text`` with spaCy and store them in SQLite.

    The text is processed chunk by chunk (see ``chunk_text``). Every entity
    labelled ORG, MONEY or DATE is inserted into the ``FinancialData`` table,
    with the entity text in the ``entity`` column and its spaCy label in the
    ``value`` column. The table is created if it does not exist yet.

    Args:
        text: The (already cleaned) text to analyse.
        db_name: Path of the SQLite database file to write to.

    Raises:
        Any error raised by spaCy or sqlite3 is propagated. The connection is
        closed first; chunks committed before the failure stay in the database.
    """
    wanted_labels = ("ORG", "MONEY", "DATE")
    nlp = spacy.load("en_core_web_sm")
    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()

        # Create table if not exists
        cursor.execute('''CREATE TABLE IF NOT EXISTS FinancialData
                          (entity TEXT, value TEXT)''')

        for chunk in chunk_text(text):
            doc = nlp(chunk)
            rows = [
                (ent.text, ent.label_)
                for ent in doc.ents
                if ent.label_ in wanted_labels
            ]
            cursor.executemany(
                "INSERT INTO FinancialData (entity, value) VALUES (?, ?)", rows
            )
            conn.commit()  # Commit after each chunk to save progress
    finally:
        # Always release the database handle, even when a chunk fails midway.
        conn.close()

# Main workflow
# Input document, resolved relative to the notebook's working directory.
# Change this one constant to run the pipeline on a different PDF.
PDF_PATH = "nazara_tech.pdf"

# Extract the raw text, normalise its whitespace, then store the detected
# entities in the default SQLite database of process_and_store_data.
pdf_text = extract_text_from_pdf(PDF_PATH)
cleaned_text = clean_text(pdf_text)
process_and_store_data(cleaned_text)





  from .autonotebook import tqdm as notebook_tqdm





In [2]:
import tempfile
from transformers import TrainingArguments


# Persist the cleaned text to a UTF-8 temporary file. delete=False keeps the
# file on disk after the handle is closed, so it can be reopened by path.
temp_file = tempfile.NamedTemporaryFile(mode="w", delete=False, encoding="utf-8")
with temp_file:
    temp_file.write(cleaned_text)
temp_file_path = temp_file.name

In [3]:

from transformers import Trainer

# Load the pretrained GPT-2 tokenizer and language-model head
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 ships without a padding token. Reuse the end-of-text token instead of
# registering a new '[PAD]' entry: a new entry enlarges the tokenizer's
# vocabulary, while the model's embedding matrix is never resized to match,
# so the saved tokenizer would hold an id the model has no embedding for.
tokenizer.pad_token = tokenizer.eos_token

# One-off encoding of the whole cleaned text.
# NOTE(review): this tensor is not passed to the Trainer below; it is kept
# only in case other cells read `encoded_data` - confirm and drop if unused.
encoded_data = tokenizer(cleaned_text, return_tensors="pt", padding=True, truncation=True)

# Training examples are built from the temporary text file
dataset = TextDataset(file_path=temp_file_path, tokenizer=tokenizer, block_size=128)  # Adjust block size as needed

# mlm=False selects the causal (next-token) language-modelling objective
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./output_dir',  # Specify the output directory
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs',
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

# Fine-tune the model; the Trainer switches the model to training mode itself
trainer.train()

# Save the fine-tuned model together with its tokenizer
model.save_pretrained("financial_chatbot_model")
tokenizer.save_pretrained("financial_chatbot_model")


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
 28%|██▊       | 343/1239 [18:52:26<929:32:11, 3734.74s/it] 