# BudgetBuddy: Personal Finance Chatbot

**Domain-Specific Chatbot using Fine-Tuned FLAN-T5 + RAG**



In [1]:
!pip install -q transformers sentence-transformers faiss-cpu langchain langchain-community tensorflow sacrebleu rouge_score gradio accelerate datasets

ERROR: Could not install packages due to an OSError: [Errno 13] Permission denied: 'C:\\Python312\\Scripts\\get_gprof'
Consider using the `--user` option or check the permissions.


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


## Importing dependencies

In [None]:
import os
import json
import re
import math
import pickle
from pathlib import Path
from datetime import datetime

import pandas as pd
import numpy as np

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM, pipeline

from sentence_transformers import SentenceTransformer
from langchain.docstore.document import Document
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

from sacrebleu import corpus_bleu
from rouge_score import rouge_scorer

print("All imports successful! Using TensorFlow backend.")

  from .autonotebook import tqdm as notebook_tqdm



All imports successful! Using TensorFlow backend.
All imports successful! Using TensorFlow backend.


 3. Setting configuration parameters

In [None]:

DATA_PATH = 'https://raw.githubusercontent.com/lmurayire12/DOMAIN-SPECIFIC-CHATBOT/refs/heads/main/data/personal_transactions%20new.csv'  # adjust if needed
NOTEBOOK_DIR = '/mnt/data/budgetbuddy_notebook'
Path(NOTEBOOK_DIR).mkdir(parents=True, exist_ok=True)

# Model choices
SEQ2SEQ_MODEL = 'google/flan-t5-small'   # fine-tune with TensorFlow
EMB_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'  # embeddings for FAISS

# Training settings
BATCH_SIZE = 8
EPOCHS = 3
LEARNING_RATE = 5e-5
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128


config set


## Loading and cleaning transaction Data

In [22]:
df = pd.read_csv(DATA_PATH, encoding='utf-8')
print('loaded', df.shape)
# Basic cleaning
if 'Date' in df.columns:
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
else:
    df['Date'] = pd.NaT

if 'Amount' in df.columns:
    df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce')
else:
    df['Amount'] = 0.0

# creating SignedAmount if transcation type exists
if 'Transaction Type' in df.columns:
    df['SignedAmount'] = df.apply(lambda r: -abs(r['Amount']) if str(r['Transaction Type']).lower().strip()=='debit' else abs(r['Amount']), axis=1)
else:
    df['SignedAmount'] = df['Amount']

# Cleaning description
if 'Description' in df.columns:
    df['Description_clean'] = df['Description'].astype(str).str.lower().str.replace('[^a-z0-9 ]',' ', regex=True).str.replace('\s+',' ', regex=True).str.strip()
else:
    df['Description_clean'] = ''

# Adding period columns
if df['Date'].notna().any():
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
else:
    df['Year'] = np.nan
    df['Month'] = np.nan

print(df.head(5).to_dict(orient='records'))

  df['Description_clean'] = df['Description'].astype(str).str.lower().str.replace('[^a-z0-9 ]',' ', regex=True).str.replace('\s+',' ', regex=True).str.strip()


loaded (806, 6)
[{'Date': Timestamp('2018-01-01 00:00:00'), 'Description': 'Amazon', 'Amount': 11.11, 'Transaction Type': 'debit', 'Category': 'Shopping', 'Account Name': 'Platinum Card', 'SignedAmount': -11.11, 'Description_clean': 'amazon', 'Year': 2018, 'Month': 1}, {'Date': Timestamp('2018-01-02 00:00:00'), 'Description': 'Mortgage Payment', 'Amount': 1247.44, 'Transaction Type': 'debit', 'Category': 'Mortgage & Rent', 'Account Name': 'Checking', 'SignedAmount': -1247.44, 'Description_clean': 'mortgage payment', 'Year': 2018, 'Month': 1}, {'Date': Timestamp('2018-01-02 00:00:00'), 'Description': 'Thai Restaurant', 'Amount': 24.22, 'Transaction Type': 'debit', 'Category': 'Restaurants', 'Account Name': 'Silver Card', 'SignedAmount': -24.22, 'Description_clean': 'thai restaurant', 'Year': 2018, 'Month': 1}, {'Date': Timestamp('2018-01-03 00:00:00'), 'Description': 'Credit Card Payment', 'Amount': 2298.09, 'Transaction Type': 'credit', 'Category': 'Credit Card Payment', 'Account Name'

## Generating training dataset

In [None]:

train_pairs = []
for year in sorted(df['Year'].dropna().unique()):
    ydf = df[df['Year']==year]
    for month in sorted(ydf['Month'].dropna().unique()):
        mdf = ydf[ydf['Month']==month]
        if mdf.empty:
            continue
        # overall question
        total = mdf['Amount'].sum()
        month_name = datetime(int(year), int(month), 1).strftime('%B')
        q = f"How much did I spend in {month_name} {int(year)}?"
        a = f"You spent ${total:.2f} in {month_name} {int(year)}."
        train_pairs.append({'question': q, 'context': ' '.join(mdf['Description_clean'].head(10).tolist()), 'answer': a, 'intent':'aggregate'})
        # per-category
        for cat, g in mdf.groupby('Category'):
            total_cat = g['Amount'].sum()
            if total_cat==0:
                continue
            q = f"How much did I spend on {cat} in {month_name} {int(year)}?"
            a = f"You spent ${total_cat:.2f} on {cat} in {month_name} {int(year)}, mostly on {g['Description_clean'].mode().iloc[0] if not g['Description_clean'].mode().empty else 'various purchases'}."
            train_pairs.append({'question': q, 'context': ' '.join(g['Description_clean'].head(10).tolist()), 'answer': a, 'intent':'aggregate'})

#  adding some general questions (synthetic)
train_pairs.append({'question':'What are my top 3 spending categories this year?','context':'','answer':'Your top 3 categories are: ' + ', '.join(df.groupby('Category')['Amount'].sum().abs().sort_values(ascending=False).head(3).index.astype(str)), 'intent':'summary'})

# Saving dataset to disk for training/validation split
DATASET_PATH = Path(NOTEBOOK_DIR)/'generated_dataset.jsonl'
with open(DATASET_PATH,'w',encoding='utf-8') as f:
    for rec in train_pairs:
        f.write(json.dumps(rec) + '\n')

print('created', len(train_pairs), 'pairs. sample:', train_pairs[:3])

created 362 pairs. sample: [{'question': 'How much did I spend in January 2018?', 'context': 'amazon mortgage payment thai restaurant credit card payment netflix american tavern hardware store gas company hardware store spotify', 'answer': 'You spent $10094.34 in January 2018.', 'intent': 'aggregate'}, {'question': 'How much did I spend on Coffee Shops in January 2018?', 'context': 'starbucks', 'answer': 'You spent $3.00 on Coffee Shops in January 2018, mostly on starbucks.', 'intent': 'aggregate'}, {'question': 'How much did I spend on Credit Card Payment in January 2018?', 'context': 'credit card payment credit card payment credit card payment credit card payment credit card payment', 'answer': 'You spent $4027.69 on Credit Card Payment in January 2018, mostly on credit card payment.', 'intent': 'aggregate'}]


## Splitting the dataset into training and validation sets

In [6]:
from sklearn.model_selection import train_test_split
pairs = train_pairs
train, valid = train_test_split(pairs, test_size=0.1, random_state=42)
TRAIN_PATH = Path(NOTEBOOK_DIR)/'train.jsonl'
VALID_PATH = Path(NOTEBOOK_DIR)/'valid.jsonl'
with open(TRAIN_PATH,'w',encoding='utf-8') as f:
    for r in train:
        f.write(json.dumps(r)+'\n')
with open(VALID_PATH,'w',encoding='utf-8') as f:
    for r in valid:
        f.write(json.dumps(r)+'\n')
print('train/valid sizes:', len(train), len(valid))

train/valid sizes: 325 37


## Preparing tokenizer and data pipeline

In [None]:
from transformers import AutoTokenizer

print('Loading tokenizer for', SEQ2SEQ_MODEL)
tokenizer = AutoTokenizer.from_pretrained(SEQ2SEQ_MODEL)

MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 128

def encode_example(example):
    inp = example['question']
    if example.get('context'):
        inp = inp + ' </s> CONTEXT: ' + example['context']
    tgt = example['answer']
    enc_inp = tokenizer(inp, truncation=True, padding='max_length', max_length=MAX_INPUT_LENGTH, return_tensors='np')
    enc_tgt = tokenizer(tgt, truncation=True, padding='max_length', max_length=MAX_TARGET_LENGTH, return_tensors='np')
    # Reshaping labels to have an extra dimension for compatibility with compiled loss
    labels = enc_tgt['input_ids'][0]
    labels = np.expand_dims(labels, axis=-1)
    return {'input_ids': enc_inp['input_ids'][0], 'attention_mask': enc_inp['attention_mask'][0], 'labels': labels}


def tf_dataset_from_jsonl(path, batch_size=8, shuffle=True):
    examples = [json.loads(line) for line in open(path,'r',encoding='utf-8')]
    encs = [encode_example(e) for e in examples]
    def gen():
        for e in encs:
            yield (e['input_ids'], e['attention_mask'], e['labels'])
    ds = tf.data.Dataset.from_generator(gen, output_types=(tf.int32, tf.int32, tf.int32), output_shapes=((MAX_INPUT_LENGTH,),(MAX_INPUT_LENGTH,),(MAX_TARGET_LENGTH, 1)))
    if shuffle:
        ds = ds.shuffle(1024)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


train_ds = tf_dataset_from_jsonl(str(TRAIN_PATH), batch_size=BATCH_SIZE)
for x,y,z in train_ds.take(1):
    print('batch shapes', x.shape, y.shape, z.shape)

Loading tokenizer for google/flan-t5-small
Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead
batch shapes (8, 256) (8, 256) (8, 128, 1)
batch shapes (8, 256) (8, 256) (8, 128, 1)


## Loading FLAN-T5 model for fine-tuning

In [8]:
from transformers import TFAutoModelForSeq2SeqLM
print('Loading TF T5 model for fine-tuning...')
model = TFAutoModelForSeq2SeqLM.from_pretrained(SEQ2SEQ_MODEL, use_safetensors=False)
print('Model loaded')

Loading TF T5 model for fine-tuning...



TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at google/flan-t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Model loaded


## Training the Model (Fine-Tuning)

In [9]:
# compiling  and training
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer) # Remove loss function here

# creating datasets
train_ds = tf_dataset_from_jsonl(str(TRAIN_PATH), batch_size=BATCH_SIZE)
valid_ds = tf_dataset_from_jsonl(str(VALID_PATH), batch_size=BATCH_SIZE, shuffle=False)

# callbacks
ckpt_dir = Path(NOTEBOOK_DIR)/'tf_ckpt'
ckpt_dir.mkdir(parents=True, exist_ok=True)
cp_cb = tf.keras.callbacks.ModelCheckpoint(str(ckpt_dir/ 'best_model'), save_weights_only=False, save_best_only=True, monitor='val_loss')
early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2)

history = model.fit(train_ds, validation_data=valid_ds, epochs=EPOCHS, callbacks=[cp_cb, early])

# saving final model
model.save_pretrained(Path(NOTEBOOK_DIR)/'fine_tuned_t5')
print('training complete')

Epoch 1/3

INFO:tensorflow:Assets written to: \mnt\data\budgetbuddy_notebook\tf_ckpt\best_model\assets


INFO:tensorflow:Assets written to: \mnt\data\budgetbuddy_notebook\tf_ckpt\best_model\assets


Epoch 2/3




INFO:tensorflow:Assets written to: \mnt\data\budgetbuddy_notebook\tf_ckpt\best_model\assets


INFO:tensorflow:Assets written to: \mnt\data\budgetbuddy_notebook\tf_ckpt\best_model\assets


Epoch 3/3




INFO:tensorflow:Assets written to: \mnt\data\budgetbuddy_notebook\tf_ckpt\best_model\assets


INFO:tensorflow:Assets written to: \mnt\data\budgetbuddy_notebook\tf_ckpt\best_model\assets


training complete


## Evaluating model performance

In [None]:
import os
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

fine_tuned_path = Path(NOTEBOOK_DIR)/'fine_tuned_t5'
tokenizer_path = SEQ2SEQ_MODEL
model_path = str(fine_tuned_path) if fine_tuned_path.exists() else SEQ2SEQ_MODEL


# Loading tokenizer
print(f"Loading tokenizer from: {tokenizer_path}")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# Loading model
print(f"Loading model from: {model_path}")
model_for_eval = TFAutoModelForSeq2SeqLM.from_pretrained(model_path)


scorer = rouge_scorer.RougeScorer(['rouge1','rouge2','rougeL'], use_stemmer=True)

def generate_answer_tf(question, context=''):
    inp = question
    if context:
        inp = inp + ' </s> CONTEXT: ' + context
    enc = tokenizer(inp, return_tensors='tf', truncation=True, padding='max_length', max_length=MAX_INPUT_LENGTH)
    out = model_for_eval.generate(enc['input_ids'], attention_mask=enc['attention_mask'], max_length=MAX_TARGET_LENGTH)
    return tokenizer.decode(out[0], skip_special_tokens=True)

# Evaluating on validation set
refs = []
hyps = []
for line in open(str(VALID_PATH),'r',encoding='utf-8'):
    ex = json.loads(line)
    pred = generate_answer_tf(ex['question'], ex.get('context',''))
    refs.append(ex['answer'])
    hyps.append(pred)

bleu = corpus_bleu(hyps, [refs])
rouges = [scorer.score(r,h) for r,h in zip(refs,hyps)]
avg_rouge = {k: sum(s[k].fmeasure for s in rouges)/len(rouges) for k in ['rouge1','rouge2','rougeL']}
print('BLEU:', bleu.score, 'ROUGE:', avg_rouge)

Loading tokenizer from: google/flan-t5-small
Loading model from: \mnt\data\budgetbuddy_notebook\fine_tuned_t5


All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at \mnt\data\budgetbuddy_notebook\fine_tuned_t5.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


BLEU: 0.04428722916403619 ROUGE: {'rouge1': 0.011589968346725104, 'rouge2': 0.005702405702405702, 'rougeL': 0.011589968346725104}


## Building FAISS Vector database for RAG

In [None]:
# Building FAISS index from transactions for RAG
sbert = SentenceTransformer(EMB_MODEL)
texts = []
metas = []
for idx,row in df.iterrows():
    date_str = str(row['Date']) if not pd.isna(row['Date']) else ''
    txt = f"DATE: {date_str} AMOUNT: {row['Amount']:.2f} CATEGORY: {row.get('Category','Unknown')} DESCRIPTION: {row.get('Description_clean','')}"
    texts.append(txt)
    metas.append({'idx': int(idx), 'date': date_str, 'amount': float(row['Amount']), 'category': row.get('Category','Unknown')})
embs = sbert.encode(texts, convert_to_numpy=True, show_progress_bar=True)
import faiss

faiss.normalize_L2(embs)
index = faiss.IndexFlatIP(embs.shape[1])
index.add(embs)
# saving index and metadata
faiss.write_index(index, str(Path(NOTEBOOK_DIR)/'transactions.faiss'))
with open(Path(NOTEBOOK_DIR)/'faiss_meta.pkl','wb') as f:
    pickle.dump({'texts':texts,'metas':metas}, f)
print('FAISS index saved')

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 26/26 [00:06<00:00,  3.73it/s]


FAISS index saved


## Testing RAG retrieval system

In [None]:
import faiss
index = faiss.read_index(str(Path(NOTEBOOK_DIR)/'transactions.faiss'))
with open(Path(NOTEBOOK_DIR)/'faiss_meta.pkl','rb') as f:
    meta = pickle.load(f)
texts = meta['texts']
metas = meta['metas']

def retrieve_topk(query, k=5):
    q_emb = sbert.encode([query], convert_to_numpy=True)
    faiss.normalize_L2(q_emb)
    D,I = index.search(q_emb, k)
    results = []
    for score, i in zip(D[0], I[0]):
        results.append({'score': float(score), 'text': texts[i], 'meta': metas[i]})
    return results

# RAG generation using fine-tuned T5 

def rag_answer(question, k=4):
    retrieved = retrieve_topk(question, k=k)
    context = '\n'.join([r['text'] for r in retrieved])
    ans = generate_answer_tf(question, context)
    return ans, retrieved

# testing
print(rag_answer('Why did my spending increase last month?'))

('a biweekly paycheck', [{'score': 0.2846578359603882, 'text': 'DATE: 2019-03-26 00:00:00 AMOUNT: 16.04 CATEGORY: Shopping DESCRIPTION: amazon', 'meta': {'idx': 573, 'date': '2019-03-26 00:00:00', 'amount': 16.04, 'category': 'Shopping'}}, {'score': 0.27761968970298767, 'text': 'DATE: 2018-04-01 00:00:00 AMOUNT: 13.13 CATEGORY: Shopping DESCRIPTION: amazon', 'meta': {'idx': 110, 'date': '2018-04-01 00:00:00', 'amount': 13.13, 'category': 'Shopping'}}, {'score': 0.27242588996887207, 'text': 'DATE: 2019-05-09 00:00:00 AMOUNT: 3.20 CATEGORY: Shopping DESCRIPTION: target', 'meta': {'idx': 625, 'date': '2019-05-09 00:00:00', 'amount': 3.2, 'category': 'Shopping'}}, {'score': 0.27115923166275024, 'text': 'DATE: 2018-04-13 00:00:00 AMOUNT: 2000.00 CATEGORY: Paycheck DESCRIPTION: biweekly paycheck', 'meta': {'idx': 130, 'date': '2018-04-13 00:00:00', 'amount': 2000.0, 'category': 'Paycheck'}}])


## Creating rule-based query handler

In [13]:
import dateparser

def parse_time_from_text(text):
    dt = dateparser.parse(text, settings={'PREFER_DATES_FROM':'past'})
    if dt:
        return dt.year, dt.month
    m = re.search(r'(20\d{2})', text)
    if m:
        return int(m.group(1)), None
    return None, None


def structured_compute_spend(category=None, year=None, month=None):
    df2 = df.copy()
    if category:
        df2 = df2[df2['Category'].str.lower()==category.lower()]
    if year:
        df2 = df2[df2['Year']==int(year)]
    if month:
        df2 = df2[df2['Month']==int(month)]
    total = df2['Amount'].sum()
    return round(float(total),2), df2

# Example
print(structured_compute_spend(category='Entertainment', year=2018))

(9.62,           Date    Description  Amount Transaction Type       Category  \
192 2018-05-28  Movie Theater    9.62            debit  Entertainment   

    Account Name  SignedAmount Description_clean  Year  Month  
192  Silver Card         -9.62     movie theater  2018      5  )


## Installing additional dependency (dateparser)

In [14]:
!pip install -q dateparser

ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'C:\\Python312\\Scripts\\dateparser-download.exe' -> 'C:\\Python312\\Scripts\\dateparser-download.exe.deleteme'


[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: C:\Python312\python.exe -m pip install --upgrade pip


## Building hybrid Question-Answering Pipeline

In [None]:
def detect_aggregate_question(text):
    return bool(re.search(r'\b(how much|total|sum|spent on|spent)\b', text.lower()))


def answer_pipeline(question):
    
    if detect_aggregate_question(question):
        cats = df['Category'].dropna().unique().tolist()
        found_cat = None
        for c in cats:
            if c.lower() in question.lower():
                found_cat = c
                break
        year, month = parse_time_from_text(question)
        total, subset = structured_compute_spend(category=found_cat, year=year, month=month)
        if subset.empty:
            ans, prov = rag_answer(question)
            return ans, prov
        extra = ''
        if found_cat:
            top_desc = subset['Description_clean'].value_counts().idxmax() if not subset['Description_clean'].empty else ''
            extra = f', mostly on {top_desc}' if top_desc else ''
            sentence = f"You spent ${total:.2f} on {found_cat}"
            if year and month:
                month_name = datetime(int(year), int(month), 1).strftime('%B') if month else ''
                sentence += f" in {month_name} {year}"
            sentence += extra + '.'
            return sentence, subset.head(5).to_dict(orient='records')
        else:
            sentence = f"You spent ${total:.2f}"
            return sentence, subset.head(5).to_dict(orient='records')
    else:
        ans, retrieved = rag_answer(question)
        return ans, retrieved

# test
print(answer_pipeline('How much did I spend on Entertainment in May 2018?'))
print(answer_pipeline('Why did my spending increase last month?'))

('You spent $9.62 on Entertainment, mostly on movie theater.', [{'Date': Timestamp('2018-05-28 00:00:00'), 'Description': 'Movie Theater', 'Amount': 9.62, 'Transaction Type': 'debit', 'Category': 'Entertainment', 'Account Name': 'Silver Card', 'SignedAmount': -9.62, 'Description_clean': 'movie theater', 'Year': 2018, 'Month': 5}])
('a biweekly paycheck', [{'score': 0.2846578359603882, 'text': 'DATE: 2019-03-26 00:00:00 AMOUNT: 16.04 CATEGORY: Shopping DESCRIPTION: amazon', 'meta': {'idx': 573, 'date': '2019-03-26 00:00:00', 'amount': 16.04, 'category': 'Shopping'}}, {'score': 0.27761968970298767, 'text': 'DATE: 2018-04-01 00:00:00 AMOUNT: 13.13 CATEGORY: Shopping DESCRIPTION: amazon', 'meta': {'idx': 110, 'date': '2018-04-01 00:00:00', 'amount': 13.13, 'category': 'Shopping'}}, {'score': 0.27242588996887207, 'text': 'DATE: 2019-05-09 00:00:00 AMOUNT: 3.20 CATEGORY: Shopping DESCRIPTION: target', 'meta': {'idx': 625, 'date': '2019-05-09 00:00:00', 'amount': 3.2, 'category': 'Shopping'}}

## Copying model files to deployment folder

In [None]:
import shutil

# Creating organized directory structure
saved_models_dir = Path('./saved_models')
saved_models_dir.mkdir(exist_ok=True)

local_model_dir = saved_models_dir / 'fine_tuned_t5'
notebook_model_dir = Path(NOTEBOOK_DIR) / 'fine_tuned_t5'

# Copying model files
if notebook_model_dir.exists():
    print("Copying fine-tuned model to saved_models/...")
    shutil.copytree(notebook_model_dir, local_model_dir, dirs_exist_ok=True)
    print(f"✓ Model copied to {local_model_dir}")
else:
    print(f"⚠ Model not found at {notebook_model_dir}")

# Copying FAISS index and metadata to saved_models folder
faiss_source = Path(NOTEBOOK_DIR) / 'transactions.faiss'
faiss_meta_source = Path(NOTEBOOK_DIR) / 'faiss_meta.pkl'

if faiss_source.exists():
    shutil.copy(str(faiss_source), str(saved_models_dir / 'transactions.faiss'))
    print(f"✓ FAISS index copied to {saved_models_dir / 'transactions.faiss'}")
else:
    print(f"⚠ FAISS index not found at {faiss_source}")

if faiss_meta_source.exists():
    shutil.copy(str(faiss_meta_source), str(saved_models_dir / 'faiss_meta.pkl'))
    print(f"✓ FAISS metadata copied to {saved_models_dir / 'faiss_meta.pkl'}")
else:
    print(f"⚠ FAISS metadata not found at {faiss_meta_source}")

print("\n All files organized in saved_models/ folder!")
print("\nDirectory structure:")
print("saved_models/")
print("├── fine_tuned_t5/        (T5 model files)")
print("├── transactions.faiss    (FAISS vector index)")
print("└── faiss_meta.pkl        (Transaction metadata)")

Copying fine-tuned model to saved_models/...
✓ Model copied to saved_models\fine_tuned_t5
✓ FAISS index copied to saved_models\transactions.faiss
✓ FAISS metadata copied to saved_models\faiss_meta.pkl

 All files organized in saved_models/ folder!

Directory structure:
saved_models/
├── fine_tuned_t5/        (T5 model files)
├── transactions.faiss    (FAISS vector index)
└── faiss_meta.pkl        (Transaction metadata)
