<a href="https://colab.research.google.com/github/pidoxy/pipeops-hackathon/blob/main/PreProcessFinanceBotText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install --upgrade torch



In [2]:
pip install spacy



In [3]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import nltk
import string
from nltk.corpus import stopwords
import spacy
from spacy.lang.en import English

# Cleaning the text:

Using a programming language like Python and libraries like NLTK or spaCy to remove punctuation, special characters, unnecessary formatting, and stop words (common words like "the", "a", "and").

In [5]:

# Download stopwords if not already downloaded
nltk.download('stopwords')
nltk.download('punkt')



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Sample text from "Rich Dad, Poor Dad"
text = """
This book is about financial literacy. It's not about getting rich quick,
but about building a foundation for financial freedom.
"""

# 1. Remove punctuation
text = text.translate(str.maketrans('', '', string.punctuation))

# 2. Remove special characters
text = ''.join(char for char in text if char.isalnum() or char.isspace())

# 3. Tokenize (break into words)
tokens = nltk.word_tokenize(text)

# 4. Remove stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

# 5. Join the filtered tokens back into a string
cleaned_text = ' '.join(filtered_tokens)

print(f"Original Text:\n{text}")
print(f"\nCleaned Text:\n{cleaned_text}")

Original Text:

This book is about financial literacy Its not about getting rich quick
but about building a foundation for financial freedom


Cleaned Text:
book financial literacy getting rich quick building foundation financial freedom


In [7]:
# from google.colab import files

In [8]:
# uploaded = files.upload()

 ## Read Book Text from Files:

In [9]:
# Download resources if necessary
nltk.download('stopwords')
nltk.download('punkt')
spacy.load("en_core_web_sm")  # Load the small English model

# Read the book text from files
with open("rich_dad_poor_dad.txt", "r", encoding="utf-8") as f:
    rich_dad_text = f.read()

with open("why_c_students_work_for_a_students.txt", "r", encoding="utf-8") as f:
    c_students_text = f.read()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Clean the Text (Using NLTK):

In [10]:
def clean_text_nltk(text):
    """Cleans text using NLTK."""
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    cleaned_text = ' '.join(filtered_tokens)
    return cleaned_text

# Clean the book texts
cleaned_rich_dad_text = clean_text_nltk(rich_dad_text)
cleaned_c_students_text = clean_text_nltk(c_students_text)

## Clean the Text (Using spaCy):

In [11]:
# def clean_text_spacy(text):
#     """Cleans text using spaCy."""
#     nlp = spacy.load("en_core_web_sm")
#     doc = nlp(text)
#     tokens = [token.text for token in doc if token.is_alpha or token.is_space]
#     stop_words = nlp.Defaults.stop_words
#     filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
#     cleaned_text = ' '.join(filtered_tokens)
#     return cleaned_text

# # Clean the book texts
# cleaned_rich_dad_text = clean_text_spacy(rich_dad_text)
# cleaned_c_students_text = clean_text_spacy(c_students_text)

## Tokenization and Lemmatization (NLTK):

In [12]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [13]:


import spacy
from spacy.lang.en import English

# ... (Previous code for reading and cleaning book text) ...

def tokenize_and_lemmatize_spacy(text):
    """Tokenizes and lemmatizes text using spaCy."""
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return lemmas

# Tokenize and lemmatize the book texts
lemmas_rich_dad = tokenize_and_lemmatize_spacy(cleaned_rich_dad_text)
lemmas_c_students = tokenize_and_lemmatize_spacy(cleaned_c_students_text)

In [14]:

import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import Trainer, TrainingArguments
import torch

In [15]:
nltk.download('wordnet')

# very important to download


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
!pip install transformers[torch]



In [17]:
!pip install accelerate -U



## Creating Grammar

In [18]:
# 1. Data Preparation
# - Assuming you have your cleaned and lemmatized text from the books in variables like `cleaned_rich_dad_text` and `cleaned_c_students_text`

def create_vocabulary(text):
    """Creates a vocabulary from the text."""
    tokens = nltk.word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    vocabulary = set(lemmas)
    return vocabulary

# Create vocabularies for both books
rich_dad_vocabulary = create_vocabulary(cleaned_rich_dad_text)
c_students_vocabulary = create_vocabulary(cleaned_c_students_text)



In [19]:
# 2. Fine-tuning Gemini
# - Choose a suitable Gemini model (e.g., "google/flan-t5-xl")
# - Load the model and tokenizer
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# - Create a training dataset of question-answer pairs
#   - You can manually create these pairs from the books or use automatic question generation techniques
#   - Example: [(question1, answer1), (question2, answer2), ...]
training_data = [
    # Add your question-answer pairs here
    ("What is the key concept of Rich Dad, Poor Dad?", "The key concept is that financial literacy is crucial for building wealth and achieving financial freedom."),
    # ... more pairs
]



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [20]:
!apt install accelerate

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
[1;31mE: [0mUnable to locate package accelerate[0m


In [21]:
def preprocess_data(data):
    """Preprocess training data."""
    processed_data = []
    for question, answer in data:
        input_ids = tokenizer(question, return_tensors="pt")["input_ids"].squeeze() # Add .squeeze() to remove unnecessary dimensions
        target_ids = tokenizer(answer, return_tensors="pt")["input_ids"].squeeze()
        processed_data.append({"input_ids": input_ids, "labels": target_ids})
    return processed_data

training_data = preprocess_data(training_data)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./fin_intel_model",
    num_train_epochs=3,
    per_device_train_batch_size=8, # reduced to 4 because of space
    learning_rate=2e-5,

)

# ... (rest of your training code)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_data, # No need for TensorDataset here
)

# print(target_ids.shape)
trainer.train()



Step,Training Loss


TrainOutput(global_step=3, training_loss=3.2917982737223306, metrics={'train_runtime': 19.8932, 'train_samples_per_second': 0.151, 'train_steps_per_second': 0.151, 'total_flos': 52159251456.0, 'train_loss': 3.2917982737223306, 'epoch': 3.0})

In [22]:
# - Save the fine-tuned model
trainer.save_model("./fin_intel_model")



In [23]:
# 3. Building the Chatbot
# - Choose a chatbot platform
# - Example using Flask (for a web application)
from flask import Flask, request, jsonify
app = Flask(__name__)

In [25]:


# - Load the fine-tuned Gemini model
model = AutoModelForSeq2SeqLM.from_pretrained("./fin_intel_model")


@app.route('/')
def index():
    return 'Index Page'

@app.route('/chat', methods=['POST'])
def chat():
    try:
      # Get the user's question from the request
      user_question = request.json['question']

      # Use the model to generate an answer
      inputs = tokenizer(user_question, return_tensors="pt")
      output = model.generate(**inputs)
      answer = tokenizer.decode(output[0], skip_special_tokens=True)

      return jsonify({'answer': answer})

    except KeyError:
        return jsonify({'error': 'Missing "question" in the request'}), 400
    except Exception as e:
        return jsonify({'error': str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True, port=5003)

# 4. Evaluation and Improvement
# - Test the chatbot with various questions
# - Gather feedback from users
# - Iteratively improve the chatbot based on feedback, new data, and fine-tuning

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5003
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat


In [None]:
test_data = [
    ("What is the difference between assets and liabilities in Rich Dad, Poor Dad?", "According to Rich Dad, Poor Dad, assets put money in your pocket, while liabilities take money out of your pocket. Assets are things like businesses, real estate, and stocks, while liabilities are things like cars, credit card debt, and mortgages."),
    ("What does Robert Kiyosaki teach about working for money?", "Robert Kiyosaki argues that working for money is a trap and that true wealth comes from owning assets that generate income."),
    ("How can I learn financial literacy?", "Financial literacy can be learned through books, courses, seminars, and online resources."),
    # ... Add more test questions ...
]

In [None]:
# from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
# from flask import Flask, request, jsonify

# app = Flask(__name__)

# # Load your fine-tuned model
# model = AutoModelForSeq2SeqLM.from_pretrained("./fin_intel_model")
# tokenizer = AutoTokenizer.from_pretrained("./fin_intel_model")

# def evaluate_chatbot(test_data):
#     """Evaluates the chatbot on a test set."""
#     correct_answers = 0
#     total_questions = len(test_data)

#     for question, expected_answer in test_data:
#         # Get the chatbot's response
#         inputs = tokenizer(question, return_tensors="pt")
#         output = model.generate(**inputs)
#         answer = tokenizer.decode(output[0], skip_special_tokens=True)

#         # Compare the chatbot's answer to the expected answer
#         if answer.lower() == expected_answer.lower():  # Case-insensitive comparison
#             correct_answers += 1

#     accuracy = correct_answers / total_questions
#     print(f"Accuracy: {accuracy:.2f}")

# # Example usage
# evaluate_chatbot(test_data)