In [1]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [99]:
import requests
from bs4 import BeautifulSoup

# URL of the webpage you want to scrape
url = 'https://www.mrmoneymustache.com/2011/05/18/how-to-make-money-in-the-stock-market/'

# Send a GET request to the webpage
response = requests.get(url)

# Initialize an empty string to hold all the text
all_text = ""

# Check if the request was successful
if response.status_code == 200:
    # Parse the content of the request with BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Extract all text within paragraph tags
    paragraphs = soup.find_all('p')
    for paragraph in paragraphs:
        # Add each paragraph's text to the all_text string with a space in between
        all_text += paragraph.text.strip() + " "
    
    # Optional: print the complete concatenated text
    #print(all_text)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")


In [100]:
import ollama

In [135]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
max_tokens = 1024  # Adjust based on your model's max context length
overlap = max_tokens // 4  # Calculate overlap as half of max_tokens

def chunk_text(text, max_tokens, overlap):
    tokens = tokenizer.encode(text)
    chunks = []
    for i in range(0, len(tokens), overlap):
        chunk = tokens[i:i + max_tokens]
        chunks.append(chunk)
        if len(chunk) < max_tokens:
            break
    return [tokenizer.decode(chunk) for chunk in chunks]

text = all_text
chunks = chunk_text(text, max_tokens, overlap)

Token indices sequence length is longer than the specified maximum sequence length for this model (20237 > 1024). Running this sequence through the model will result in indexing errors


In [6]:
from string import Template

In [7]:
prompt_creator = Template(""" You are a curriculum creator. You teach others by creating question and answer pairs from documents. Your focus is distilling
pure knowledge by extracting generalisable, deep underlying principles, this means that you ignore information that is only relevant to a 
particular document, and focus on knowledge contained or described in the text you read. 

Your task is to teach about financial advice, and you extract key pieces of information that are underlying principles on personal finance, financial advice, the economy and life philosophy in general.  

You care about definitions, principles, rules, ways of thinking and ways of approaching and solving problems.

Please create question and answer pairs from the following excerpt.
**Document:**
$text
""")

In [8]:
prompt_supervisor = Template("""You are a content supervisor that decides on a question answer pair currciculum for a personal finance course. 
You have just received a series of question answer pairs from a document.

Your task is to review them, accepting them or improving them. 

You first will provide you reasoning, and improved version if necessary.

You will then respond with a new section, titled "Question Answer Pairs", where you will place the final version of these.

**Document**
$text

**Question answer pairs:**
$previous
""")

In [61]:
prompt_extractor = Template("""Please extract the final relevant question answer pairs into a list of dictionaries that have the form:
[{{'question':question_here, 'answer':answer_here}}, {{'question':question2_here, 'answer':answer2_here}}]

You do not reply anything else, as that would make the list unparsable and your output useless.

**Question answer pairs:**
$previous
""")

In [63]:
prompts = [prompt_creator, prompt_supervisor, prompt_extractor]

In [65]:
def get_ollama_response(prompt):
    response = ollama.chat(model='llama3', messages=[
          {
            'role': 'user',
            'content': prompt,
          },
        ])

    return response['message']['content']

In [67]:
import string
import ast

In [69]:
def pipeline(text, prompts):

    previous = get_ollama_response(prompts[0].substitute(text=text))
    previous = get_ollama_response(prompts[1].substitute(text=text, previous=previous))
    previous = get_ollama_response(prompts[2].substitute(text=text, previous=previous))

    #return previous 
    
    try:
        previous = ast.literal_eval(previous)
    except:
        previous = None
        
    return previous

In [71]:
chunks[0]

' Okay, admittedly my title for this article sounds like something that you might see in your Spam folder. But it’s also completely accurate, because I really can teach you the best way to make money from the stock market, for life, all in one short blog post. Okay, I admit it – this is widely available information: I am going to hand out some advice that has been handed out widely before, for many years now. But the reason I’m still writing is that ignorance still seems to be widespread. Almost nobody I meet in day-to-day life knows anything about investing, the stock market, or big publicly-traded companies in general. Their opinions on the subject range throughout boredom, fear, mistrust, and if they are lucky, curiosity. Or if they are unlucky, bold confidence in their abilities to drastically “beat the market” with their intuition. Here are three real quotes I have heard from friends over time when discussing the stock market. “Stocks are just a big roulette wheel.. You can’t go o

In [73]:
pipeline(chunks[0], prompts)

[{'question': "What is the author's approach to financial advice?",
  'answer': 'The author takes an educational approach, aiming to teach readers about investing and the stock market in a way that is accessible and actionable.'},
 {'question': 'What are some common misconceptions people have about the stock market?',
  'answer': 'According to the author, people often misunderstand the nature of the system, leading to fears and misconceptions about investing.'},
 {'question': 'What is the key to successful investing?',
  'answer': "The author emphasizes the importance of knowing the nature of the market to gain confidence in one's investment decisions."},
 {'question': 'What specific financial benefits can one expect from understanding investing principles?',
  'answer': 'Understanding investing principles can help individuals secure their own retirement and build a stream of lifetime income with minimal ongoing effort.'},
 {'question': "What is the author's purpose in writing about fi

In [81]:
%%time

total_list = []

for chunk in chunks:
    output = pipeline(chunk, prompts)
    if output:
       total_list.append(output)
    else:
        output = pipeline(chunk, prompts)
        if output:
           total_list.append(output)
        else:
            pass

CPU times: user 438 ms, sys: 41.9 ms, total: 480 ms
Wall time: 18min 15s


In [87]:
len(total_list)

75

In [379]:
chunks[3]

' with the lowest fees, you automatically win. Endless statistical analysis proves this again and again. If you don’t believe me, read the book “A Random Walk Down Wall Street, or look up the topic of John Bogle / Bogleheads / and the foundation of the Vanguard company itself. But my uncle bought some stocks once and sold at a big profit! Also, if index funds really are the statistically best bet, why are there still thousands of brand-name mutual funds and hotshot traders out there? For the same reason that Las Vegas still exists and people still drive SUVs. Humans are irrational creatures and it is scientifically proven that we overestimate our own investment (and gambling) abilities, and no presentation of knowledge to the affected people can completely erase this. I have some perfectly intelligent friends who still believe they are “lucky” at games of chance, even though any scientist in the world can quickly run an experiment to irrefutably disprove the existence of any form of lu

In [297]:
%%time
response = ollama.chat(model='llama3', messages=[
  {
    'role': 'user',
    'content': prompt,
  },
])
print(response['message']['content'])

Here are the question and answer pairs extracted from the excerpt:

**Q1: What is a stock?**
A1: A stock is a slice of a company that you truly own, giving you the right to attend shareholder meetings, vote on important decisions, and receive a share of future earnings (dividends).

**Q2: What is a dividend?**
A2: A dividend is a share of a company's earnings paid out to shareholders.

**Q3: Why do companies reinvest dividends instead of paying them out?**
A3: Companies may reinvest dividends to help the company grow its earnings even faster, potentially leading to more future dividends.

**Q4: What determines the value of a stock?**
A4: The true value of a stock is based on the amount of dividends it will eventually pay shareholders over time, which depends on how much money the underlying company will make.

**Q5: Why do stocks go up and down so much?**
A5: Stocks fluctuate because millions of investors and analysts have differing opinions about how much money companies will make in 

In [299]:
%%time
response = ollama.chat(model='llama3', messages=[
  {
    'role': 'user',
    'content': prompt,
  },
])
print(response['message']['content'])

As a content supervisor for a personal finance course, my task is to review these question-answer pairs and provide feedback on their quality, accuracy, and relevance to the topic. Here's my review:

**Q1: What is a stock?**
A1: A stock is a slice of a company that you truly own, giving you the right to attend shareholder meetings, vote on important decisions, and receive a share of future earnings (dividends).

Reasoning: This question-answer pair is well-written and accurate. The answer provides a clear explanation of what a stock is, including its rights and benefits.

**Q2: What is a dividend?**
A2: A dividend is a share of a company's earnings paid out to shareholders.

Reasoning: This question-answer pair is also well-written and accurate. The answer concisely defines what a dividend is.

**Q3: Why do companies reinvest dividends instead of paying them out?**
A3: Companies may reinvest dividends to help the company grow its earnings even faster, potentially leading to more future

In [301]:
%%time
response = ollama.chat(model='llama3', messages=[
  {
    'role': 'user',
    'content': prompt,
  },
])
print(response['message']['content'])

[
{'question': 'What is a stock?', 'answer': 'A stock is a slice of a company that you truly own, giving you the right to attend shareholder meetings, vote on important decisions, and receive a share of future earnings (dividends).'},
{'question': 'What is a dividend?', 'answer': 'A dividend is a share of a company\'s earnings paid out to shareholders.'},
{'question': 'Why do companies reinvest dividends instead of paying them out?', 'answer': 'Companies may reinvest dividends to help the company grow its earnings even faster, potentially leading to more future dividends.'},
{'question': 'What determines the value of a stock?', 'answer': 'The true value of a stock is based on the amount of dividends it will eventually pay shareholders over time, which depends on how much money the underlying company will make.'},
{'question': 'Why do stocks go up and down so much?', 'answer': 'Stocks fluctuate because millions of investors and analysts have differing opinions about how much money compa

In [240]:
import torch

In [203]:
torch.cuda.is_available()

True

In [89]:
pip install PyMuPDF


Note: you may need to restart the kernel to use updated packages.


In [137]:
import fitz

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        full_text += page.get_text()
    return full_text

# Path to your PDF file
pdf_path = "/home/giskard/Downloads/text.pdf"

# Extract text from the PDF
pdf_text = extract_text_from_pdf(pdf_path)

# Chunk the extracted text
chunks = chunk_text(pdf_text, max_tokens, overlap)

In [138]:
len(chunks)

961

In [141]:
%%time

total_list = []

for chunk in chunks:
    output = pipeline(chunk, prompts)
    if output:
       total_list.append(output)
    else:
        output = pipeline(chunk, prompts)
        if output:
           total_list.append(output)
        else:
            pass

CPU times: user 5.2 s, sys: 277 ms, total: 5.48 s
Wall time: 3h 39min 39s


In [145]:
len(total_list)

918

In [149]:
total_list[546]

[{'question': 'How can I make informed investment decisions using historical data?',
  'answer': 'Historical data provides valuable insights into market trends, allowing you to make informed decisions about your investments. For example, analyzing past market fluctuations can help you identify patterns and make more educated choices.'},
 {'question': 'What are the implications of exchange rates on my international trade or investment activities?',
  'answer': 'Exchange rates affect the competitiveness of exports, the attractiveness of imports, and the value of investments in foreign markets. Understanding these factors is crucial for making informed decisions about global transactions.'},
 {'question': 'Why is it essential to use standardized accounting practices (SNA) when analyzing financial data?',
  'answer': 'SNA provides a framework for presenting economic data in a consistent manner, allowing for accurate comparison across countries and over time. This consistency ensures that f

In [151]:
import json

# Specify the file name
file_name = 'data.json'

# Save the list of dictionaries to a JSON file
with open(file_name, 'w') as json_file:
    json.dump(total_, json_file, indent=4)

NameError: name 'data' is not defined