In [None]:
%pip install psycopg2

In [3]:
import psycopg2
import psycopg2.extras
import json
import gzip
import csv
import os
import logging
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
from collections import defaultdict

import io
from datetime import datetime
import time
import pprint

#### Setting up of TimescaleDB

In [4]:
DB_USER = "postgres"
DB_PASSWORD = "password"
DB_HOST = "localhost"
DB_NAME = "postgres"

CONNECTION_URL = f"postgres://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}"

In [5]:
def db_exec(sql, log="SQL executed successfully."):
    try:
        with psycopg2.connect(CONNECTION_URL) as conn:
            cursor = conn.cursor()
            cursor.execute(sql)
        conn.commit()
        print(f"{log}")
    except Exception as e:
        logging.error('Error at %s', 'division', exc_info=e)
        print(f"\nSQL executed unsuccessfully")

In [6]:
tables = ["industry", "symbol", "ohlc"]
for table in tables:
    db_exec(f"DROP TABLE IF EXISTS {table} CASCADE", f"{table.upper()} table dropped successfully.")


INDUSTRY table dropped successfully.
SYMBOL table dropped successfully.
OHLC table dropped successfully.


In [7]:
create_ohlc_table = """CREATE TABLE IF NOT EXISTS ohlc (
    symbol VARCHAR(10) NOT NULL,
    industry VARCHAR(50),
    timestamp TIMESTAMP NOT NULL,
    open DECIMAL(10,2),
    high DECIMAL(10,2),
    low DECIMAL(10,2),
    close DECIMAL(10,2)
);
"""
db_exec(create_ohlc_table, "OHLC table created successfully")

OHLC table created successfully


#### TimescaleDB - Load OHLC Data

In [4]:
BASE_DIR = "./dataset"

In [9]:
PROFILE_PATH = os.path.join(BASE_DIR, "profile_estimate", "profile.json")

def extract_symbol_industry():
    symbol_industry = defaultdict(str)
    with open(PROFILE_PATH, "r") as file:
        company_profiles = json.load(file)
        for symbol, profile in company_profiles.items():
            symbol_industry[symbol] = profile[0]["industry"]
    return symbol_industry

In [10]:
OHLC_DIR = os.path.join(BASE_DIR, "OHLC")

symbol_industry = extract_symbol_industry()

def insert_ohlc(symbol):
    symbol_path = os.path.join(OHLC_DIR, symbol)
    new_keys = {"symbol": symbol, "industry": symbol_industry[symbol]}

    ohlc_buffer = io.StringIO()
    
    for date in os.listdir(symbol_path):
        file_path = os.path.join(symbol_path, date)
        with gzip.open(file_path, "rt") as file:
            reader = csv.DictReader(file, delimiter=",")
            for entry in reader:
                row = (
                    f"{new_keys['symbol']},"
                    f"{new_keys['industry']},"
                    f"{datetime.fromtimestamp(int(entry['timestamp']))},"
                    f"{float(entry['open'])},"
                    f"{float(entry['high'])},"
                    f"{float(entry['low'])},"
                    f"{float(entry['close'])}\n"
                )
                ohlc_buffer.write(row)

    ohlc_buffer.seek(0)

    with psycopg2.connect(CONNECTION_URL) as conn:
        with conn.cursor() as cursor:
            cursor.copy_from(
                ohlc_buffer,
                "ohlc",
                sep=",",
                columns=["symbol", "industry", "timestamp", "open", "high", "low", "close"]
            )
        conn.commit()


# While the code below works, it takes significantly longer
# OHLC_DIR = os.path.join(BASE_DIR, "OHLC")

# symbol_industry = extract_symbol_industry()

# def insert_ohlc(symbol):
#     symbol_path = os.path.join(OHLC_DIR, symbol)
#     new_keys = {"symbol": symbol, "industry": symbol_industry[symbol]}
#     for date in os.listdir(symbol_path):
#         ohlc_data = []
#         file_path = os.path.join(symbol_path, date)
#         with gzip.open(file_path, "rt") as file:
#             ohlc_data.extend(csv.DictReader(file, delimiter=","))
        
#         ohlc_data = [{**entry, **new_keys} for entry in ohlc_data]
        
#         with psycopg2.connect(CONNECTION_URL) as conn:
#             cursor = conn.cursor()
#             psycopg2.extras.execute_batch(
#                 cursor,
#                 "INSERT INTO ohlc (symbol, industry, timestamp, open, high, low, close) VALUES (%s, %s, TO_TIMESTAMP(%s), %s, %s, %s, %s)",
#                 [(entry["symbol"], entry["industry"], int(entry["timestamp"]), float(entry["open"]), float(entry["high"]), float(entry["low"]), float(entry["close"])) for entry in ohlc_data],
#                 page_size=20000  # Inserts in batches of 1000
#             )
#         conn.commit()

# insert_ohlc("AAPL") # page_size makes little to no diff, takes around 1 min

In [None]:
# uses multiprocessing to parallelize bulk insertions
def mp_insert_ohlc():
    start_time = time.time()
    ohlc_symbols = os.listdir(OHLC_DIR)
    
    with ThreadPoolExecutor() as executor:
        executor.map(insert_ohlc, ohlc_symbols)
    end_time = time.time()
    print(f"Time taken to insert all OHLC data: {end_time - start_time} seconds")

mp_insert_ohlc() # 16 seconds for AMZN and AAPL

Time taken to insert all OHLC data: 16.224417448043823 seconds


In [None]:
def create_ohlc_hypertable():
    start_time = time.time()
    db_exec("""SELECT create_hypertable('ohlc', by_range('timestamp'), migrate_data => TRUE);""")
    end_time = time.time()
    print(f"Time taken to convert to hypertable (without index): {end_time - start_time} seconds")

create_ohlc_hypertable()

SQL executed successfully.
Time taken to convert to hypertable (without index): 5.520748615264893 seconds


#### HTML Preprocessing

In [18]:
import newspaper

In [39]:
# with open("./dataset/News/AAPL/news/647.html", "r") as file:
#     content = newspaper.fulltext(file.read())
#     print(content)
# newspaper.fulltext()

def extract_text_from_html(file_path):
    try:
        with open(file_path, "r") as file:
            content = newspaper.fulltext(file.read())
        return content
    except:
        return None

as

In [None]:
%pip install --quiet --upgrade qdrant-client[fastembed]
%pip install --quiet --upgrade langchain-qdrant
%pip install --quiet --upgrade langchain-ollama
%pip install --quiet --upgrade langchain-text-splitters langchain-community langgraph

In [15]:
from langchain_ollama import ChatOllama
from langchain_ollama import OllamaEmbeddings
from langchain import hub
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

import newspaper

In [9]:
client = QdrantClient(host="localhost", port=6333)

client.create_collection(
    collection_name="news",
    vectors_config=VectorParams(
        size=3072,
        distance=Distance.COSINE,
    ),
)

True

In [8]:
client.delete_collection("news")

True

In [10]:
chatModel = ChatOllama(
    model = "llama3.2:latest",
    temperature = 0.8,
    num_predict = 256,
)

embeddingModel = OllamaEmbeddings(
    model="llama3.2:latest",
)

vector_store = QdrantVectorStore(
    client=client,
    collection_name="news",
    embedding=embeddingModel,
)

In [None]:
messages = [
    ("system", "You are a helpful translator. Translate the user sentence to French."),
    ("human", "I love programming."),
]
chatModel.invoke(messages)

AIMessage(content="Le programmation est une passion qui m'apaise.\n\n(Note: The translation is in European French, if you want American French, please let me know)\n\nAlternative:\nJ'aime le programmation.\n(J'ai)", additional_kwargs={}, response_metadata={'model': 'llama3.2:latest', 'created_at': '2025-04-04T06:36:16.414689871Z', 'done': True, 'done_reason': 'stop', 'total_duration': 1216699912, 'load_duration': 28549657, 'prompt_eval_count': 42, 'prompt_eval_duration': 259000000, 'eval_count': 45, 'eval_duration': 913000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-41396ad0-ee22-4077-8ffa-f93d7320a4ee-0', usage_metadata={'input_tokens': 42, 'output_tokens': 45, 'total_tokens': 87})

In [None]:
import bs4
from langchain import hub
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


# Load and chunk contents of the blog
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    ),
)
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Index chunks
_ = vector_store.add_documents(documents=all_splits)

# Define prompt for question-answering
prompt = hub.pull("rlm/rag-prompt")


def ask(question: str):
    retrieved_docs = vector_store.similarity_search(question)
    docs_content = "\n\n".join(doc.page_content for doc in retrieved_docs)
    messages = prompt.invoke({"question": question, "context": docs_content})
    response = chatModel.invoke(messages)
    return response.content

print(ask("What is the main topic of the blog?"))



I don't know what the main topic of the blog is, but I can tell you that it appears to be related to Language Modeling and Tool Use, as indicated by the Component Three title. It also mentions MIPS algorithms and performance comparison.


In [None]:
input_text = "The meaning of life is 42"
vector = embeddingModel.embed_query(input_text)
print(vector[:3])

[-0.00018129169, 0.009361812, 0.0051309606]


In [11]:
from tqdm import tqdm

def get_news_metadata(symbol):
    profile_path = os.path.join(BASE_DIR, "News", symbol, "news.json")
    with open(profile_path, "r") as file:
        company_profile = json.load(file)
        transformed_profile = {item["id"]: {"datetime": item["datetime"], "headline": item["headline"], "symbol": item["related"]} for item in company_profile}
        return transformed_profile

def process_news_file(file_path, news_id, news_metadata, symbol):
    with open(file_path, "r") as file:
        content = newspaper.fulltext(file.read())  # Extract full text
    metadata = news_metadata.get(news_id, {"datetime": None, "headline": None, "symbol": symbol})
    return Document(page_content=content, metadata=metadata)


def documentize(symbol):
    symbol_path = os.path.join(BASE_DIR, "News", symbol, "news")
    news_metadata = get_news_metadata(symbol)
    
    file_paths = [os.path.join(symbol_path, fname) for fname in os.listdir(symbol_path)]
    news_ids = [fname.split(".")[0] for fname in os.listdir(symbol_path)]
    
    with ThreadPoolExecutor() as executor:
        results = executor.map(lambda args: process_news_file(*args), zip(file_paths, news_ids, [news_metadata] * len(file_paths), [symbol] * len(file_paths)))

    return list(results)


def chunker(documents, chunk_size=2000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(documents)


def process_symbol(symbol):
    documents = documentize(symbol)
    chunked_docs = chunker(documents=documents)
    vector_store.add_documents(documents=chunked_docs)
    

def enrich_vector_store():
    symbols = os.listdir(os.path.join(BASE_DIR, "News"))

    with ProcessPoolExecutor() as executor:
        results = list(tqdm(executor.map(process_symbol, symbols), total = len(symbols), desc="Overall Progress"))
    print("Finished enriching the vectorstore with news.")

In [16]:
enrich_vector_store()

Overall Progress: 100%|██████████| 2/2 [00:24<00:00, 12.09s/it]

Finished enriching the vectorstore with news.





#### Ragflow Setup

In [2]:
import os
from dotenv import load_dotenv
from ragflow_sdk import RAGFlow

load_dotenv()

RAGFLOW_API_KEY = os.getenv("RAGFLOW_API_KEY")


In [3]:
client = RAGFlow(api_key=RAGFLOW_API_KEY, base_url="http://localhost:9380")

In [None]:
dataset = client.create_dataset(
    name="",
    description="",
)

In [4]:
dataset_list = client.list_datasets(
    page=1, 
    page_size=30, 
    orderby="create_time", 
    desc=True,
    id=None,
    name=None
)

dataset_name = None
dataset_id = None

for dataset in dataset_list:
    print(f"Dataset name is : {dataset.name}")
    print(f"Dataset id is : {dataset.id}")
    dataset_id = dataset.id
    dataset_name = dataset.name

Dataset name is : test
Dataset id is : 31d119e8109e11f0b39c0242ac140006


In [13]:
chat_assistant = client.create_chat(
    name = "Test jelly fish", # Name of the chat assistant
    avatar = "", 
    dataset_ids= [dataset_id], # All the knowledge base id
    llm = None, 
    prompt = None
)

In [14]:
session = chat_assistant.create_session()

<generator object Session.ask at 0x7f7cef34ad60>

In [15]:
question = "Do you know about: The Jellyfish Prince of Saturn. It should be in your knowledge base data"
print("\n==================== Miss R =====================\n")

cont = ""
for ans in session.ask(question, stream=True):
    print(ans.content[len(cont):], end='', flush=True)
    cont = ans.content



<SYSTEM>: Yes, I do have information on "The Jellyfish Prince of Saturn" from my knowledge base:

Document: jellyfish.pdf 
Relevant fragments as following:
ID: 0
The coronation rippled through the gas layers like thunder. Prince Vellior,a bioluminescent jellyfish born in the pressure-slicked oceans of Enceladus, ascended to rule the Ring Court. His crown? A plasma-forged shard of Cassini.Vellior's first act as ruler: banishing the rogue AI starfish that hacked the aurora generators. They retaliated by imploding the Methane Mines on Titan. Saturn's rings shuddered.The Prince issued a royal decree via subsonic pulse: "Peace can only be encoded."
ID: 1
Meanwhile, the Moon Dragons of Hyperion, long exiled, returned with neural flame-holographic fire that burns memory instead of matter. One winged beast erased the Grand Historian's mind inVellior formed a secret alliance with the Coral Syndicate, a psychic reef collective orbiting Rhea. Together, they launched Operation Inkfall-a psychic 

### Workload 1

In [13]:
def fetch_moving_averages(window_size='1 week', symbol='AAPL', ratio_threshold=5):
    query = f"""
    WITH symbol_data AS (
        SELECT symbol, time_bucket(%s, timestamp) AS time_window, AVG(close) AS avg_close
        FROM ohlc
        WHERE timestamp >= '2014-01-01' AND symbol = %s
        GROUP BY symbol, time_window
    ),
    moving_avg AS (
        SELECT symbol, time_window, avg_close, LAG(avg_close) OVER (PARTITION BY symbol ORDER BY time_window) AS prev_avg_close
        FROM symbol_data
    )
    SELECT 
        symbol, 
        time_window, 
        avg_close, 
        prev_avg_close, 
        (avg_close - prev_avg_close) / prev_avg_close * 100 AS percent_change
    FROM moving_avg
    WHERE prev_avg_close IS NOT NULL
    AND ABS((avg_close - prev_avg_close) / prev_avg_close * 100) >= %s
    ORDER BY symbol, time_window;
    """

    start_time = time.time()
    with psycopg2.connect(CONNECTION_URL) as conn:
        with conn.cursor() as cursor:
            cursor.execute(query, (window_size, symbol, ratio_threshold))
            results = cursor.fetchall()
        
    end_time = time.time()
    print(f"Time taken to fetch moving averages: {end_time - start_time} seconds")
    
    return results

In [14]:
query_without_index = fetch_moving_averages(window_size='1 week', symbol='AAPL', ratio_threshold=5)

Time taken to fetch moving averages: 0.3182985782623291 seconds


In [29]:
def create_hypertable_index():
    start_time = time.time()
    db_exec("CREATE INDEX IF NOT EXISTS ohlc_timestamp_idx ON ohlc (timestamp DESC);", "Index created successfully.\n")
    end_time = time.time()
    print(f"Time taken to create index: {end_time - start_time} seconds")

def drop_hypertable_index():
    start_time = time.time()
    db_exec("DROP INDEX IF EXISTS ohlc_timestamp_idx;", "Index dropped successfully.\n")
    end_time = time.time()
    print(f"Time taken to drop index: {end_time - start_time} seconds")

In [35]:
def mv_avg_query_speed():
    create_hypertable_index()
    fetch_moving_averages(window_size='1 week', symbol='AAPL', ratio_threshold=5) # with index
    
    drop_hypertable_index()
    fetch_moving_averages(window_size='1 week', symbol='AAPL', ratio_threshold=5) # without index

mv_avg_query_speed() # for some reason querying without index is faster. Explore more

Index created successfully.

Time taken to create index: 1.1449158191680908 seconds
Time taken to fetch moving averages: 0.3076913356781006 seconds
Index dropped successfully.

Time taken to drop index: 0.08078527450561523 seconds
Time taken to fetch moving averages: 0.24470782279968262 seconds


In [37]:
create_hypertable_index() # reset index

Index created successfully.

Time taken to create index: 1.1028285026550293 seconds


In [None]:
def compress_ohlc(): # dont know if this actually works
    sql = """
    ALTER TABLE ohlc 
    SET (
        timescaledb.compress, 
        timescaledb.compress_segmentby='symbol', 
        timescaledb.compress_orderby='timestamp DESC'
    );
    """
    start_time = time.time()
    db_exec(sql, "Compression settings applied successfully.")
    end_time = time.time()
    print(f"Time taken to set compression settings: {end_time - start_time} seconds")

In [33]:
compress_ohlc()

Compression settings applied successfully.


In [97]:
# Without the use of timescaledb
# def get_moving_averages(symbol, window=1, ratio=5, group_by_industry=False):
#     """
#     Calculates moving averages for a given symbol with an optional industry grouping.

#     Args:
#         symbol (str): Stock symbol to analyze.
#         window (int): Window size in weeks (1 or 2).
#         ratio (int): Significant price change threshold in percentage (±5, ±10, ±15).
#         group_by_industry (bool): If True, groups by industry instead of symbol.

#     Returns:
#         List of tuples: (symbol/industry, window_start, avg_close, prev_avg_close, price_change_percentage)
#     """
#     group_column = "industry" if group_by_industry else "symbol, industry"
#     partition_column = "industry" if group_by_industry else "symbol"
    
#     sql = f"""
#     WITH ohlc_window AS (
#         SELECT 
#             {group_column},
#             DATE_TRUNC('week', timestamp) AS window_start,
#             AVG(close) AS avg_close
#         FROM ohlc
#         WHERE symbol = %s AND timestamp >= '2014-01-01'
#         GROUP BY {group_column}, window_start
#     ),
#     price_changes AS (
#         SELECT 
#             {group_column},
#             window_start,
#             avg_close,
#             LAG(avg_close) OVER (PARTITION BY {partition_column} ORDER BY window_start) AS prev_avg_close
#         FROM ohlc_window
#     )
#     SELECT 
#         {group_column},
#         window_start,
#         avg_close,
#         prev_avg_close,
#         (avg_close - prev_avg_close) / prev_avg_close * 100 AS price_change_percentage
#     FROM price_changes
#     WHERE prev_avg_close IS NOT NULL
#     AND ABS((avg_close - prev_avg_close) / prev_avg_close * 100) >= %s;
#     """

#     with psycopg2.connect(CONNECTION_URL) as conn:
#         with conn.cursor() as cursor:
#             cursor.execute(sql, (symbol, ratio))
#             result = cursor.fetchall()
    
#     return result

# # Example Usage:
# symbol = "AAPL"
# window = 2  # 2-week window
# ratio = 10  # 10% price change threshold
# group_by_industry = False  # Set to True if you want industry-level aggregation

# result = get_moving_averages(symbol, window, ratio, group_by_industry)
# print(result)


### Workload 2

In [104]:
%pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [105]:
from bs4 import BeautifulSoup

In [110]:
BASE_DIR = "./dataset"
SEC_PATH = os.path.join(BASE_DIR, "SEC-Filings")

In [142]:
from bs4 import BeautifulSoup
import os

def extract_net_sales_and_fiscal_date(file_path):
    # Open and parse the file
    with open(file_path, "r") as f:
        soup = BeautifulSoup(f, "lxml")
    
    # Find the <TEXT> section of the document
    text_section = soup.find("text")
    if not text_section:
        print("No <TEXT> section found in the document.")
        return None, None

    # Parse the HTML content inside the <TEXT> section
    html_content = BeautifulSoup(text_section.get_text(), "lxml")

    # Example: Extract net sales and fiscal ending date from tables
    net_sales = None
    fiscal_ending_date = None

    # Look for tables or specific keywords
    for table in html_content.find_all("table"):
        rows = table.find_all("tr")
        for row in rows:
            cells = row.find_all(["td", "th"])
            if len(cells) > 1:
                # Check for "Net Sales" in the row
                if "Net Sales" in cells[0].get_text(strip=True):
                    net_sales = cells[1].get_text(strip=True)
                # Check for "Fiscal Ending Date" in the row
                if "Fiscal Ending Date" in cells[0].get_text(strip=True):
                    fiscal_ending_date = cells[1].get_text(strip=True)
            
            # Stop if both values are found
            if net_sales and fiscal_ending_date:
                break
        if net_sales and fiscal_ending_date:
            break

    return net_sales, fiscal_ending_date

# Example usage
file_path = "/home/neale/cs4221_final_project/dataset/SEC-Filings/AAPL/10-Q/0000320193-17-000009/full-submission.txt"
net_sales, fiscal_ending_date = extract_net_sales_and_fiscal_date(file_path)

print("Net Sales:", net_sales)
print("Fiscal Ending Date:", fiscal_ending_date)

Net Sales: None
Fiscal Ending Date: None


In [None]:
def extract_sec_data():
    for company in os.listdir(SEC_PATH):
        company_filing = os.path.join(SEC_PATH, company, "10-Q")

        for date in os.listdir(company_filing):
            sec_filings = os.path.join(company_filing, date)
            for file in os.listdir(sec_filings):
                file_path = os.path.join(sec_filings, file)
                soup = BeautifulSoup(open(file_path, "r"), "lxml")
                xml_documents = []
                for doc in soup.find_all("document"):
                    type_tag = doc.find("type")
                    if type_tag and "XML" in type_tag.get_text(strip=True):  # Match "XML"
                        xml_documents.append(doc)
                    
                    if len(xml_documents) == 2:  # Stop after finding the first two
                        break
                return xml_documents
                
                break
            break
        break

def parse_filing(file_path):
    soup = BeautifulSoup(open(file_path, "r"), "lxml")
    xml_documents = []
    for doc in soup.find_all("document"):
        type_tag = doc.find("type")
        if type_tag and "XML" in type_tag.get_text(strip=True):  # Match "XML"
            xml_documents.append(doc)
        
        if len(xml_documents) == 2:  # Stop after finding the first two
            break
    
    doc_information, fin_statement = xml_documents

    # parse doc_information
    doc_information.findall("")
    
docs = extract_sec_data()

In [141]:
print(docs)
print(len(docs))

[<document>
<type>XML
<sequence>14
<filename>R1.htm
<description>IDEA: XBRL DOCUMENT
<text>
<title></title>
<link href="report.css" rel="stylesheet" type="text/css"/>
<script src="Show.js" type="text/javascript">/* Do Not Remove This Comment */</script><script type="text/javascript">
							function toggleNextSibling (e) {
							if (e.nextSibling.style.display=='none') {
							e.nextSibling.style.display='block';
							} else { e.nextSibling.style.display='none'; }
							}</script>
<span style="display: none;">v3.4.0.3</span><table border="0" cellspacing="2" class="report" id="idp6820538640">
<tr>
<th class="tl" colspan="1" rowspan="2"><div style="width: 200px;"><strong>Document and Entity Information - shares<br/> shares in Thousands</strong></div></th>
<th class="th" colspan="1">6 Months Ended</th>
<th class="th" colspan="1"></th>
</tr>
<tr>
<th class="th"><div>Mar. 26, 2016</div></th>
<th class="th"><div>Apr. 08, 2016</div></th>
</tr>
<tr class="re">
<td class="pl" style="border

In [None]:
with open("./dataset/profile_estimate/historical_earning_estimates.json", "r") as file:
    earnings_estimate_data = json.load(file) 

In [None]:
calc_moving_averge = """
WITH ohlc_windows AS (
    SELECT 
        time_bucket('1 week', o.timestamp) AS window_start,  
        s.symbol, 
        i.industry,
        AVG(o.close) AS moving_avg,
        LAG(AVG(o.close)) OVER (PARTITION BY s.id ORDER BY time_bucket('1 week', o.timestamp)) AS prev_moving_avg
    FROM ohlc o
    JOIN symbol s ON o.symbol_id = s.id
    JOIN industry i ON s.industry_id = i.id
    WHERE o.timestamp >= '2014-01-01'
    GROUP BY s.id, s.symbol, i.industry, time_bucket('1 week', o.timestamp)
)
SELECT 
    window_start,
    symbol,
    industry,
    moving_avg,
    prev_moving_avg,
    ((moving_avg - prev_moving_avg) / prev_moving_avg) * 100 AS percent_change
FROM ohlc_windows
WHERE ABS((moving_avg - prev_moving_avg) / prev_moving_avg) * 100 IN (5, 10, 15);
"""
db_exec(calc_moving_averge)

In [None]:
create_industry_table = """CREATE TABLE IF NOT EXISTS industry (
    id SERIAL PRIMARY KEY,
    name VARCHAR(50) UNIQUE NOT NULL
);
"""
db_exec(create_industry_table, "INDUSTRY table created successfully")

create_symbol_table = """CREATE TABLE IF NOT EXISTS symbol  (
    id SERIAL PRIMARY KEY,
    name VARCHAR(10) UNIQUE NOT NULL,
    industry_id INTEGER references industry(id) ON DELETE SET NULL
);
"""
db_exec(create_symbol_table, "SYMBOL table created successfully")

create_ohlc_table = """CREATE TABLE IF NOT EXISTS ohlc (
    id SERIAL PRIMARY KEY,
    symbol_id INTEGER REFERENCES symbol(id) ON DELETE CASCADE,
    timestamp TIMESTAMP WITHOUT TIME ZONE NOT NULL,
    open DECIMAL(10,2),
    high DECIMAL(10,2),
    low DECIMAL(10,2),
    close DECIMAL(10,2),
    volume BIGINT
);
"""
db_exec(create_ohlc_table, "OHLC table created successfully")

In [None]:

def extract_industries():
    industries = set()
    with open(PROFILE_PATH, "r") as file:
        company_profiles = json.load(file)
        for companies in company_profiles.values():
            industry = companies[0]["industry"]
            if industry:
                industries.add(industry)
    return industries

def insert_industries():
    industries = extract_industries()
    values = ", ".join(["(%s)"] * len(industries))
    sql = f"INSERT INTO industry (name) VALUES {values} ON CONFLICT (name) DO NOTHING" # batch insertion
    with psycopg2.connect(CONNECTION_URL) as conn:
        with conn.cursor() as cursor:
            cursor.execute(sql, tuple(industries))
        conn.commit()

def extract_symbols():
    symbols = set()
    with open(PROFILE_PATH, "r") as file:
        company_profiles = json.load(file)
        for symbol in company_profiles.keys():
            symbols.add(symbol)
    return symbols

def insert_symbols():
    symbols = extract_symbols()
    values = ", ".join(["(%s)"] * len(symbols))
    sql = f"INSERT INTO symbol (name, industry_id) VALUES {values} ON CONFLICT (name) DO NOTHING;"

    industry_lookup_sql = "SELECT id, name FROM industry;"

    with psycopg2.connect(CONNECTION_URL) as conn:
        with conn.cursor() as cursor:
            cursor.execute(industry_lookup_sql)
            industry_map = {name: id for id, name in cursor.fetchall()}

            params = []
            for symbol in symbols:
                industry_id = industry_map.get(symbol["industry"])
                if industry_id:
                    params.extend([symbol["name"], industry_id])

            if params:
                cursor.execute(sql, tuple(params))
                conn.commit()