In [5]:
# !pip install pandas

## TOC:
* [Files](#this-is-to-create-the-db)
* [Code](#code-to-create-database)
    * [Test Schema](#Test-schema)
    * [SP500 Schema](#sp500-schema)
    * [Finbert](#finbert-related-stuff)
    * [Finbert 10ks](#10k-finbert)
    * [VIX](#vix)
    * [VIX Preds](#vix-preds)
    * [Finbert All Tags](#finbert-all-scores)

# *THIS IS TO CREATE THE DB*
You can download the prebuilt database file `financial_news.db` from SharePoint:

[practicum folder](https://gtvault-my.sharepoint.com/:f:/g/personal/ltupac3_gatech_edu/Eg2gLDzQ8H1JoWUrUIq1G04BPkOXMyxmhgcoL84Q58-5dg?e=80dziH)

[db file](https://gtvault-my.sharepoint.com/:u:/g/personal/ltupac3_gatech_edu/Edi6YX6MKPxMud1e5maTIjsBo04ISTst1j7uoxeSVH2OBA?e=XQD3Ed)

In [4]:
import duckdb
import pandas as pd 
import os 
import xml.etree.ElementTree as ET

In [None]:
import os 
from dotenv import load_dotenv
from pathlib import Path
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises
    ------
    KeyError
        If the variable is not set. Without this check a missing variable
        surfaces later as an opaque TypeError from Path(None) or `path / None`.
    """
    value = os.getenv(name)
    if value is None:
        raise KeyError(f"Environment variable {name!r} is not set (expected in the environment or .env)")
    return value


# DuckDB database file: folder + file name.
DB_PATH = Path(_require_env("DB_PATH"))
DB_FILE = _require_env("DB_FILE")
duckdb_path = DB_PATH / DB_FILE

# Wide-format volume CSV.
VOLUME_NEWS_PATH = Path(_require_env("volume_news_path"))
VOLUME_NEWS_FILE = _require_env("volume_news_file")
VOLUME_FULL_PATH = VOLUME_NEWS_PATH / VOLUME_NEWS_FILE

# Wide-format pricing CSV.
PRICING_NEWS_PATH = Path(_require_env("pricing_news_path"))
PRICING_NEWS_FILE = _require_env("pricing_news_file")
PRICING_FULL_PATH = PRICING_NEWS_PATH / PRICING_NEWS_FILE

# Folder that holds the per-ticker headline folders.
HEADLINE_JAN25_PATH = Path(_require_env("headline_jan25_path"))

# VIX CSV.
VIX_PATH = Path(_require_env("vix_path"))
VIX_FILE = _require_env("vix_file")
VIX_FULL_PATH = VIX_PATH / VIX_FILE

# Code to create database
Instructions:
1. For individual files (company_info_news.txt, volume_news.csv, etc.), set the relative path of each file in its corresponding variable below.
2. For headline data, set the relative path to the folder that contains the ticker folders.
    - Example: `MultiCap_News/Test` contains the individual ticker folders.
    - The code picks up the files in those folders recursively.

# Test Schema

In [None]:
# Lower-case aliases used by the loader cells. They point at the .env-backed
# constants instead of the hard-coded 'MultiCap_News\\...' / 'new_data\\...'
# paths this cell used before.
volume_news_path, pricing_news_path, headline_jan25_path = (
    VOLUME_FULL_PATH,
    PRICING_FULL_PATH,
    HEADLINE_JAN25_PATH,
)

# Connection shared by every cell of the Test-schema section.
con = duckdb.connect(duckdb_path)

In [4]:
# con.close()

In [5]:
# create the DDLs and indexes
# ddl_statements is executed in list order further down in this cell: the
# Test schema first, then one CREATE TABLE per entry.
ddl_statements = [
    "CREATE SCHEMA IF NOT EXISTS Test;",
    # Parsed feed articles, one row per (guid, ticker).
    """
    CREATE TABLE IF NOT EXISTS Test.Articles (
        guid TEXT,
        ticker TEXT,
        description TEXT,
        article_link TEXT,
        article_pubDate TIMESTAMP,
        article_title TEXT,
        language TEXT,
        lastBuildDate TIMESTAMP,
        link TEXT,
        title TEXT,
        PRIMARY KEY (guid, ticker)
    );
    """,
    # Company reference data keyed by ticker.
    """
    CREATE TABLE IF NOT EXISTS Test.Company_Info_News (
        ticker TEXT PRIMARY KEY,
        name TEXT,
        subindustry TEXT
    );
    """,
    # Long-format prices, one row per (trading_day_date, ticker).
    """
    CREATE TABLE IF NOT EXISTS Test.Pricing_News (
        trading_day_date DATE,
        ticker TEXT,
        price FLOAT,
        PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    # Long-format volumes, one row per (trading_day_date, ticker).
    """
    CREATE TABLE IF NOT EXISTS Test.Volume_News (
        trading_day_date DATE,
        ticker TEXT,
        volume INT,
        PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    ############ Gold Layer ############
    # Price and volume side by side per (trading_day_date, ticker).
    """
    CREATE TABLE IF NOT EXISTS Test.Market_Data_Daily_Processing (
    trading_day_date DATE,
    ticker TEXT,
    price FLOAT,
    volume INT,
    PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    # Same columns as Market_Data_Daily_Processing plus headline_count.
    """
    CREATE TABLE IF NOT EXISTS Test.Market_Data_Test (
    trading_day_date DATE,
    ticker TEXT,
    price FLOAT,
    volume INT,
    headline_count INT,
    PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    # One row per trading date; the two *_Price_Movement tables below hold
    # foreign keys to this table.
    """
    CREATE TABLE IF NOT EXISTS Test.Trading_Calendar (
    trading_date DATE PRIMARY KEY
    );
    """,
    # Test.Articles columns plus the trading date each article is mapped to.
    """
    CREATE TABLE IF NOT EXISTS Test.Articles_Trading_Day (
    guid TEXT,
    ticker TEXT,
    mapped_trading_date DATE,
    description TEXT,
    article_link TEXT,
    article_pubDate TIMESTAMP,
    article_title TEXT,
    language TEXT,
    lastBuildDate TIMESTAMP,
    link TEXT,
    title TEXT,
    PRIMARY KEY (guid, ticker)
    );
    """,
    # Article count per trading date.
    """
    CREATE TABLE IF NOT EXISTS Test.Market_Article_Summary (
    trading_date DATE PRIMARY KEY,
    article_count INT
    );
    """,
    # Price change from a trading date to next_trading_day, per ticker.
    # Both date columns reference Test.trading_calendar.
    """
    CREATE TABLE IF NOT EXISTS Test.Daily_Price_Movement (
    trading_date DATE,
    ticker TEXT,
    close_price FLOAT,
    next_trading_day DATE,
    close_price_next FLOAT,
    price_change FLOAT,
    price_change_percentage FLOAT,
    PRIMARY KEY (trading_date, ticker),
    FOREIGN KEY (trading_date) REFERENCES Test.trading_calendar(trading_date),
    FOREIGN KEY (next_trading_day) REFERENCES Test.trading_calendar(trading_date)
    );
    """,
    # Price change from trading_week_start to trading_week_end, per ticker.
    # Both date columns reference Test.trading_calendar.
    """
    CREATE TABLE IF NOT EXISTS Test.Weekly_Price_Movement (
    trading_week_start DATE,
    ticker TEXT,
    close_price_start FLOAT,
    trading_week_end DATE,
    close_price_end FLOAT,
    price_change FLOAT,
    price_change_percentage FLOAT,
    PRIMARY KEY (trading_week_start, ticker),
    FOREIGN KEY (trading_week_start) REFERENCES Test.trading_calendar(trading_date),
    FOREIGN KEY (trading_week_end) REFERENCES Test.trading_calendar(trading_date)
    );
    """,
    # Daily movements labelled 'Drop' or 'Surge' in movement_type.
    """
    CREATE TABLE IF NOT EXISTS Test.extreme_price_movements (
    trading_date DATE,
    ticker TEXT,
    close_price FLOAT,
    price_change FLOAT,
    price_change_percentage FLOAT,
    movement_type TEXT,  -- Drop|Surge
    PRIMARY KEY (trading_date, ticker)
    );
    """,
    # Articles linked to an extreme movement, with title/description sentiment.
    """
    CREATE TABLE IF NOT EXISTS Test.articles_extreme_drops (
    trading_date DATE,
    ticker TEXT,
    guid TEXT,
    mapped_trading_date DATE, 
    title_sentiment_score FLOAT,
    title_sentiment_label TEXT,
    description_sentiment_score FLOAT,
    description_sentiment_label TEXT,
    PRIMARY KEY (trading_date, ticker, guid)
    );
    """
]

# Drop order matters on a re-run: Daily_Price_Movement and Weekly_Price_Movement
# declare FOREIGN KEYs on Test.Trading_Calendar, and DuckDB refuses to drop a
# table that another table still references. The two dependents are therefore
# dropped before Trading_Calendar.
drop_statements = [
    "DROP TABLE IF EXISTS Test.Articles;",
    "DROP TABLE IF EXISTS Test.Company_Info_News;",
    "DROP TABLE IF EXISTS Test.Pricing_News;",
    "DROP TABLE IF EXISTS Test.Volume_News;",
    "DROP TABLE IF EXISTS Test.Market_Data_Daily_Processing;",
    "DROP TABLE IF EXISTS Test.Market_Data_Test;",
    "DROP TABLE IF EXISTS Test.Articles_Trading_Day;",
    "DROP TABLE IF EXISTS Test.Market_Article_Summary;",
    "DROP TABLE IF EXISTS Test.Daily_Price_Movement;",
    "DROP TABLE IF EXISTS Test.Weekly_Price_Movement;",
    "DROP TABLE IF EXISTS Test.Trading_Calendar;",
    "DROP TABLE IF EXISTS Test.extreme_price_movements;",
    "DROP TABLE IF EXISTS Test.articles_extreme_drops;"
]

# One statement per list entry (a missing comma previously fused two of them
# into a single string), and one distinct name per index: with IF NOT EXISTS a
# re-used index name turns the second CREATE INDEX into a silent no-op.
index_statements = [
    "CREATE INDEX IF NOT EXISTS idx_articles_pubDate ON Test.Articles (article_pubDate);",
    "CREATE INDEX IF NOT EXISTS idx_articles_trading_day_pubDate ON Test.Articles_Trading_Day (article_pubDate);",
    "CREATE INDEX IF NOT EXISTS idx_stock_movement_ticker ON Test.Daily_Price_Movement (ticker);",
    "CREATE INDEX IF NOT EXISTS idx_weekly_price_movement_ticker ON Test.Weekly_Price_Movement (ticker);"
]

# Rebuild the Test schema: all drops first, then the DDL, then the indexes.
for statement in (*drop_statements, *ddl_statements, *index_statements):
    con.execute(statement)


## Load `Company_Info_News`

In [None]:
# # its all in one line
# with open(company_txt_path, 'r') as file:
#     lines = file.readline().split('\\n')
#     # con.execute("TRUNCATE Company_Info_News")
#     for line in lines[1:]:
#         line = line.strip().split('|')
#         # DONT RUN THIS TWICE BY MISTAKE!
#         con.execute("INSERT INTO Test.Company_Info_News VALUES (?,?,?)", line)

## Load `Volume_News` 

In [8]:
# Wide CSV: one 'Date' column plus one column per ticker.
df = pd.read_csv(volume_news_path)

# Reshape to one row per (Date, Ticker) and coerce the types in the same chain;
# values that are not numeric become NaN.
volume_long_df = (
    df.melt(id_vars=['Date'], var_name='Ticker', value_name='Volume')
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Volume=lambda frame: pd.to_numeric(frame['Volume'], errors='coerce'),
    )
)

# The SQL reads the DataFrame above by its Python variable name.
con.execute("INSERT INTO Test.Volume_News (trading_day_date, ticker, Volume) SELECT Date, ticker, Volume FROM volume_long_df")

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `Pricing_News`

In [9]:
# Wide CSV: one 'Date' column plus one column per ticker.
df = pd.read_csv(pricing_news_path)

# Reshape to one row per (Date, Ticker) and coerce the types in the same chain;
# values that are not numeric become NaN.
pricing_long_df = (
    df.melt(id_vars=['Date'], var_name='Ticker', value_name='Price')
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date']),
        Price=lambda frame: pd.to_numeric(frame['Price'], errors='coerce'),
    )
)

# The SQL reads the DataFrame above by its Python variable name.
con.execute("INSERT INTO Test.Pricing_News (trading_day_date, Ticker, price) SELECT Date, ticker, Price FROM pricing_long_df")

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `Market_Data_Daily_Processing`

In [10]:
# con.execute("TRUNCATE Market_Data_Daily_Processing")
# Fill Test.Market_Data_Daily_Processing with every Test.Pricing_News row plus
# the volume of the Test.Volume_News row with the same (trading_day_date, ticker).
# LEFT JOIN keeps priced rows that have no volume row; their volume is NULL.
con.execute("""
INSERT INTO Test.Market_Data_Daily_Processing
SELECT 
    pn.trading_day_date,
    pn.ticker,
    pn.price,
    vn.volume
FROM 
    Test.Pricing_News pn
LEFT JOIN 
    Test.Volume_News vn 
ON 
    pn.trading_day_date = vn.trading_day_date AND pn.ticker = vn.ticker
""")

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `Trading_Calendar`

In [11]:
# pricing_dates = pricing_long_df['Date'].drop_duplicates()
# volume_dates = volume_long_df['Date'].drop_duplicates()

# trading_dates = pd.concat([pricing_dates, volume_dates]).drop_duplicates().sort_values()

# # make sure to have correct col name
# trading_dates_df = pd.DataFrame(trading_dates, columns=['trading_date'])
# con.execute("TRUNCATE Trading_Calendar")
# Fill Test.Trading_Calendar with every distinct date that appears in either
# Test.Pricing_News or Test.Volume_News.
con.execute(
"""
INSERT INTO Test.Trading_Calendar
SELECT DISTINCT trading_day_date AS trading_date
FROM (
    SELECT trading_day_date FROM Test.Pricing_News
    UNION
    SELECT trading_day_date FROM Test.Volume_News
) AS all_dates
ORDER BY trading_date;
"""
)

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `Articles`

In [12]:
def xml_loader(base_dir):
    """Load every XML feed found under `base_dir` into Test.Articles.

    The tree is walked with os.walk; the name of the folder that holds a file
    is used as that file's ticker. For each folder, the <item> entries of each
    file's <channel> are collected, rows whose pubDate cannot be parsed are set
    aside, and the remaining rows are de-duplicated on (guid, ticker) and
    inserted through the module-level DuckDB connection `con`.

    Parameters
    ----------
    base_dir : str or path-like
        Folder that contains the per-ticker folders.

    Returns
    -------
    pandas.DataFrame
        Every row whose `article_pubDate` could not be parsed, still carrying
        the helper column `parsed_date`. The same frame is also written to
        "failed_article_dates.csv" in the current working directory.

    Notes
    -----
    Malformed or unreadable files are reported with print() and skipped, and a
    failed insert for one ticker is reported without aborting the whole run.
    """
    # One frame of failed rows per folder; concatenated once at the end rather
    # than growing a DataFrame inside the loop.
    failed_frames = []

    for root, _dirs, files in os.walk(base_dir):
        # extract ticker from foldername
        ticker = os.path.basename(root)
        data = []

        for file in files:
            file_path = os.path.join(root, file)

            try:
                channel = ET.parse(file_path).getroot().find('channel')
                if channel is None:
                    continue

                # Channel-level metadata is repeated on every article row.
                feed_fields = {
                    "language": channel.findtext("language"),
                    "lastBuildDate": channel.findtext("lastBuildDate"),
                    "link": channel.findtext("link"),
                    "title": channel.findtext("title"),
                }

                # Key order matters: the INSERT below uses SELECT *, so the
                # columns must line up with Test.Articles.
                for item in channel.findall("item"):
                    data.append({
                        "guid": item.findtext("guid"),
                        "ticker": ticker,
                        "description": item.findtext("description"),
                        "article_link": item.findtext("link"),
                        "article_pubDate": item.findtext("pubDate"),
                        "article_title": item.findtext("title"),
                        **feed_fields,
                    })
            # ET.ParseError is the real exception class; the former
            # `ET.parseError` does not exist and raised AttributeError as soon
            # as any file failed to parse.
            except ET.ParseError as e:
                print(f"Error parsing file {file_path}: {e}")
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

        if not data:
            continue

        articles_df = pd.DataFrame(data)
        articles_df['parsed_date'] = pd.to_datetime(articles_df['article_pubDate'], errors='coerce')
        articles_df['lastBuildDate'] = pd.to_datetime(articles_df['lastBuildDate'], errors='coerce')

        # separate failed cases to avoid nulls
        date_ok = articles_df['parsed_date'].notna()
        failed_frames.append(articles_df[~date_ok])

        # `current_valid` is referenced by name inside the SQL below, so the
        # variable must keep this name and live in this function.
        current_valid = (
            articles_df[date_ok]
            .assign(article_pubDate=lambda frame: frame['parsed_date'])
            .drop(columns=['parsed_date'])
            .drop_duplicates(subset=['guid', 'ticker'])
        )

        try:
            # ON CONFLICT guards against rows already loaded by an earlier folder or run
            con.execute("INSERT INTO Test.Articles SELECT * FROM current_valid ON CONFLICT (guid, ticker) DO NOTHING")
            print("inserted data for", ticker)
        except Exception as e:
            print(f"Error inserting data for {ticker}: {e}")

    if failed_frames:
        failed_parses = pd.concat(failed_frames, ignore_index=True)
    else:
        failed_parses = pd.DataFrame()

    failed_parses.to_csv("failed_article_dates.csv", index=False)
    return failed_parses

In [13]:
# GETTING NULLS! gonna fix the coercion logic
# con.execute("truncate Articles")

# Load every feed under the headline folder into Test.Articles. The rows whose
# pubDate could not be parsed are returned and kept in failed_df.
failed_df = xml_loader(headline_jan25_path)

# load multicap Test
# failed_df = xml_loader(multicap_Test)
# load new Test
# failed2_df = xml_loader(headline_august24_path)

inserted data for A
inserted data for AA
inserted data for AAL
inserted data for AAON
inserted data for AAPL
inserted data for ABBV
inserted data for ABCB
inserted data for ABG
inserted data for ABNB
inserted data for ABT
inserted data for ACA
inserted data for ACGL
inserted data for ACHC
inserted data for ACI
inserted data for ACIW
inserted data for ACM
inserted data for ACN
inserted data for ACT
inserted data for ADBE
inserted data for ADC
inserted data for ADI
inserted data for ADM
inserted data for ADMA
inserted data for ADP
inserted data for ADSK
inserted data for ADT
inserted data for AEE
inserted data for AEIS
inserted data for AEO
inserted data for AEP
inserted data for AER
inserted data for AES
inserted data for AFG
inserted data for AFL
inserted data for AFRM
inserted data for AGCO
inserted data for AGNC
inserted data for AGO
inserted data for AIG
inserted data for AIT
inserted data for AIZ
inserted data for AJG
inserted data for AKAM
inserted data for AL
inserted data for AL

  df['parsed_date'] = pd.to_datetime(df['article_pubDate'], errors='coerce')


inserted data for KBR
inserted data for KD
inserted data for KDP
inserted data for KEX
inserted data for KEY
inserted data for KEYS
inserted data for KFY
inserted data for KGC
inserted data for KHC
inserted data for KIM
inserted data for KKR
inserted data for KLAC
inserted data for KMB
inserted data for KMI
inserted data for KMPR
inserted data for KMX
inserted data for KNF
inserted data for KNSL
inserted data for KNX
inserted data for KO
inserted data for KR
inserted data for KRC
inserted data for KRG
inserted data for KRYS
inserted data for KTB
inserted data for KVUE
inserted data for KVYO
inserted data for L
inserted data for LAD
inserted data for LAMR
inserted data for LANC
inserted data for LAND
inserted data for LAZ
inserted data for LBRDA
inserted data for LBRDK
inserted data for LBTYA
inserted data for LBTYK
inserted data for LCID
inserted data for LDOS
inserted data for LEA
inserted data for LECO
inserted data for LEN
inserted data for LEVI
inserted data for LFUS
inserted data 

In [18]:
# try again with the faulty data
# failed_df is left untouched so this cell can be re-run, and an empty result
# from the loader (which may have no columns at all) is skipped instead of
# raising KeyError.
if failed_df.empty:
    print("No articles with an unparseable pubDate to retry")
else:
    failed_retry_df = (
        failed_df
        .assign(article_pubDate=pd.to_datetime(failed_df['article_pubDate'], errors='coerce'))
        .drop(columns=['parsed_date'], errors='ignore')
        .drop_duplicates(subset=['guid', 'ticker'])
    )
    try:
        # ON CONFLICT skips rows that are already present in Test.Articles
        con.execute("INSERT INTO Test.Articles SELECT * FROM failed_retry_df ON CONFLICT (guid, ticker) DO NOTHING")
    except Exception as e:
        print(f"Error inserting data: {e}")

In [None]:
# # try again with the faulty data 
# failed2_df['article_pubDate'] = pd.to_datetime(failed2_df['article_pubDate'], errors='coerce')
# failed2_df = failed2_df.drop(columns=['parsed_date'])
# failed2_df.drop_duplicates(subset=['guid', 'ticker'], inplace=True)
# try:
#     # adding this too just in case
#     con.execute("INSERT INTO Test.Articles SELECT * FROM failed2_df ON CONFLICT (guid, ticker) DO NOTHING")
# except Exception as e:
#     print(f"Error inserting data: {e}")

## Load `Articles_Trading_Day`

In [19]:
# Rebuild Test.Articles_Trading_Day from Test.Articles.
# Each article gets an adjusted publication date (the next calendar day when
# its time of day is 16:00:00 or later) and is mapped to the earliest
# Test.Trading_Calendar date on or after that adjusted date. When no such
# calendar date exists, the article's own publication date is used.
# NOTE(review): the in-query comment says "4 PM EST", but nothing here converts
# time zones - confirm article_pubDate is stored in the intended zone.
con.execute("truncate Test.Articles_Trading_Day")
con.execute("""
INSERT INTO Test.Articles_Trading_Day
SELECT 
    a.guid,
    a.ticker,
    coalesce(MIN(tc.trading_date), cast(a.article_pubDate as Date)) AS mapped_trading_date,
    a.description,
    a.article_link,
    a.article_pubDate,
    a.article_title,
    a.language,
    a.lastBuildDate,
    a.link,
    a.title
FROM (
    SELECT 
        guid,
        ticker,
        description,
        article_link,
        article_pubDate,
        article_title,
        language,
        lastBuildDate,
        link,
        title,
        -- 4 PM EST adjust
        CASE 
            WHEN CAST(article_pubDate AS TIME) >= '16:00:00' 
            THEN CAST(article_pubDate AS DATE) + INTERVAL '1 day'
            ELSE CAST(article_pubDate AS DATE)
        END AS adjusted_pub_date
    FROM Test.Articles
) a
LEFT JOIN 
    Test.Trading_Calendar tc
ON 
    tc.trading_date >= a.adjusted_pub_date
GROUP BY 
    a.guid, a.ticker, a.description, a.article_link, a.article_pubDate, 
    a.article_title, a.language, a.lastBuildDate, a.link, a.title;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Create `Market_Data_Test`

In [20]:
# con.execute("Truncate Market_Data_Test")
# Fill Test.Market_Data_Test: every Test.Market_Data_Daily_Processing row plus
# headline_count, the number of distinct article guids mapped to that ticker
# and trading day (0 when no article matches).
# The table is not cleared first, so its primary key rejects a second run of
# this cell unless the table has been recreated.
con.execute("""
INSERT INTO Test.Market_Data_Test
SELECT 
    md.trading_day_date,
    md.ticker,
    md.price,
    md.volume,
    COALESCE(COUNT(DISTINCT atd.guid), 0) AS headline_count
FROM 
    Test.Market_Data_Daily_Processing md
LEFT JOIN 
    Test.Articles_Trading_Day atd
ON 
    md.ticker = atd.ticker AND md.trading_day_date = atd.mapped_trading_date
GROUP BY 
    md.trading_day_date, md.ticker, md.price, md.volume;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `market_article_summary`

In [21]:
# Rebuild Test.Market_Article_Summary: number of distinct article guids per
# mapped trading date.
# - The table is truncated first (as the neighbouring load cells do) so the cell
#   can be re-run without violating the primary key.
# - Articles without a mapped trading date are excluded: trading_date is the
#   primary key and cannot hold NULL, which happens when an article was stored
#   with no publication date.
con.execute("truncate Test.Market_Article_Summary")
con.execute('''
INSERT INTO Test.Market_Article_Summary
SELECT 
    atd.mapped_trading_date AS trading_date,
    COUNT(DISTINCT atd.guid) AS total_unique_articles
FROM 
    Test.Articles_Trading_Day atd
WHERE
    atd.mapped_trading_date IS NOT NULL
GROUP BY 
    atd.mapped_trading_date;
''')

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `Daily_Price_Movement`

In [22]:
# Rebuild Test.Daily_Price_Movement from Test.market_data_daily_processing.
# For each (date, ticker) row, sp2 is the same ticker's row on the earliest
# later date (found by the correlated MIN subquery). LEFT JOIN keeps rows with
# no later date; their next-day and change columns are NULL.
# price_change and price_change_percentage are rounded to 2 decimals.
con.execute("truncate Test.Daily_Price_Movement")
con.execute("""
            INSERT INTO Test.Daily_Price_Movement
            SELECT 
                sp1.trading_day_date AS trading_date,
                sp1.ticker,
                sp1.price AS close_price,
                sp2.trading_day_date AS next_trading_day,
                sp2.price AS close_price_next,
                ROUND(sp2.price - sp1.price, 2) AS price_change,
                ROUND((sp2.price - sp1.price) / sp1.price * 100, 2) AS price_change_percentage
            FROM Test.market_data_daily_processing sp1
            LEFT JOIN Test.market_data_daily_processing sp2 
            ON sp2.ticker = sp1.ticker 
            AND sp2.trading_day_date = (
                SELECT MIN(sp3.trading_day_date) 
                FROM Test.market_data_daily_processing sp3
                WHERE sp3.ticker = sp1.ticker
                AND sp3.trading_day_date > sp1.trading_day_date
            );
""")

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `Weekly_Price_Movement`

In [23]:
# Rebuild Test.Weekly_Price_Movement.
# WeeklyPrices: first and last trading date per ticker and calendar week
#               (DATE_TRUNC('week')), restricted to weekdays (DOW 1-5).
# StartPrices / EndPrices: the ticker's price on those first / last dates.
# Final SELECT: pairs a start row with an end row of the same ticker and
#               computes the change, rounded to 2 decimals.
# NOTE(review): the final join requires trading_week_start to equal
# trading_week_end - 4 days, so a week whose first and last trading dates are
# fewer than 4 days apart produces no row - confirm this is intended.
con.execute("truncate Test.Weekly_Price_Movement")
con.execute("""
            INSERT INTO Test.Weekly_Price_Movement
            WITH WeeklyPrices AS (
            SELECT 
                ticker,
                MIN(tc.trading_date) AS trading_week_start,
                MAX(tc.trading_date) AS trading_week_end
            FROM Test.market_data_daily_processing mdp
            JOIN Test.trading_calendar tc 
            ON mdp.trading_day_date = tc.trading_date
            WHERE EXTRACT(DOW FROM tc.trading_date) BETWEEN 1 AND 5  -- Only weekdays
            GROUP BY ticker, DATE_TRUNC('week', tc.trading_date)
        ),
        StartPrices AS (
            SELECT 
                mdp.trading_day_date AS trading_week_start, 
                mdp.ticker, 
                mdp.price AS close_price
            FROM Test.market_data_daily_processing mdp
            JOIN WeeklyPrices wp 
            ON mdp.ticker = wp.ticker 
            AND mdp.trading_day_date = wp.trading_week_start
        ),
        EndPrices AS (
            SELECT 
                mdp.trading_day_date AS trading_week_end, 
                mdp.ticker, 
                mdp.price AS close_price_end
            FROM Test.market_data_daily_processing mdp
            JOIN WeeklyPrices wp 
            ON mdp.ticker = wp.ticker 
            AND mdp.trading_day_date = wp.trading_week_end
        )
        SELECT 
            sp.trading_week_start,
            sp.ticker,
            sp.close_price as close_price_start,
            ep.trading_week_end,
            ep.close_price_end,
            ROUND(ep.close_price_end - sp.close_price, 2) AS price_change,
            ROUND((ep.close_price_end - sp.close_price) / sp.close_price * 100, 2) AS price_change_percentage
        FROM StartPrices sp
        JOIN EndPrices ep 
        ON sp.ticker = ep.ticker 
        AND sp.trading_week_start = ep.trading_week_end - INTERVAL '4 days';
            """)

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `extreme_price_movements`

In [24]:
# Rebuild Test.extreme_price_movements: the Test.daily_price_movement rows whose
# price_change_percentage is beyond +/-5, labelled 'Drop' (below -5) or
# 'Surge' (above 5).
con.execute("truncate Test.extreme_price_movements")
con.execute("""
            INSERT INTO Test.extreme_price_movements
            SELECT trading_date, ticker, close_price, price_change, price_change_percentage,
                CASE 
                    WHEN price_change_percentage < -5 THEN 'Drop'
                    WHEN price_change_percentage > 5 THEN 'Surge'
                END AS movement_type
            FROM Test.daily_price_movement
            WHERE ABS(price_change_percentage) > 5;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x21c411619b0>

## Load `articles_extreme_drops`

In [27]:
##TODO: fix this logic for extreme drops and articles

In [11]:
# Rebuild Test.articles_extreme_drops: for every extreme movement, the articles
# of the same ticker mapped to the movement date or the 3 days before it, with
# their FinBERT title/description sentiment where available.
# Test.finbert_analysis is created in the "FinBERT All Scores" section further
# down, so it must already exist in the database when this cell runs.
# NOTE(review): both 'Drop' and 'Surge' movements are included although the
# target table is named *_drops - confirm against the TODO above.
con.execute("truncate Test.articles_extreme_drops")

extreme_articles_df = con.execute("""
            SELECT epm.trading_date, epm.ticker, a.guid, a.mapped_trading_date,
                fs.finbert_title_score AS title_sentiment_score,
                fs.finbert_title_label AS title_sentiment_label,
                fs.finbert_description_score AS description_sentiment_score,
                fs.finbert_description_label AS description_sentiment_label
            FROM Test.extreme_price_movements epm
            JOIN Test.articles_trading_day a
            ON epm.ticker = a.ticker
            AND a.mapped_trading_date BETWEEN epm.trading_date - INTERVAL '3 days' AND epm.trading_date
            LEFT JOIN Test.finbert_analysis fs
            ON a.guid = fs.guid
""").df().drop_duplicates(subset=['trading_date', 'ticker', 'guid'])  # one row per primary key

# Columns are named on both sides so the insert does not depend on their order.
con.execute("""
            INSERT INTO Test.articles_extreme_drops (
                trading_date, ticker, guid, mapped_trading_date,
                title_sentiment_score, title_sentiment_label,
                description_sentiment_score, description_sentiment_label
            )
            SELECT
                trading_date, ticker, guid, mapped_trading_date,
                title_sentiment_score, title_sentiment_label,
                description_sentiment_score, description_sentiment_label
            FROM extreme_articles_df
            ON CONFLICT (trading_date, ticker, guid) DO NOTHING
""")

<duckdb.duckdb.DuckDBPyConnection at 0x25659245d30>

In [12]:
# Close the connection used for the Test-schema section; the next section opens its own.
con.close()

In [None]:
# raise Exception("Test Headlines Done")

# VIX DATASET

In [None]:
# Path of the VIX CSV (from the configuration cell) and a fresh connection
# for this section.
vix = VIX_FULL_PATH
con = duckdb.connect(duckdb_path)

In [29]:
# (Re)create sp500.VIX_Index: one VIX value per date.
ddl = [
    """
    CREATE TABLE sp500.VIX_Index (
    vix_date DATE PRIMARY KEY,
    vix_value FLOAT
);
    """
]

drop = [
    "DROP TABLE IF EXISTS sp500.VIX_Index;"
]

# No visible cell of this notebook creates the sp500 schema, so make sure it
# exists before the table is dropped and recreated.
con.execute("CREATE SCHEMA IF NOT EXISTS sp500;")

for d in drop:
    con.execute(d)

for d in ddl:
    con.execute(d)

## Load `VIX_Index`

In [30]:
# The file's own header row is skipped and replaced with the column names of
# sp500.VIX_Index.
vix_df = pd.read_csv(
    vix,
    skiprows=1,
    names=["vix_date", "vix_value"],
    parse_dates=["vix_date"],
)
vix_df.head()

Unnamed: 0,vix_date,vix_value
0,1986-01-02,18.07
1,1986-01-03,17.96
2,1986-01-06,17.05
3,1986-01-07,17.39
4,1986-01-08,19.97


In [31]:
# Number of missing values per column.
vix_df.isna().sum()

vix_date     0
vix_value    0
dtype: int64

In [32]:
# Replace the contents of sp500.VIX_Index with the rows of vix_df (the SQL
# reads the DataFrame by its variable name; columns are matched by position).
con.execute("Truncate sp500.VIX_Index")
con.execute("INSERT INTO sp500.VIX_Index select * from vix_df")

<duckdb.duckdb.DuckDBPyConnection at 0x21c42ee4b30>

In [33]:
# Close the connection used for the VIX section.
con.close()

In [None]:
# raise Exception("DONE WITH VIX")

Exception: DONE WITH VIX

# FinBERT All Scores

In [None]:
# Scores CSV produced by finbert_all_scores_dask_NEW_DATA.ipynb.
finbert_all_tags = "../finbert/finbert_sentiments_scores.csv"

# Fresh connection for the FinBERT section.
con = duckdb.connect(duckdb_path)

In [6]:
drop = [
    "DROP TABLE IF EXISTS Test.finbert_analysis;"
]

# FinBERT label, score and per-class values for the title and the description
# of each (guid, ticker).
ddl = ["""
CREATE TABLE IF NOT EXISTS Test.finbert_analysis (
    guid UUID ,
    ticker VARCHAR(10) NOT NULL,
    description TEXT,
    article_title TEXT,
    finbert_title_label VARCHAR(20) NOT NULL,
    finbert_title_score FLOAT NOT NULL,
    finbert_title_positive FLOAT NOT NULL,
    finbert_title_neutral FLOAT NOT NULL,
    finbert_title_negative FLOAT NOT NULL,
    finbert_description_label VARCHAR(20) NOT NULL,
    finbert_description_score FLOAT NOT NULL,
    finbert_description_positive FLOAT NOT NULL,
    finbert_description_neutral FLOAT NOT NULL,
    finbert_description_negative FLOAT NOT NULL,
    PRIMARY KEY (guid, ticker)
);

"""]

# Recreate the table from scratch: the drop runs first, then the DDL.
for statement in drop + ddl:
    con.execute(statement)

## `Test.finbert_analysis`

In [7]:
# Columns copied from the CSV, in the column order of Test.finbert_analysis.
finbert_columns = [
    "guid",
    "ticker",
    "description",
    "article_title",
    "finbert_title_label",
    "finbert_title_score",
    "finbert_title_positive",
    "finbert_title_neutral",
    "finbert_title_negative",
    "finbert_description_label",
    "finbert_description_score",
    "finbert_description_positive",
    "finbert_description_neutral",
    "finbert_description_negative",
]

con.execute("truncate Test.finbert_analysis")
df = pd.read_csv(finbert_all_tags)
# The SQL reads the DataFrame `df` by its variable name.
con.execute("INSERT INTO Test.finbert_analysis select " + ", ".join(finbert_columns) + " from df")

<duckdb.duckdb.DuckDBPyConnection at 0x2567146ca30>

In [8]:
# Close the connection used for the FinBERT section.
con.close()

In [None]:
# raise Exception("DONE WITH FINBERT")

Exception: DONE WITH FINBERT

## Training Data for VIX Model

In [None]:
# Fresh connection for the training-data section.
con = duckdb.connect(duckdb_path)

In [None]:
drop = [
    "DROP TABLE IF EXISTS Test.weekly_training_data;"
]

# One row per article/date/ticker combination with VIX, price, volume and
# sentiment features (see the load cell that fills this table).
ddl = [
    """
CREATE TABLE Test.weekly_training_data(guid VARCHAR,
date_t DATE,
ticker VARCHAR,
subindustry VARCHAR,
vix_t FLOAT,
vix_t_7_past FLOAT,
vix_t_7_future FLOAT,
price_t FLOAT,
price_t_7_past FLOAT,
price_change_t_7 FLOAT,
volume_t INTEGER,
volume_t_7_past INTEGER,
volume_change_t_7 DOUBLE,
sentiment_label_t VARCHAR,
sentiment_positive_t FLOAT,
sentiment_neutral_t FLOAT,
sentiment_negative_t FLOAT);
"""
]

# Recreate the table from scratch: the drop runs first, then the DDL.
for statement in drop + ddl:
    con.execute(statement)

`Test.weekly_training_data`

In [None]:
# Rebuild Test.weekly_training_data.
# vix_lagged:        each sp500.vix_weekly_training row with the previous and
#                    next row's VIX value (LAG/LEAD by 1 row); the first and
#                    last rows fall back to the constants 17.22 and 23.39.
# market_lagged:     each Test.Market_Data_Test row with the previous row's
#                    price/volume for the same ticker and the percentage
#                    change; missing price/volume values are stored as -1.
# article_sentiment: description sentiment from Test.finbert_analysis for each
#                    Test.Articles_Trading_Day article, joined on guid.
# Final SELECT:      VIX rows joined to market rows on date, then left-joined
#                    to sentiment and to sp500.company_info; rows without
#                    sentiment default to NEUTRAL with 0 / 1 / 0.
# sp500.vix_weekly_training and sp500.company_info are not created in the
# visible part of this notebook and must already exist.
# NOTE(review): the *_7_past / *_7_future columns come from a 1-row LAG/LEAD;
# for market_lagged that is the previous Market_Data_Test row, not necessarily
# 7 days earlier - confirm the intended offset.
con.execute("truncate Test.weekly_training_data")
con.execute("""
            INSERT INTO Test.weekly_training_data
            SELECT * FROM (
WITH vix_lagged AS (
    SELECT 
        v1.vix_date AS date_t,
        v1.vix_value AS vix_t,
        COALESCE(LAG(v1.vix_value, 1) OVER (ORDER BY v1.vix_date), 17.22) AS vix_t_7_past, -- same add last val
        COALESCE(LEAD(v1.vix_value, 1) OVER (ORDER BY v1.vix_date), 23.39) AS vix_t_7_future -- adding to handle last vix day that we don't know
    FROM sp500.vix_weekly_training v1
)
--select * from vix_lagged;
,
-- ANYTHING WITH -1 in price_t, price_t7 or volume cols should be removed! 
market_lagged AS (
    SELECT 
        m1.trading_day_date AS date_t,
        m1.ticker,
        COALESCE(m1.price, -1) AS price_t,  -- Set -1 if all price data is NULL
        COALESCE(LAG(m1.price, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date), -1) AS price_t_7_past,
        (CASE 
            WHEN LAG(m1.price, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date) IS NOT NULL 
            THEN ((m1.price - LAG(m1.price, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date)) / 
                  LAG(m1.price, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date)) * 100
            ELSE NULL 
        END) AS price_change_t_7,
        COALESCE(m1.volume, -1) AS volume_t,  -- Set -1 if all volume data is NULL
        COALESCE(LAG(m1.volume, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date), -1) AS volume_t_7_past,
        (CASE 
            WHEN LAG(m1.volume, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date) IS NOT NULL 
            THEN ((m1.volume - LAG(m1.volume, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date)) / 
                  NULLIF(LAG(m1.volume, 1) OVER (PARTITION BY m1.ticker ORDER BY m1.trading_day_date), 0)) * 100
            ELSE NULL 
        END) AS volume_change_t_7
    FROM Test.Market_Data_Test m1
--    where ticker in (
--    	select * from Test.sp500_active_stocks
--    )
)
--select * from market_lagged;
,
article_sentiment AS (
    SELECT 
    	a.guid,
        a.mapped_trading_date AS date_t,
        a.ticker,
        f.finbert_description_positive AS sentiment_positive_t,
        f.finbert_description_neutral AS sentiment_neutral_t,
        f.finbert_description_negative AS sentiment_negative_t,
        f.finbert_description_label AS sentiment_label_t  -- Just default to NEUTRAL
    FROM Test.Articles_Trading_Day a
    JOIN Test.finbert_analysis f ON a.guid = f.guid
)
--select * from article_sentiment;
--select count(*) from (
SELECT distinct -- sometimes we have dupes...I think we have dupes upstream but oh whale
s.guid,
    v.date_t,
    m.ticker,
    c.subindustry,
    v.vix_t,
    v.vix_t_7_past,
    v.vix_t_7_future,
    m.price_t,
    m.price_t_7_past,
    m.price_change_t_7,
    m.volume_t,
    m.volume_t_7_past,
    m.volume_change_t_7,
    coalesce(s.sentiment_label_t, 'NEUTRAL') as sentiment_label_t,
    coalesce(s.sentiment_positive_t, 0) as sentiment_positive_t,
    coalesce(s.sentiment_neutral_t, 1) as sentiment_neutral_t,
    coalesce(s.sentiment_negative_t, 0) as sentiment_negative_t
FROM vix_lagged v
JOIN market_lagged m ON v.date_t = m.date_t
LEFT JOIN article_sentiment s ON v.date_t = s.date_t AND m.ticker = s.ticker
LEFT JOIN sp500.company_info c ON m.ticker = c.ticker  -- NEW JOIN
order by v.date_t desc) as insert_query;
""")

In [None]:
# Close the connection used for the training-data section.
con.close()