# *THIS IS TO CREATE THE DB*
You can download the prebuilt database file `financial_news.db` from SharePoint:

[practicum folder](https://gtvault-my.sharepoint.com/:f:/g/personal/ltupac3_gatech_edu/Eg2gLDzQ8H1JoWUrUIq1G04BPkOXMyxmhgcoL84Q58-5dg?e=80dziH)

[db file](https://gtvault-my.sharepoint.com/:u:/g/personal/ltupac3_gatech_edu/Edi6YX6MKPxMud1e5maTIjsBo04ISTst1j7uoxeSVH2OBA?e=XQD3Ed)

In [2]:
import duckdb
import pandas as pd 
import os 
import xml.etree.ElementTree as ET

# Code to create database
Instructions:
1. For individual files (company_info_news.txt, volume_news.csv, etc.), set the corresponding variable below to the file's relative path.
2. For headline data, set the variable to the relative path of the folder that contains the ticker folders.
    - Example: MultiCap_News/HEADLINES contains the individual ticker folders.
    - The code recursively picks up the files from there.

In [50]:
# Open (or create) the on-disk DuckDB database that the cells below write to.
con = duckdb.connect("financial_news.db")

# Input locations, relative to the notebook's working directory.
# os.path.join is used instead of hard-coded '\\' separators so the same
# paths resolve on Windows as well as on macOS/Linux.
company_txt_path = os.path.join('MultiCap_News', 'company_info_news.txt')
# volume_news_path = os.path.join('MultiCap_News', 'volume_news.csv')
volume_news_path = os.path.join('Headlines_Data', 'volume.csv')
# pricing_news_path = os.path.join('MultiCap_News', 'pricing_news.csv')
pricing_news_path = os.path.join('Headlines_Data', 'pricing.csv')
# Folders that house one sub-folder per ticker (walked recursively by xml_loader).
multicap_headlines = os.path.join('MultiCap_News', 'HEADLINES')
headline_august24_path = 'HEADLINES_August24'

In [51]:
# Create the Headlines schema, its tables and (optionally) indexes.
#
# Execution order at the bottom of this cell:
#   1. make sure the schema exists,
#   2. run the active DROP statements (tables that should be rebuilt),
#   3. run every CREATE ... IF NOT EXISTS statement,
#   4. run any active CREATE INDEX statements.
ddl_statements = [
    "CREATE SCHEMA IF NOT EXISTS Headlines;",
    """
    CREATE TABLE IF NOT EXISTS headlines.Articles (
        guid TEXT,
        ticker TEXT,
        description TEXT,
        article_link TEXT,
        article_pubDate TIMESTAMP,
        article_title TEXT,
        language TEXT,
        lastBuildDate TIMESTAMP,
        link TEXT,
        title TEXT,
        PRIMARY KEY (guid, ticker)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Company_Info_News (
        ticker TEXT PRIMARY KEY,
        name TEXT,
        subindustry TEXT
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Pricing_News (
        trading_day_date DATE,
        ticker TEXT,
        price FLOAT,
        PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Volume_News (
        trading_day_date DATE,
        ticker TEXT,
        volume INT,
        PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    ############ Gold Layer ############
    """
    CREATE TABLE IF NOT EXISTS headlines.Market_Data_Daily_Processing (
    trading_day_date DATE,
    ticker TEXT,
    price FLOAT,
    volume INT,
    PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Market_Data_Headlines (
    trading_day_date DATE,
    ticker TEXT,
    price FLOAT,
    volume INT,
    headline_count INT,
    PRIMARY KEY (trading_day_date, ticker)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Trading_Calendar (
    trading_date DATE PRIMARY KEY
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Articles_Trading_Day (
    guid TEXT,
    ticker TEXT,
    mapped_trading_date DATE,
    description TEXT,
    article_link TEXT,
    article_pubDate TIMESTAMP,
    article_title TEXT,
    language TEXT,
    lastBuildDate TIMESTAMP,
    link TEXT,
    title TEXT,
    PRIMARY KEY (guid, ticker)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Market_Article_Summary (
    trading_date DATE PRIMARY KEY,
    article_count INT
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Daily_Price_Movement (
    trading_date DATE,
    ticker TEXT,
    close_price FLOAT,
    next_trading_day DATE,
    close_price_next FLOAT,
    price_change FLOAT,
    price_change_percentage FLOAT,
    PRIMARY KEY (trading_date, ticker),
    FOREIGN KEY (trading_date) REFERENCES headlines.trading_calendar(trading_date),
    FOREIGN KEY (next_trading_day) REFERENCES headlines.trading_calendar(trading_date)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.Weekly_Price_Movement (
    trading_week_start DATE,
    ticker TEXT,
    close_price_start FLOAT,
    trading_week_end DATE,
    close_price_end FLOAT,
    price_change FLOAT,
    price_change_percentage FLOAT,
    PRIMARY KEY (trading_week_start, ticker),
    FOREIGN KEY (trading_week_start) REFERENCES headlines.trading_calendar(trading_date),
    FOREIGN KEY (trading_week_end) REFERENCES headlines.trading_calendar(trading_date)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.extreme_price_movements (
    trading_date DATE,
    ticker TEXT,
    close_price FLOAT,
    price_change FLOAT,
    price_change_percentage FLOAT,
    movement_type TEXT,  -- Drop|Surge
    PRIMARY KEY (trading_date, ticker)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS headlines.articles_extreme_drops (
    trading_date DATE,
    ticker TEXT,
    guid TEXT,
    mapped_trading_date DATE,
    title_sentiment_score FLOAT,
    title_sentiment_label TEXT,
    description_sentiment_score FLOAT,
    description_sentiment_label TEXT,
    PRIMARY KEY (trading_date, ticker, guid)
    );
    """
]

# Only the uncommented tables are dropped and rebuilt on each run; uncomment a
# line to force a rebuild of that table.
drop_statements = [
    # "DROP TABLE IF EXISTS headlines.Articles;",
    # "DROP TABLE IF EXISTS headlines.Company_Info_News;",
    # "DROP TABLE IF EXISTS headlines.Pricing_News;",
    # "DROP TABLE IF EXISTS headlines.Volume_News;",
    # "DROP TABLE IF EXISTS headlines.Market_Data_Daily_Processing;",
    # "DROP TABLE IF EXISTS headlines.Market_Data_Headlines;",
    # "DROP TABLE IF EXISTS headlines.Trading_Calendar;",
    # "DROP TABLE IF EXISTS headlines.Articles_Trading_Day;",
    # "DROP TABLE IF EXISTS headlines.Market_Article_Summary;",
    # "DROP TABLE IF EXISTS headlines.Daily_Price_Movement;",
    # "DROP TABLE IF EXISTS headlines.Weekly_Price_Movement;",
    "DROP TABLE IF EXISTS headlines.extreme_price_movements;",
    "DROP TABLE IF EXISTS headlines.articles_extreme_drops;"
]

# All index definitions are currently disabled.
index_statements = [
    # "CREATE INDEX IF NOT EXISTS idx_articles_pubDate ON headlines.Articles (article_pubDate);",
    # "CREATE INDEX IF NOT EXISTS idx_articles_pubDate ON headlines.Articles_Trading_Day (article_pubDate);"
    # "CREATE INDEX IF NOT EXISTS idx_stock_movement_ticker ON headlines.Daily_Price_Movement (ticker);",
    # "CREATE INDEX IF NOT EXISTS idx_stock_movement_ticker ON headlines.Weekly_Price_Movement (ticker);"
]

# The DROP statements are schema-qualified, so make sure the schema exists
# before they run (relevant on a brand-new database file). The statement is
# idempotent, so it is harmless that ddl_statements repeats it.
con.execute("CREATE SCHEMA IF NOT EXISTS Headlines;")

for drop in drop_statements:
    con.execute(drop)

for ddl in ddl_statements:
    con.execute(ddl)

for index in index_statements:
    con.execute(index)


# Load Company Info News

In [4]:
# The whole file sits on one physical line; records are separated by the
# two-character sequence backslash + 'n' (not a real newline), hence '\\n'.
with open(company_txt_path, 'r') as file:
    lines = file.readline().split('\\n')
    # con.execute("TRUNCATE Company_Info_News")
    # lines[0] is skipped (presumably the header row).
    for line in lines[1:]:
        line = line.strip()
        if not line:
            # A trailing separator leaves an empty segment, which would bind
            # one value to three placeholders and fail.
            continue
        # ON CONFLICT makes an accidental second run a no-op instead of a
        # primary-key error on ticker.
        con.execute(
            "INSERT INTO headlines.Company_Info_News VALUES (?,?,?) ON CONFLICT (ticker) DO NOTHING",
            line.split('|'),
        )

# Load `Volume_News` 

In [18]:
# Read the wide volume sheet: a 'Date' column plus one column per ticker.
df = pd.read_csv(volume_news_path)

# Reshape wide -> long so each row is one (Date, Ticker, Volume) observation.
volume_long_df = pd.melt(df, id_vars=['Date'], var_name='Ticker', value_name='Volume')

# Coerce types; volumes that are not numeric become NaN.
volume_long_df = volume_long_df.assign(
    Date=pd.to_datetime(volume_long_df['Date']),
    Volume=pd.to_numeric(volume_long_df['Volume'], errors='coerce'),
)

# con.execute("TRUNCATE Volume_News")
# DuckDB reads the DataFrame directly by its Python variable name.
con.execute("INSERT INTO headlines.Volume_News (trading_day_date, ticker, Volume) SELECT Date, ticker, Volume FROM volume_long_df")

<duckdb.duckdb.DuckDBPyConnection at 0x27dd32c67b0>

# Load `Pricing_News`

In [19]:
# Read the wide pricing sheet: a 'Date' column plus one column per ticker.
df = pd.read_csv(pricing_news_path)

# Reshape wide -> long so each row is one (Date, Ticker, Price) observation.
pricing_long_df = pd.melt(df, id_vars=['Date'], var_name='Ticker', value_name='Price')

# Coerce types; prices that are not numeric become NaN.
pricing_long_df = pricing_long_df.assign(
    Date=pd.to_datetime(pricing_long_df['Date']),
    Price=pd.to_numeric(pricing_long_df['Price'], errors='coerce'),
)

# con.execute("TRUNCATE Pricing_News")
# DuckDB reads the DataFrame directly by its Python variable name.
con.execute("INSERT INTO headlines.Pricing_News (trading_day_date, Ticker, price) SELECT Date, ticker, Price FROM pricing_long_df")

<duckdb.duckdb.DuckDBPyConnection at 0x27dd32c67b0>

# Load `Market_Data_Daily_Processing`

In [20]:
# con.execute("TRUNCATE Market_Data_Daily_Processing")
# Combine price and volume into one table. Every Pricing_News row is kept
# (LEFT JOIN); volume is NULL when Volume_News has no row for the same
# (trading_day_date, ticker).
# NOTE: nothing here truncates the target or handles conflicts, so re-running
# the cell on a populated table collides with its
# (trading_day_date, ticker) primary key.
con.execute("""
INSERT INTO headlines.Market_Data_Daily_Processing
SELECT 
    pn.trading_day_date,
    pn.ticker,
    pn.price,
    vn.volume
FROM 
    headlines.Pricing_News pn
LEFT JOIN 
    headlines.Volume_News vn 
ON 
    pn.trading_day_date = vn.trading_day_date AND pn.ticker = vn.ticker
""")

<duckdb.duckdb.DuckDBPyConnection at 0x27dd32c67b0>

# Load `Trading_Calendar`

In [21]:
# Earlier pandas-based version, kept for reference:
# pricing_dates = pricing_long_df['Date'].drop_duplicates()
# volume_dates = volume_long_df['Date'].drop_duplicates()

# trading_dates = pd.concat([pricing_dates, volume_dates]).drop_duplicates().sort_values()

# # make sure to have correct col name
# trading_dates_df = pd.DataFrame(trading_dates, columns=['trading_date'])
# con.execute("TRUNCATE Trading_Calendar")

# Current version: the calendar is every distinct date that appears in either
# Pricing_News or Volume_News, computed directly in SQL.
# NOTE: the target is not truncated first, so a re-run collides with the
# trading_date primary key.
con.execute(
"""
INSERT INTO headlines.Trading_Calendar
SELECT DISTINCT trading_day_date AS trading_date
FROM (
    SELECT trading_day_date FROM headlines.Pricing_News
    UNION
    SELECT trading_day_date FROM headlines.Volume_News
) AS all_dates
ORDER BY trading_date;
"""
)

<duckdb.duckdb.DuckDBPyConnection at 0x27dd32c67b0>

# Load `Articles`

In [12]:
def xml_loader(base_dir):
    """Load every RSS/XML headline file under ``base_dir`` into ``headlines.Articles``.

    The tree is walked with ``os.walk``; the name of the folder a file sits in
    is used as the ticker. For each folder, every ``<item>`` under ``<channel>``
    is collected together with the channel-level metadata, dates are coerced
    with ``pd.to_datetime``, and the rows whose ``pubDate`` parsed are inserted
    through the module-level ``con`` (one insert per folder).

    Files that cannot be parsed are reported with ``print`` and skipped.

    Args:
        base_dir: Folder that houses the per-ticker folders.

    Returns:
        DataFrame of the rows whose ``pubDate`` could not be parsed. The same
        rows are written to ``failed_article_dates.csv`` in the working
        directory, overwriting any previous file of that name.
    """
    # Insert in chunks (one per folder) instead of one giant frame.
    failed_parses = pd.DataFrame()

    for root, _dirs, files in os.walk(base_dir):
        data = []
        # extract ticker from foldername
        ticker = os.path.basename(root)

        for file in files:
            file_path = os.path.join(root, file)

            try:
                tree = ET.parse(file_path)
                root_element = tree.getroot()

                channel = root_element.find('channel')
                if channel is not None:
                    # channel-level metadata, repeated on every article row
                    language = channel.findtext("language")
                    lastBuildDate = channel.findtext("lastBuildDate")
                    link = channel.findtext("link")
                    title = channel.findtext("title")

                    # one row per <item>
                    for item in channel.findall("item"):
                        data.append({
                            "guid": item.findtext("guid"),
                            "ticker": ticker,
                            "description": item.findtext("description"),
                            "article_link": item.findtext("link"),
                            "article_pubDate": item.findtext("pubDate"),
                            "article_title": item.findtext("title"),
                            "language": language,
                            "lastBuildDate": lastBuildDate,
                            "link": link,
                            "title": title
                        })
            # The exception class is ET.ParseError; the previous spelling
            # (ET.parseError) does not exist, so any malformed file raised
            # AttributeError while matching the handler and aborted the load.
            except ET.ParseError as e:
                print(f"Error parsing file {file_path}: {e}")
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")

        # insert the data into the database
        if data:
            df = pd.DataFrame(data)
            df['parsed_date'] = pd.to_datetime(df['article_pubDate'], errors='coerce')
            df['lastBuildDate'] = pd.to_datetime(df['lastBuildDate'], errors='coerce')

            # separate failed cases to avoid nulls
            current_failed = df[df['parsed_date'].isna()]
            current_valid = df[df['parsed_date'].notna()]

            # Replace the raw pubDate text with the parsed value; dropping the
            # helper column restores the column order of headlines.Articles,
            # which the positional "SELECT *" insert below relies on.
            current_valid = current_valid.assign(article_pubDate=current_valid['parsed_date']).drop(columns=['parsed_date'])

            # remove dupes on guid and ticker (the table's primary key)
            current_valid = current_valid.drop_duplicates(subset=['guid', 'ticker'])

            failed_parses = pd.concat([failed_parses, current_failed], ignore_index=True)

            try:
                # ON CONFLICT guards against keys that are already in the table.
                # DuckDB resolves `current_valid` from this function's local
                # variables, so the name must not change.
                con.execute("INSERT INTO headlines.Articles SELECT * FROM current_valid ON CONFLICT (guid, ticker) DO NOTHING")
                print("inserted data for", ticker)
            except Exception as e:
                print(f"Error inserting data for {ticker}: {e}")

    failed_parses.to_csv("failed_article_dates.csv", index=False)
    return failed_parses

In [13]:
# GETTING NULLS! gonna fix the coercion logic
# con.execute("truncate Articles")

# Each call returns the rows whose pubDate could not be parsed.
# NOTE: xml_loader always writes to the same "failed_article_dates.csv", so the
# second call overwrites the file produced by the first; the returned frames
# (failed_df / failed2_df) are the only complete record of both runs.

# load multicap headlines
failed_df = xml_loader(multicap_headlines)
# load new headlines
failed2_df = xml_loader(headline_august24_path)

inserted data for A
inserted data for AA
inserted data for AAL
inserted data for AAON
inserted data for AAP
inserted data for AAPL
inserted data for ABBV
inserted data for ABG
inserted data for ABNB
inserted data for ABT
inserted data for ACA
inserted data for ACAD
inserted data for ACGL
inserted data for ACHC
inserted data for ACI
inserted data for ACIW
inserted data for ACLS
inserted data for ACLX
inserted data for ACM
inserted data for ACN
inserted data for ACT
inserted data for ADBE
inserted data for ADC
inserted data for ADI
inserted data for ADM
inserted data for ADP
inserted data for ADSK
inserted data for ADT
inserted data for AEE
inserted data for AEIS
inserted data for AEL
inserted data for AEO
inserted data for AEP
inserted data for AER
inserted data for AES
inserted data for AFG
inserted data for AFL
inserted data for AFRM
inserted data for AGCO
inserted data for AGNC
inserted data for AGO
inserted data for AGR
inserted data for AI
inserted data for AIG
inserted data for AI

  df['parsed_date'] = pd.to_datetime(df['article_pubDate'], errors='coerce')


inserted data for Z
inserted data for ZBH
inserted data for ZBRA
inserted data for ZG
inserted data for ZI
inserted data for ZION
inserted data for ZM
inserted data for ZS
inserted data for ZTS
inserted data for ZWS
inserted data for A
inserted data for AA
inserted data for AAL
inserted data for AAON
inserted data for AAP
inserted data for AAPL
inserted data for ABBV
inserted data for ABCB
inserted data for ABG
inserted data for ABNB
inserted data for ABT
inserted data for ACA
inserted data for ACAD
inserted data for ACGL
inserted data for ACHC
inserted data for ACI
inserted data for ACIW
inserted data for ACLS
inserted data for ACLX
inserted data for ACM
inserted data for ACN
inserted data for ACT
inserted data for ADBE
inserted data for ADC
inserted data for ADI
inserted data for ADM
inserted data for ADMA
inserted data for ADP
inserted data for ADSK
inserted data for ADT
inserted data for AEE
inserted data for AEIS
inserted data for AEO
inserted data for AEP
inserted data for AER
in

In [14]:
failed_df.head()

Unnamed: 0,guid,ticker,description,article_link,article_pubDate,article_title,language,lastBuildDate,link,title,parsed_date
0,2d7d8468-a424-37c5-bd5a-b8bdb3574707,AAL,"NORTHAMPTON, MA / ACCESSWIRE / April 30, 2024 ...",https://finance.yahoo.com/news/ve-got-back-mee...,"Tue, 30 Apr 2024 14:45:00 +0000",Theyâve Got Your Back: Meet Americanâs Sys...,en-US,2024-05-04 13:00:58+00:00,http://finance.yahoo.com/q/h?s=AAL,Yahoo! Finance: AAL News,NaT
1,628e541a-a3ce-3f16-8e7d-10c27b5e76cd,AAL,Want AAdvantage Platinum status? It isn't abou...,https://www.fool.com/the-ascent/credit-cards/a...,"Sun, 28 Apr 2024 14:30:11 +0000",How Much Do You Need to Fly to Earn American A...,en-US,2024-05-04 13:00:58+00:00,http://finance.yahoo.com/q/h?s=AAL,Yahoo! Finance: AAL News,NaT
2,570c87a0-a8c8-331e-837f-b8477e62866b,AAL,"It's not how much you fly, it's how much you s...",https://www.fool.com/the-ascent/credit-cards/a...,"Sat, 27 Apr 2024 14:30:12 +0000",How Much Do You Need to Fly to Earn American A...,en-US,2024-05-04 13:00:58+00:00,http://finance.yahoo.com/q/h?s=AAL,Yahoo! Finance: AAL News,NaT
3,2b1b6b71-b2b5-3268-8cf8-f6b716e8d915,AAL,American Airlines Group Inc. ( NASDAQ:AAL ) la...,https://finance.yahoo.com/news/american-airlin...,"Sat, 27 Apr 2024 12:25:19 +0000",American Airlines Group Inc. (NASDAQ:AAL) Just...,en-US,2024-05-04 13:00:58+00:00,http://finance.yahoo.com/q/h?s=AAL,Yahoo! Finance: AAL News,NaT
4,76a935dc-e538-31d2-84c0-4b0d36787ad2,AAL,American Airlines Group Inc. (NASDAQ:AAL) Q1 2...,https://finance.yahoo.com/news/american-airlin...,"Sat, 27 Apr 2024 12:24:23 +0000",American Airlines Group Inc. (NASDAQ:AAL) Q1 2...,en-US,2024-05-04 13:00:58+00:00,http://finance.yahoo.com/q/h?s=AAL,Yahoo! Finance: AAL News,NaT


In [15]:
# Second attempt for the rows xml_loader could not date-parse: coerce the raw
# pubDate text again, drop the helper column so the frame has the same columns
# as headlines.Articles, and de-duplicate on the table's primary key.
failed_df = (
    failed_df
    .assign(article_pubDate=pd.to_datetime(failed_df['article_pubDate'], errors='coerce'))
    .drop(columns=['parsed_date'])
    .drop_duplicates(subset=['guid', 'ticker'])
)
try:
    # ON CONFLICT skips keys that are already in the table.
    con.execute("INSERT INTO headlines.Articles SELECT * FROM failed_df ON CONFLICT (guid, ticker) DO NOTHING")
except Exception as e:
    print(f"Error inserting data: {e}")

In [16]:
# Second attempt for the August-24 rows xml_loader could not date-parse: coerce
# the raw pubDate text again, drop the helper column so the frame has the same
# columns as headlines.Articles, and de-duplicate on the table's primary key.
failed2_df = (
    failed2_df
    .assign(article_pubDate=pd.to_datetime(failed2_df['article_pubDate'], errors='coerce'))
    .drop(columns=['parsed_date'])
    .drop_duplicates(subset=['guid', 'ticker'])
)
try:
    # ON CONFLICT skips keys that are already in the table.
    con.execute("INSERT INTO headlines.Articles SELECT * FROM failed2_df ON CONFLICT (guid, ticker) DO NOTHING")
except Exception as e:
    print(f"Error inserting data: {e}")

# Load `Articles_Trading_Day`

In [4]:
# Rebuild Articles_Trading_Day from scratch: each article is mapped to the
# first Trading_Calendar date on or after its "adjusted" publication date.
#   - adjusted date = publication date, pushed to the next calendar day when
#     the time-of-day is 16:00:00 or later;
#   - mapped_trading_date = MIN(trading_date >= adjusted date); if the calendar
#     has no such date, it falls back to the raw publication date.
# NOTE(review): the cutoff is applied to article_pubDate exactly as stored. The
# SQL comment calls it "4 PM EST", while the sample feed dates shown in this
# notebook carry a +0000 offset - confirm which timezone the column holds.
# NOTE: the range join pairs every article with all later calendar dates before
# aggregating, which can be expensive on large tables.
con.execute("truncate headlines.Articles_Trading_Day")
con.execute("""
INSERT INTO headlines.Articles_Trading_Day
SELECT 
    a.guid,
    a.ticker,
    coalesce(MIN(tc.trading_date), cast(a.article_pubDate as Date)) AS mapped_trading_date,
    a.description,
    a.article_link,
    a.article_pubDate,
    a.article_title,
    a.language,
    a.lastBuildDate,
    a.link,
    a.title
FROM (
    SELECT 
        guid,
        ticker,
        description,
        article_link,
        article_pubDate,
        article_title,
        language,
        lastBuildDate,
        link,
        title,
        -- 4 PM EST adjust
        CASE 
            WHEN CAST(article_pubDate AS TIME) >= '16:00:00' 
            THEN CAST(article_pubDate AS DATE) + INTERVAL '1 day'
            ELSE CAST(article_pubDate AS DATE)
        END AS adjusted_pub_date
    FROM headlines.Articles
) a
LEFT JOIN 
    headlines.Trading_Calendar tc
ON 
    tc.trading_date >= a.adjusted_pub_date
GROUP BY 
    a.guid, a.ticker, a.description, a.article_link, a.article_pubDate, 
    a.article_title, a.language, a.lastBuildDate, a.link, a.title;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1fa0bd7a070>

# Create `Market_Data_Headlines`

In [23]:
# con.execute("Truncate Market_Data_Headlines")
# Attach a headline count to every (trading_day_date, ticker) market-data row:
# the number of distinct article guids mapped to that ticker and trading day.
# The LEFT JOIN keeps days without articles.
# NOTE: the target is not truncated here, so a re-run collides with the
# (trading_day_date, ticker) primary key.
con.execute("""
INSERT INTO headlines.Market_Data_Headlines
SELECT 
    md.trading_day_date,
    md.ticker,
    md.price,
    md.volume,
    COALESCE(COUNT(DISTINCT atd.guid), 0) AS headline_count
FROM 
    headlines.Market_Data_Daily_Processing md
LEFT JOIN 
    headlines.Articles_Trading_Day atd
ON 
    md.ticker = atd.ticker AND md.trading_day_date = atd.mapped_trading_date
GROUP BY 
    md.trading_day_date, md.ticker, md.price, md.volume;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x27dd32c67b0>

# Load `market_article_summary`

In [24]:
# Manual rebuild of the table, kept for reference:
# con.execute("drop table headlines.Market_Article_Summary")
# con.execute("""
#                 CREATE TABLE IF NOT EXISTS headlines.Market_Article_Summary (
#     trading_date DATE PRIMARY KEY,
#     article_count INT
#     );

#             """)

# Market-wide summary: number of distinct article guids per mapped trading
# date, across all tickers. The SELECT alias (total_unique_articles) differs
# from the table column (article_count); the insert is positional, so the
# count lands in article_count.
# NOTE: the target is not truncated here, so a re-run collides with the
# trading_date primary key.
con.execute('''
INSERT INTO headlines.Market_Article_Summary
SELECT 
    atd.mapped_trading_date AS trading_date,
    COUNT(DISTINCT atd.guid) AS total_unique_articles
FROM 
    headlines.Articles_Trading_Day atd
GROUP BY 
    atd.mapped_trading_date;
''')

<duckdb.duckdb.DuckDBPyConnection at 0x27dd32c67b0>

# Load `Daily_Price_Movement`

In [18]:
# Rebuild Daily_Price_Movement: for every (ticker, trading day) row, pair the
# price with the same ticker's price on its next available trading day (the
# smallest later trading_day_date for that ticker) and store the absolute and
# percentage change, both rounded to 2 decimals.
# The LEFT JOIN keeps each ticker's last day, with NULL "next" columns.
con.execute("truncate headlines.Daily_Price_Movement")
con.execute("""
            INSERT INTO headlines.Daily_Price_Movement
            SELECT 
                sp1.trading_day_date AS trading_date,
                sp1.ticker,
                sp1.price AS close_price,
                sp2.trading_day_date AS next_trading_day,
                sp2.price AS close_price_next,
                ROUND(sp2.price - sp1.price, 2) AS price_change,
                ROUND((sp2.price - sp1.price) / sp1.price * 100, 2) AS price_change_percentage
            FROM headlines.market_data_daily_processing sp1
            LEFT JOIN headlines.market_data_daily_processing sp2 
            ON sp2.ticker = sp1.ticker 
            AND sp2.trading_day_date = (
                SELECT MIN(sp3.trading_day_date) 
                FROM headlines.market_data_daily_processing sp3
                WHERE sp3.ticker = sp1.ticker
                AND sp3.trading_day_date > sp1.trading_day_date
            );
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1fa2d5835b0>

# Load `Weekly_Price_Movement`

In [20]:
# Rebuild Weekly_Price_Movement.
#   WeeklyPrices: per ticker and calendar week, the first and last weekday
#                 trading dates present in the data.
#   StartPrices / EndPrices: the ticker's price on those two dates.
#   Final SELECT: joins start and end rows for the same ticker and computes the
#                 absolute and percentage change, rounded to 2 decimals.
# NOTE: the final join requires the week start to be exactly 4 days before the
# week end, so only weeks whose first and last trading dates are four days
# apart (e.g. a full Monday-to-Friday span) produce a row; shortened weeks are
# dropped.
con.execute("truncate headlines.Weekly_Price_Movement")
con.execute("""
            INSERT INTO headlines.Weekly_Price_Movement
            WITH WeeklyPrices AS (
            SELECT 
                ticker,
                MIN(tc.trading_date) AS trading_week_start,
                MAX(tc.trading_date) AS trading_week_end
            FROM headlines.market_data_daily_processing mdp
            JOIN headlines.trading_calendar tc 
            ON mdp.trading_day_date = tc.trading_date
            WHERE EXTRACT(DOW FROM tc.trading_date) BETWEEN 1 AND 5  -- Only weekdays
            GROUP BY ticker, DATE_TRUNC('week', tc.trading_date)
        ),
        StartPrices AS (
            SELECT 
                mdp.trading_day_date AS trading_week_start, 
                mdp.ticker, 
                mdp.price AS close_price
            FROM headlines.market_data_daily_processing mdp
            JOIN WeeklyPrices wp 
            ON mdp.ticker = wp.ticker 
            AND mdp.trading_day_date = wp.trading_week_start
        ),
        EndPrices AS (
            SELECT 
                mdp.trading_day_date AS trading_week_end, 
                mdp.ticker, 
                mdp.price AS close_price_end
            FROM headlines.market_data_daily_processing mdp
            JOIN WeeklyPrices wp 
            ON mdp.ticker = wp.ticker 
            AND mdp.trading_day_date = wp.trading_week_end
        )
        SELECT 
            sp.trading_week_start,
            sp.ticker,
            sp.close_price as close_price_start,
            ep.trading_week_end,
            ep.close_price_end,
            ROUND(ep.close_price_end - sp.close_price, 2) AS price_change,
            ROUND((ep.close_price_end - sp.close_price) / sp.close_price * 100, 2) AS price_change_percentage
        FROM StartPrices sp
        JOIN EndPrices ep 
        ON sp.ticker = ep.ticker 
        AND sp.trading_week_start = ep.trading_week_end - INTERVAL '4 days';
            """)

<duckdb.duckdb.DuckDBPyConnection at 0x1fa2d5835b0>

# Load `extreme_price_movements`

In [52]:
# Rebuild extreme_price_movements: keep only the daily movements whose
# percentage change is beyond +/-5 and label them 'Drop' (below -5) or
# 'Surge' (above 5). Rows with a NULL percentage (no next trading day) fail the
# WHERE filter and are excluded.
con.execute("truncate headlines.extreme_price_movements")
con.execute("""
            INSERT INTO headlines.extreme_price_movements
            SELECT trading_date, ticker, close_price, price_change, price_change_percentage,
                CASE 
                    WHEN price_change_percentage < -5 THEN 'Drop'
                    WHEN price_change_percentage > 5 THEN 'Surge'
                END AS movement_type
            FROM headlines.daily_price_movement
            WHERE ABS(price_change_percentage) > 5;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1bfe96580b0>

# Load `articles_extreme_drops`

In [53]:
# Rebuild articles_extreme_drops: for every extreme price movement, collect the
# same ticker's articles whose mapped trading date falls within the 3 days up
# to and including the movement date, with their FinBERT sentiment when
# available (LEFT JOIN, so articles without a sentiment row are kept).
# headlines.finbert_sentiment is not created in this notebook and must already
# exist in the database.
# NOTE(review): the query does not filter on movement_type, so despite the
# table name it also includes articles around 'Surge' movements - confirm this
# is intended.
con.execute("truncate headlines.articles_extreme_drops")
df = con.execute("""
            SELECT epm.trading_date, epm.ticker, a.guid, a.mapped_trading_date,
                fs.finbert_title_score AS title_sentiment_score,
                fs.finbert_title_label AS title_sentiment_label,
                fs.finbert_description_score AS description_sentiment_score,
                fs.finbert_description_label AS description_sentiment_label
            FROM headlines.extreme_price_movements epm
            JOIN headlines.articles_trading_day a
            ON epm.ticker = a.ticker
            AND a.mapped_trading_date BETWEEN epm.trading_date - INTERVAL '3 days' AND epm.trading_date
            LEFT JOIN headlines.finbert_sentiment fs
            ON a.guid = fs.guid
""").df()

# dedupe based on trading_date, ticker, guid (the target's primary key)
df.drop_duplicates(subset=['trading_date', 'ticker', 'guid'], inplace=True)

# Columns are listed explicitly (aliases above now match the table's column
# names) so the insert no longer depends on the frame's column order.
con.execute("""
            INSERT INTO headlines.articles_extreme_drops
            SELECT trading_date, ticker, guid, mapped_trading_date,
                title_sentiment_score, title_sentiment_label,
                description_sentiment_score, description_sentiment_label
            FROM df
            ON CONFLICT (trading_date, ticker, guid) DO NOTHING
""")

<duckdb.duckdb.DuckDBPyConnection at 0x1bfe96580b0>

In [54]:
con.close()

# GOT S&P mapped, I'll do it later

In [22]:
# Re-open the database (the connection was closed at the end of the Headlines
# section) and declare the SP500 input files.
con = duckdb.connect("financial_news.db")

# os.path.join is used instead of hard-coded '\\' separators so the same
# paths resolve on Windows as well as on macOS/Linux.
sp500_volume_weekly_path = os.path.join('SP500', 'volume.csv')
sp500_price_weekly_path = os.path.join('SP500', 'price.csv')
sp500_price_daily_path = os.path.join('SP500', 'price_daily.csv')
sp500_company_path = os.path.join('SP500', 'company_info_sp500.txt')
sp500_price_sp500_path = os.path.join('SP500', 'price_SP500.csv')
sp500_item1_path = os.path.join('SP500', 'sp500_item1_sec_filings_0.txt')
sp500_item1a_path = os.path.join('SP500', 'sp500_item1a_sec_filings_0.txt')
sp500_item7_path = os.path.join('SP500', 'sp500_item7_sec_filings_0.txt')

In [23]:
# Drop and recreate every SP500 table.
#
# Execution order at the bottom of this cell:
#   1. make sure the schema exists,
#   2. drop all SP500 tables,
#   3. run every CREATE ... IF NOT EXISTS statement.
drop_statements = [
    "DROP TABLE IF EXISTS SP500.Volume_Weekly;",
    "DROP TABLE IF EXISTS SP500.Price_Daily;",
    "DROP TABLE IF EXISTS SP500.Company_Info;",
    "DROP TABLE IF EXISTS SP500.Weekly_Market_Data;",
    "DROP TABLE IF EXISTS SP500.Price_Weekly;",
    "DROP TABLE IF EXISTS SP500.Price_Weekly_SP500;",
    "DROP TABLE IF EXISTS SP500.item7;",
    "DROP TABLE IF EXISTS SP500.item1a;",
    "DROP TABLE IF EXISTS SP500.item1;",
    "DROP TABLE IF EXISTS SP500.SEC_Item_Filings;"
]

ddl_statements = [
    "CREATE SCHEMA IF NOT EXISTS SP500;",
"""
CREATE TABLE IF NOT EXISTS SP500.Volume_Weekly (
    trading_week_date DATE NOT NULL,
    cik TEXT NOT NULL,
    volume FLOAT,
    PRIMARY KEY (trading_week_date, cik)
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.Price_Daily (
    trading_day_date DATE NOT NULL,
    cik TEXT NOT NULL,
    price FLOAT,
    PRIMARY KEY (trading_day_date, cik)
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.Company_Info (
    cik TEXT PRIMARY KEY,
    ticker TEXT,
    name TEXT,
    subindustry TEXT
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.Weekly_Market_Data (
    trading_week_date DATE NOT NULL,
    cik TEXT NOT NULL,
    price FLOAT NOT NULL,
    volume FLOAT NOT NULL,
    PRIMARY KEY (trading_week_date, cik)
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.Price_Weekly (
    trading_week_date DATE NOT NULL,
    cik TEXT NOT NULL,
    price FLOAT,
    PRIMARY KEY (trading_week_date, cik)
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.Price_Weekly_SP500 (
    trading_week_date DATE PRIMARY KEY,
    SP500CapWeighted FLOAT,
    SP500EqualWeighted FLOAT
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.item7 (
    company TEXT NOT NULL,
    filing_ts TIMESTAMP NOT NULL,
    link TEXT,
    type TEXT,
    cik TEXT,
    item7 TEXT,
    PRIMARY KEY (company, filing_ts)
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.item1a (
    company TEXT NOT NULL,
    filing_ts TIMESTAMP NOT NULL,
    link TEXT,
    type TEXT,
    cik TEXT,
    item1a TEXT,
    PRIMARY KEY (company, filing_ts)
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.item1 (
    company TEXT NOT NULL,
    filing_ts TIMESTAMP NOT NULL,
    link TEXT,
    type TEXT,
    cik TEXT,
    item1 TEXT,
    PRIMARY KEY (company, filing_ts)
);
""",
"""
CREATE TABLE IF NOT EXISTS SP500.SEC_Item_Filings (
    cik TEXT NOT NULL,
    filing_ts TIMESTAMP NOT NULL,
    item_filing TEXT NOT NULL,
    company TEXT,
    link TEXT,
    type TEXT,
    item_description TEXT,
    PRIMARY KEY (cik, filing_ts, item_filing)
);
"""
]

# The DROP statements are schema-qualified, so make sure the schema exists
# before they run (relevant on a brand-new database file). The statement is
# idempotent, so it is harmless that ddl_statements repeats it.
con.execute("CREATE SCHEMA IF NOT EXISTS SP500;")

for drop in drop_statements:
    con.execute(drop)

for ddl in ddl_statements:
    con.execute(ddl)

# Load `volume_weekly`

In [24]:
# Read the wide weekly-volume sheet: a 'Date' column plus one column per CIK.
df = pd.read_csv(sp500_volume_weekly_path)

# Reshape wide -> long so each row is one (Date, cik, Volume) observation.
# volume_long_df = wide_to_long(df, ['Date'], 'Volume', 'cik')
volume_long_df = pd.melt(df, id_vars=['Date'], var_name='cik', value_name='Volume')

# Coerce types; volumes that are not numeric become NaN.
volume_long_df = volume_long_df.assign(
    Date=pd.to_datetime(volume_long_df['Date']),
    Volume=pd.to_numeric(volume_long_df['Volume'], errors='coerce'),
)

# Missing volumes are deliberately left as null here (CIK 1534701 is entirely
# null); they are coalesced when the final table is built.
# volume_long_df['Volume'] = volume_long_df['Volume'].fillna(0)

# DuckDB reads the DataFrame directly by its Python variable name.
con.execute("INSERT INTO SP500.Volume_Weekly (trading_week_date, cik, volume) SELECT date, cik, volume FROM volume_long_df")

<duckdb.duckdb.DuckDBPyConnection at 0x21f97f4d870>

# Load `Price_Daily`

In [25]:
# Read the wide daily-price sheet: a 'Date' column plus one column per series.
df = pd.read_csv(sp500_price_daily_path)

# Reshape wide -> long so each row is one (Date, cik, Price) observation.
# Every non-Date column is melted, so any non-CIK columns in the sheet end up
# as values of `cik` too.
# price_long_df = wide_to_long(df, ['Date'], 'Price', 'cik')
price_long_df = pd.melt(df, id_vars=['Date'], var_name='cik', value_name='Price')

# Coerce types; prices that are not numeric become NaN.
price_long_df = price_long_df.assign(
    Date=pd.to_datetime(price_long_df['Date']),
    Price=pd.to_numeric(price_long_df['Price'], errors='coerce'),
)

# con.execute("TRUNCATE SP500.Price_Daily")
# DuckDB reads the DataFrame directly by its Python variable name.
con.execute("INSERT INTO SP500.Price_Daily (trading_day_date, cik, price) SELECT Date, cik, Price FROM price_long_df")

<duckdb.duckdb.DuckDBPyConnection at 0x21f97f4d870>

In [37]:
price_long_df

Unnamed: 0,Date,cik,Price
0,2000-01-07,1534701,
1,2000-01-14,1534701,
2,2000-01-21,1534701,
3,2000-01-28,1534701,
4,2000-02-04,1534701,
...,...,...,...
1122945,2023-03-10,814585,10.92
1122946,2023-03-17,814585,8.68
1122947,2023-03-24,814585,8.20
1122948,2023-03-31,814585,9.26


# Load `Price_Weekly`

<b>There appears to be an issue with this dataset. Once the office-hours recording is available I'll watch it and fix this; until then `Price_Weekly` is left unpopulated.</b>

In [26]:
df.head()

Unnamed: 0,Date,1534701,1341439,792985,1489393,86312,96289,1393612,40704,726513,...,75829,906345,84839,1122304,1526520,1013871,920760,814585,SP500CapWeighted,SP500EqualWeighted
0,2000-01-03,,29.53125,14.5,,33.0,54.0,,17.21875,52.5,...,20.5,26.8125,0.888889,,,,7.126582,34.458333,1455.219971,1414.1586
1,2000-01-04,,26.921875,14.8125,,32.5625,53.9375,,16.90625,51.125,...,20.1875,26.125,0.881573,,,,7.070687,32.75,1399.420044,1375.8559
2,2000-01-05,,25.5,15.625,,32.3125,56.3125,,16.8125,50.375,...,19.5,26.1875,0.877915,,,,7.098634,33.041667,1402.109985,1382.7999
3,2000-01-06,,24.0,16.75,,32.9375,51.375,,16.78125,50.5,...,20.8125,26.6875,0.885231,,,,7.098634,33.916667,1403.449951,1398.3869
4,2000-01-07,,25.84375,17.8125,,34.25,50.9375,,16.8125,50.5,...,21.0625,27.1875,0.892547,,,,7.238371,34.25,1441.469971,1429.7778


In [27]:
# Read the wide weekly-price sheet: a 'Date' column plus one column per series.
df = pd.read_csv(sp500_price_weekly_path)

# Reshape wide -> long so each row is one (Date, cik, Price) observation.
# price_long_df = wide_to_long(df, ['Date'], 'Price', 'cik')
price_long_df = pd.melt(df, id_vars=['Date'], var_name='cik', value_name='Price')

# Coerce types; prices that are not numeric become NaN.
price_long_df = price_long_df.assign(
    Date=pd.to_datetime(price_long_df['Date']),
    Price=pd.to_numeric(price_long_df['Price'], errors='coerce'),
)

# The insert is intentionally disabled, so SP500.Price_Weekly stays empty.
# con.execute("TRUNCATE SP500.Price_Weekly")
# con.execute("INSERT INTO SP500.Price_Weekly (trading_week_date, cik, price) SELECT Date, cik, price FROM price_long_df")

# Load `Company_Info`

In [28]:
# its all in one line
with open(sp500_company_path, 'r') as file:
    lines = file.readline().split('\\n')
    for line in lines[1:]:
        line = line.strip().split('|')
        # DONT RUN THIS TWICE BY MISTAKE!
        con.execute("INSERT INTO SP500.Company_Info VALUES (?,?,?,?)", line)

# Load `Weekly_Market_Data`

In [29]:
# Combine weekly price and volume per (trading_week_date, cik). Missing prices
# and volumes are replaced with 0 because both target columns are NOT NULL.
# NOTE: the query is driven by SP500.Price_Weekly, whose insert is currently
# commented out in this notebook; while that table is empty this statement
# inserts no rows.
con.execute("""
INSERT INTO SP500.Weekly_Market_Data
SELECT 
    pw.trading_week_date AS trading_week_date,
    pw.cik AS cik,
    coalesce(pw.price, 0) AS price,
    coalesce(vw.volume, 0) AS volume
FROM 
    SP500.Price_Weekly pw
LEFT JOIN 
    SP500.Volume_Weekly vw
ON 
    pw.trading_week_date = vw.trading_week_date AND pw.cik = vw.cik
""")

<duckdb.duckdb.DuckDBPyConnection at 0x21f97f4d870>

# Load `SP500.Price_Weekly_SP500`

In [30]:
# Load the S&P 500 index series (cap-weighted and equal-weighted).
df = pd.read_csv(sp500_price_sp500_path)

# Coerce types; index values that are not numeric become NaN.
df = df.assign(
    Date=pd.to_datetime(df['Date']),
    SP500CapWeighted=pd.to_numeric(df['SP500CapWeighted'], errors='coerce'),
    SP500EqualWeighted=pd.to_numeric(df['SP500EqualWeighted'], errors='coerce'),
)

# Replace the table contents; DuckDB reads `df` by its Python variable name.
con.execute("truncate SP500.Price_Weekly_SP500")
con.execute("INSERT INTO SP500.Price_Weekly_SP500 (trading_week_date,SP500CapWeighted, SP500EqualWeighted) SELECT Date, SP500CapWeighted, SP500EqualWeighted FROM df")

<duckdb.duckdb.DuckDBPyConnection at 0x21f97f4d870>

# Load `item7 `

Sometimes the last column spans multiple lines. Because it is the last column, the pipe delimiter can't be relied on to tell where a record ends, so the parser has to handle continuation lines explicitly.

In [31]:
# cols: company|date|link|type|cik|item7 
# con.execute("TRUNCATE SP500.item7")
def _iter_item_records(lines, n_fields=6):
    """Yield one list of fields per logical record from pipe-delimited lines.

    A physical line that splits into exactly ``n_fields`` pieces starts a new
    record. Any other line is treated as a continuation of the previous
    record's last field and is appended to it after a single space.

    Limitation: a continuation line that happens to contain exactly
    ``n_fields - 1`` pipes is indistinguishable from a new record.
    """
    pending = None
    for raw in lines:
        text = raw.strip()
        fields = text.split('|')
        if pending is None:
            # The first data line always opens the first record.
            pending = fields
        elif len(fields) == n_fields:
            # A full-width line means the previous record is complete.
            yield pending
            pending = fields
        else:
            pending[-1] += " " + text
    # Flush the final record (nothing to flush when there were no data lines).
    if pending is not None:
        yield pending


def parse_items(file_path, table_name, n_fields=6):
    """Load a pipe-delimited filing file into ``SP500.<table_name>``.

    The first line of the file is a header and is skipped. The last column may
    span several physical lines; those lines are merged back into one record
    before the row is inserted.

    Args:
        file_path: Path to the UTF-8 encoded, pipe-delimited text file.
        table_name: Table inside the ``SP500`` schema that receives the rows.
            It is interpolated into the SQL text, so pass trusted names only.
        n_fields: Number of columns per record (defaults to the 6 columns
            company|date|link|type|cik|item).

    Uses the notebook-level DuckDB connection ``con``.
    """
    placeholders = ",".join("?" * n_fields)
    insert_sql = f"INSERT INTO SP500.{table_name} VALUES ({placeholders})"

    with open(file_path, 'r', encoding='utf-8') as file:
        # Stream the file instead of readlines(): the filing text can be large.
        next(file, None)  # skip the header row
        for record in _iter_item_records(file, n_fields):
            con.execute(insert_sql, record)

In [32]:
# Empty the table first so re-running this cell does not duplicate rows,
# then reload it from the source file.
con.execute("TRUNCATE SP500.item7")
parse_items(sp500_item7_path, 'item7')

# Load `item1`

In [33]:
# Empty the table first so re-running this cell does not duplicate rows,
# then reload it from the source file.
con.execute("TRUNCATE SP500.item1")
parse_items(sp500_item1_path, 'item1')

# Load `item1a`

In [34]:
# Empty the table first so re-running this cell does not duplicate rows,
# then reload it from the source file.
con.execute("TRUNCATE SP500.item1a")
parse_items(sp500_item1a_path, 'item1a')

# Load `SEC_Item_Filings`

I think the only differences between the 3 item tables are the item filing number and the item description.

With that said, I believe it's better to use a long table than a wide table, for simplicity:
- long = one `item_filing` column and one `item_description` column, with one row per filing and item
- wide = instead of one `item_filing` and one `item_description` column, we make a column for each filing and description

In [35]:
# Stack the three per-item tables into the long SEC_Item_Filings table.
# Each SELECT tags its rows with a literal item_filing value ('7', '1a', '1')
# and exposes the item text under the shared name item_description.
# NOTE(review): no TRUNCATE precedes this INSERT, so re-running the cell
# attempts to insert the same rows again.
con.execute("""
INSERT INTO SP500.SEC_Item_Filings
SELECT 
    cik,
    filing_ts,
    '7' AS item_filing,
    company,
    link,
    type,
    item7 AS item_description
FROM 
    SP500.item7
UNION ALL
SELECT 
    cik,
    filing_ts,
    '1a' AS item_filing,
    company,
    link,
    type,
    item1a AS item_description
FROM 
    SP500.item1a
UNION ALL
SELECT 
    cik,
    filing_ts,
    '1' AS item_filing,
    company,
    link,
    type,
    item1 AS item_description
FROM 
    SP500.item1;
""")

<duckdb.duckdb.DuckDBPyConnection at 0x21f97f4d870>

In [36]:
# Close the connection now that the SP500 tables are loaded.
con.close()

# FinBERT Scores

In [27]:
# Open the database again for the FinBERT sentiment and token tables.
con = duckdb.connect("financial_news.db")
finbert_csv = "articles_with_all_finbert_scores.csv"
# NOTE: tokens.csv repeats guids; duplicates are dropped after the file is read.
tokens_csv = "tokens.csv"

In [28]:
# Tables are dropped and recreated on every run, so any previously loaded
# sentiment/token rows are discarded here.
drop = [
    f"DROP TABLE IF EXISTS headlines.{table};"
    for table in ("finbert_sentiment", "tokens_description", "tokens_title")
]

ddl = [
    # one row per (article guid, ticker) with FinBERT label/score for title and description
    """
CREATE TABLE IF NOT EXISTS headlines.finbert_sentiment (
    guid TEXT,
    description TEXT,
    article_title TEXT,
    ticker TEXT,
    finbert_title_label TEXT,
    finbert_title_score FLOAT,
    finbert_description_label TEXT,
    finbert_description_score FLOAT,
    PRIMARY KEY (guid, ticker)
);
""",
    # token counts for article descriptions
    """
CREATE TABLE IF NOT EXISTS headlines.tokens_description (
    guid TEXT,
    token TEXT,
    token_lemmatized TEXT,
    frequency INT,
    PRIMARY KEY (guid, token, token_lemmatized)
);
""",
    # token counts for article titles
    """
CREATE TABLE IF NOT EXISTS headlines.tokens_title (
    guid TEXT,
    token TEXT,
    token_lemmatized TEXT,
    frequency INT,
    PRIMARY KEY (guid, token, token_lemmatized)
);
""",
]

# Drops run first, then the CREATE statements.
for statement in drop + ddl:
    con.execute(statement)

# Load `finbert_sentiment`

In [29]:
# Keep only the columns of headlines.finbert_sentiment, in the table's column
# order (the load uses SELECT *, which maps columns by position).
finbert_columns = [
    'guid',
    'description',
    'article_title',
    'ticker',
    'finbert_title_label',
    'finbert_title_score',
    'finbert_description_label',
    'finbert_description_score',
]
df = pd.read_csv(finbert_csv)
filtered_df = df.loc[:, finbert_columns]
filtered_df.head()

Unnamed: 0,guid,description,article_title,ticker,finbert_title_label,finbert_title_score,finbert_description_label,finbert_description_score
0,76ceb11d-33eb-3af8-82f1-74e4068911f5,Agilent (A) adds a water immersion and confoca...,Agilent (A) Enhances BioTek Cytation C10 With ...,A,Positive,1.0,Positive,0.999999
1,56dc485e-c740-3fcc-ab3a-4e0d707a8f4d,"SANTA CLARA, Calif., December 07, 2023--Agilen...",Agilent Resolve Raman Receives Multiple Recogn...,A,Neutral,0.861646,Positive,0.99987
2,367bed80-8d07-3dce-8092-fd53d70578fe,"Artisan Partners, an investment management com...",Hereâs Why Artisan Partners Mid Cap Fund Har...,A,Neutral,0.999948,Neutral,0.846491
3,7bf92827-a505-3d56-98a3-4c9d60794e64,Generally speaking the aim of active stock pic...,Agilent Technologies' (NYSE:A) 14% CAGR outpac...,A,Positive,1.0,Positive,0.999604
4,8e5bdc52-73a9-30b1-ae97-493cd82da360,"SANTA CLARA, Calif., December 04, 2023--Agilen...",Agilent BioTek Cytation C10 Confocal Imaging R...,A,Positive,1.0,Positive,0.999978


In [30]:
# Empty the table, then bulk-insert the DataFrame. The SQL refers to the
# pandas DataFrame `filtered_df` by its Python variable name; SELECT * maps
# its columns to the table's columns by position.
con.execute("truncate headlines.finbert_sentiment")
con.execute("INSERT INTO headlines.finbert_sentiment select * from filtered_df")

<duckdb.duckdb.DuckDBPyConnection at 0x1bf7fd925b0>

# Load `tokens_description`

In [31]:
# tokens.csv repeats guids, so keep only the first row seen for each guid.
df = pd.read_csv(tokens_csv)
# .size is rows x columns (cell count), printed before and after de-duplication
print(df.size)

df = df.drop_duplicates(subset=['guid'])
print(df.size)
df.head()

1091460
842712


Unnamed: 0,guid,tokens_description,tokens_title
0,76ceb11d-33eb-3af8-82f1-74e4068911f5,"[('agilent', 'agilent'), ('adds', 'add'), ('wa...","[('agilent', 'agilent'), ('enhances', 'enhance..."
1,56dc485e-c740-3fcc-ab3a-4e0d707a8f4d,"[('santa', 'santa'), ('clara', 'clara'), ('cal...","[('agilent', 'agilent'), ('resolve', 'resolve'..."
2,367bed80-8d07-3dce-8092-fd53d70578fe,"[('artisan', 'artisan'), ('partners', 'partner...","[('artisan', 'artisan'), ('partners', 'partner..."
3,7bf92827-a505-3d56-98a3-4c9d60794e64,"[('generally', 'generally'), ('speaking', 'spe...","[('agilent', 'agilent'), ('technologies', 'tec..."
4,8e5bdc52-73a9-30b1-ae97-493cd82da360,"[('santa', 'santa'), ('clara', 'clara'), ('cal...","[('agilent', 'agilent'), ('biotek', 'biotek'),..."


In [32]:
import ast

# Flatten the stringified token lists into (guid, token, lemma) rows.
# Each cell holds the repr of a list of (token, lemma) tuples. It is parsed with
# ast.literal_eval, which accepts only Python literals, instead of eval(),
# which would execute any expression found in the CSV.
tokens_description_data = []
tokens_title_data = []

# zip over the three columns avoids building a Series per row (iterrows).
for guid, raw_description, raw_title in zip(df['guid'], df['tokens_description'], df['tokens_title']):
    tokens_description_data.extend(
        (guid, token, lemma) for token, lemma in ast.literal_eval(raw_description)
    )
    tokens_title_data.extend(
        (guid, token, lemma) for token, lemma in ast.literal_eval(raw_title)
    )

token_columns = ["guid", "token", "token_lemmatized"]
df_tokens_description = pd.DataFrame(tokens_description_data, columns=token_columns)
df_tokens_title = pd.DataFrame(tokens_title_data, columns=token_columns)

In [33]:
# Collapse repeated (guid, token, lemma) triples into one row each and store
# the number of occurrences in `frequency`.
# token_lemmatized stays in the key because lemmas depend on context: the same
# token can map to different lemmas within one article.
description_token_counts = df_tokens_description.groupby(
    ['guid', 'token', 'token_lemmatized']
).size()
df_tokens_description = description_token_counts.reset_index(name='frequency')
df_tokens_description.head()

Unnamed: 0,guid,token,token_lemmatized,frequency
0,000026c6-886f-3930-a121-e633a8456b07,attributes,attribute,1
1,000026c6-886f-3930-a121-e633a8456b07,could,could,1
2,000026c6-886f-3930-a121-e633a8456b07,exceptional,exceptional,1
3,000026c6-886f-3930-a121-e633a8456b07,growth,growth,1
4,000026c6-886f-3930-a121-e633a8456b07,produce,produce,1


In [34]:
# Spot check: look up the token 'spending' for one article to see which
# lemmas it was assigned.
is_article = df_tokens_description['guid'] == '000705ff-4abf-355e-bed9-aeb6733f92b3'
is_spending = df_tokens_description['token'] == 'spending'
df_tokens_description[is_article & is_spending].head()

Unnamed: 0,guid,token,token_lemmatized,frequency
522,000705ff-4abf-355e-bed9-aeb6733f92b3,spending,spend,1
523,000705ff-4abf-355e-bed9-aeb6733f92b3,spending,spending,1


In [35]:
# Empty the table, then bulk-insert the aggregated description tokens. The SQL
# refers to the pandas DataFrame `df_tokens_description` by its variable name;
# SELECT * maps its columns to the table's columns by position.
con.execute("truncate headlines.tokens_description")
con.execute("INSERT INTO headlines.tokens_description select * from df_tokens_description")

<duckdb.duckdb.DuckDBPyConnection at 0x1bf7fd925b0>

# Load `tokens_title`

In [36]:
# Count how many times each (guid, token, lemma) triple occurs in the titles
# and store the count in `frequency`.
title_token_counts = df_tokens_title.groupby(['guid', 'token', 'token_lemmatized']).size()
df_tokens_title = title_token_counts.reset_index(name='frequency')
df_tokens_title.head()

Unnamed: 0,guid,token,token_lemmatized,frequency
0,000026c6-886f-3930-a121-e633a8456b07,growth,growth,1
1,000026c6-886f-3930-a121-e633a8456b07,reasons,reason,1
2,000026c6-886f-3930-a121-e633a8456b07,roadhouse,roadhouse,1
3,000026c6-886f-3930-a121-e633a8456b07,solid,solid,1
4,000026c6-886f-3930-a121-e633a8456b07,stock,stock,1


In [37]:
# Empty the table, then bulk-insert the aggregated title tokens. The SQL refers
# to the pandas DataFrame `df_tokens_title` by its variable name; SELECT * maps
# its columns to the table's columns by position.
con.execute("truncate headlines.tokens_title")
con.execute("INSERT INTO headlines.tokens_title select * from df_tokens_title")

<duckdb.duckdb.DuckDBPyConnection at 0x1bf7fd925b0>

In [38]:
# Close the connection now that the sentiment and token tables are loaded.
con.close()

# 10k FinBERT

In [3]:
# Open the database again for the 10-K FinBERT scores.
con = duckdb.connect("financial_news.db")
# NOTE: this assignment points `finbert_csv` at the 10-K score file.
finbert_csv = "sp500_with_all_finbert_scores.csv"
# No token file is used in this section:
# # SO THIS ONE WILL HAVE DUPES! NEED TO DEDUPE!
# tokens_csv = "tokens.csv"

In [7]:
# The table is dropped and recreated on every run. Its key matches the
# (cik, filing_ts, item_filing) columns of sp500.SEC_Item_Filings, which it
# references through a foreign key.
drop = ["DROP TABLE IF EXISTS sp500.SEC_Item_Filings_FinBERT;"]

ddl = [
    """
    CREATE TABLE if not exists sp500.SEC_Item_Filings_FinBERT (
    cik TEXT,
    filing_ts TIMESTAMP,
    item_filing TEXT,
    finbert_description_label TEXT,
    finbert_description_score FLOAT,
    PRIMARY KEY (cik, filing_ts, item_filing),
    FOREIGN KEY (cik, filing_ts, item_filing) 
        REFERENCES sp500.SEC_Item_Filings (cik, filing_ts, item_filing) 
    );
    """
]

# Drops run first, then the CREATE statements.
for statement in drop + ddl:
    con.execute(statement)



# SEC_Item_Filings_FinBERT

In [11]:
# Read the FinBERT scores for the 10-K item filings.
# NOTE(review): pandas infers the dtypes here, while the target table declares
# cik as TEXT and filing_ts as TIMESTAMP -- confirm the values still match the
# keys stored in sp500.SEC_Item_Filings (e.g. no leading zeros lost from cik).
finbert_10k_df = pd.read_csv(finbert_csv)
finbert_10k_df.head()

Unnamed: 0,cik,filing_ts,item_filing,type,item_description,finbert_description,finbert_description_label,finbert_description_score
0,66740,2022-02-09 20:13:29,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.9999117851257324}",Neutral,0.999912
1,66740,2021-02-04 18:53:11,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.9999666213989258}",Neutral,0.999967
2,66740,2020-02-06 21:16:31,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.9999666213989258}",Neutral,0.999967
3,66740,2019-02-07 22:15:37,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.999957799911499}",Neutral,0.999958
4,66740,2018-02-08 22:14:52,7,10-K,Item 7. Managements Discussion and Analysis o...,"{'label': 'Neutral', 'score': 0.999957799911499}",Neutral,0.999958


In [12]:
# Keep only the columns of sp500.SEC_Item_Filings_FinBERT, in the table's
# column order (the load uses SELECT *, which maps columns by position).
finbert_10k_columns = [
    'cik',
    'filing_ts',
    'item_filing',
    'finbert_description_label',
    'finbert_description_score',
]
finbert_10k_df = finbert_10k_df.loc[:, finbert_10k_columns]

In [None]:
# Empty the table, then bulk-insert the DataFrame. The SQL refers to the pandas
# DataFrame `finbert_10k_df` by its variable name; SELECT * maps its columns to
# the table's columns by position.
con.execute("truncate sp500.SEC_Item_Filings_FinBERT")
con.execute("INSERT INTO sp500.SEC_Item_Filings_FinBERT select * from finbert_10k_df")

<duckdb.duckdb.DuckDBPyConnection at 0x23d20b8ebf0>

In [14]:
# Close the connection now that the 10-K FinBERT scores are loaded.
con.close()

# VIX DATASET

In [16]:
# Open the database again for the VIX table.
con = duckdb.connect("financial_news.db")
# Path to the VIX CSV file.
vix = "SP500/vixGaTechSP25.csv"

In [17]:
# The table is dropped and recreated on every run: one VIX value per date.
drop = ["DROP TABLE IF EXISTS sp500.VIX_Index;"]

ddl = [
    """
    CREATE TABLE sp500.VIX_Index (
    vix_date DATE PRIMARY KEY,
    vix_value FLOAT
);
    """
]

# The drop must run first: the CREATE has no IF NOT EXISTS.
for statement in drop + ddl:
    con.execute(statement)

# VIX

In [22]:
# Skip the file's own header row and substitute column names that match
# sp500.VIX_Index; the date column is parsed into datetimes on read.
vix_df = pd.read_csv(
    vix,
    skiprows=1,
    names=["vix_date", "vix_value"],
    parse_dates=["vix_date"],
)
vix_df.head()

Unnamed: 0,vix_date,vix_value
0,1986-01-02,18.07
1,1986-01-03,17.96
2,1986-01-06,17.05
3,1986-01-07,17.39
4,1986-01-08,19.97


In [23]:
# Sanity check before loading: count missing values in each column.
vix_df.isnull().sum()

vix_date     0
vix_value    0
dtype: int64

In [24]:
# Empty the table, then bulk-insert the DataFrame. The SQL refers to the pandas
# DataFrame `vix_df` by its variable name; SELECT * maps its columns to the
# table's columns by position.
con.execute("Truncate sp500.VIX_Index")
con.execute("INSERT INTO sp500.VIX_Index select * from vix_df")

<duckdb.duckdb.DuckDBPyConnection at 0x23d430d31f0>

In [25]:
# Close the connection now that the VIX table is loaded.
con.close()