In [None]:
### Please READ ###

# This notebook is only meant for testing that the pickle-reading function works and that raw data can be inserted into the database. It is not meant for production use; run `python backfill_db.py` instead, as that is the intended pipeline.

In [None]:
import pandas as pd
import os
import sys
import psycopg2

sys.path.append(os.path.abspath(os.path.join('.')))
from db.utils import DatabaseConnector


In [None]:
# Read the cleaned trades export and discard the 'Unnamed: 0' column
# (presumably the index written out when the CSV was saved - confirm).
cleaned_trades_df = pd.read_csv("cleaned_trades.csv").drop(columns=['Unnamed: 0'])

# PostgreSQL connection settings: each value comes from the environment and
# falls back to the listed default when the variable is unset. The variable
# names are expected to match the ones declared in docker-compose.yml.
postgres_user, postgres_password, postgres_host, postgres_port, postgres_db = (
    os.getenv(env_name, fallback)
    for env_name, fallback in (
        ('POSTGRES_USER', 'myuser'),
        ('POSTGRES_PASSWORD', 'mypassword'),
        ('POSTGRES_HOST', 'localhost'),
        ('POSTGRES_PORT', '5432'),
        ('POSTGRES_DB', 'mydb'),
    )
)


In [None]:
# Relabel every column by position with the snake_case names below.
# The assignment is positional, so the order here must match the column order
# of the loaded frame; pandas raises if the number of names does not match.
trade_column_names = [
    'politician',
    'party',
    'traded_company_name',
    'traded_company_ticker',
    'trade_filed_date',
    'trade_owner',
    'trade_type',
    'trade_size',
    'stock_price',
    'published_datetime',
    'traded_datetime',
    'trade_id',
]
cleaned_trades_df.columns = trade_column_names


In [None]:
def _first_three_tokens(value):
    """Return the first three whitespace-separated tokens of ``value``, single-spaced.

    Non-string values are returned unchanged. That covers missing entries
    (NaN/NaT) and the Timestamps already present when this cell is re-run,
    so the cell no longer raises on them.

    Strings with fewer than three tokens are returned with whatever tokens
    they have; they will not match the date format below and end up as NaT.
    """
    if not isinstance(value, str):
        return value
    # str.split() with no argument collapses runs of whitespace and ignores
    # leading/trailing whitespace, unlike split(' ').
    return ' '.join(value.split()[:3])


# Pre-process to ensure a consistent "day month year" layout by keeping only
# the first three tokens of each entry (anything after them is dropped).
cleaned_trades_df['published_datetime'] = cleaned_trades_df['published_datetime'].map(_first_three_tokens)

# Convert to datetime. The format is explicit (e.g. "05 Jan 2024") and
# errors='coerce' turns every value that does not match it into NaT instead
# of raising, so unparseable rows can be inspected afterwards.
cleaned_trades_df['published_datetime'] = pd.to_datetime(
    cleaned_trades_df['published_datetime'],
    format='%d %b %Y',
    errors='coerce',
)

cleaned_trades_df.head(10)

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Build the connection URL from its components rather than an f-string:
# URL.create escapes each part, so a user name or password containing
# characters such as '@', '/', ':' or '%' no longer produces a mis-parsed URL.
db_url = URL.create(
    drivername="postgresql",
    username=postgres_user,
    password=postgres_password,
    host=postgres_host,
    port=int(postgres_port),  # the environment supplies the port as a string
    database=postgres_db,
)
engine = create_engine(db_url)

# NOTE: if_exists='replace' drops any existing 'capitol_trades' table and
# recreates it from this frame. index=False keeps the DataFrame index out of
# the table.
cleaned_trades_df.to_sql('capitol_trades', engine, if_exists='replace', index=False)

In [None]:
# Spot check: rows for politician 'Ro Khanna' whose published_datetime is missing.
is_ro_khanna = cleaned_trades_df['politician'].eq('Ro Khanna')
missing_published = cleaned_trades_df['published_datetime'].isna()

cleaned_trades_df.loc[is_ro_khanna & missing_published]

In [None]:
# Total number of rows whose published_datetime is missing.
cleaned_trades_df['published_datetime'].isna().sum()