In [7]:
# import libraries
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

In [8]:
# function to fetch data from the api
def fetch_data_from_api(url, params=None, timeout=10):
    """Send a GET request and return the decoded JSON body.

    Args:
        url: Endpoint to request.
        params: Optional mapping of query-string parameters, passed to
            ``requests.get`` unchanged.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The parsed JSON payload when the server answers with HTTP 200.
        An empty dict when the request fails at the network level or the
        server answers with any other status code; the reason is printed.
    """
    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Network-level failures (timeout, DNS, refused connection) follow the
        # same "report and return {}" contract as a non-200 response.
        print(f"Error fetching data: {exc}")
        return {}

    if response.status_code == 200:
        return response.json()

    print(f"Error fetching data: {response.text}")
    return {}

# function to get quotes from the api
def get_quotes(symbol, api_key):
    """Request the quote for ``symbol`` from Finnhub's ``/quote`` endpoint.

    Args:
        symbol: Ticker symbol to look up.
        api_key: Finnhub API token.

    Returns:
        Whatever ``fetch_data_from_api`` returns for the request.
    """
    quote_url = "https://finnhub.io/api/v1/quote"
    # Pass the values as query parameters so the HTTP layer URL-encodes them,
    # instead of interpolating raw text into the query string.
    return fetch_data_from_api(quote_url, params={"symbol": symbol, "token": api_key})

# function to get financials as reported from the api
def get_financials_reported(symbol, api_key):
    """Request the as-reported financials for ``symbol`` from Finnhub.

    Args:
        symbol: Ticker symbol to look up.
        api_key: Finnhub API token.

    Returns:
        Whatever ``fetch_data_from_api`` returns for the request.
    """
    reported_financials_url = "https://finnhub.io/api/v1/stock/financials-reported"
    # Pass the values as query parameters so the HTTP layer URL-encodes them,
    # instead of interpolating raw text into the query string.
    return fetch_data_from_api(
        reported_financials_url,
        params={"symbol": symbol, "token": api_key},
    )

# function to get company basic financials from the api
def get_company_basic_financials(symbol, api_key):
    """Request all basic financial metrics for ``symbol`` from Finnhub.

    Args:
        symbol: Ticker symbol to look up.
        api_key: Finnhub API token.

    Returns:
        Whatever ``fetch_data_from_api`` returns for the request.
    """
    basic_financials_url = "https://finnhub.io/api/v1/stock/metric"
    # Pass the values as query parameters so the HTTP layer URL-encodes them,
    # instead of interpolating raw text into the query string.
    return fetch_data_from_api(
        basic_financials_url,
        params={"symbol": symbol, "metric": "all", "token": api_key},
    )

# function to get company profile from api
def get_company_profile(symbol, api_key):
    """Request the company profile for ``symbol`` from Finnhub's ``profile2`` endpoint.

    Args:
        symbol: Ticker symbol to look up.
        api_key: Finnhub API token.

    Returns:
        Whatever ``fetch_data_from_api`` returns for the request.
    """
    profile_url = "https://finnhub.io/api/v1/stock/profile2"
    # Pass the values as query parameters so the HTTP layer URL-encodes them,
    # instead of interpolating raw text into the query string.
    return fetch_data_from_api(profile_url, params={"symbol": symbol, "token": api_key})

# function to get company news from the api
def get_company_news(symbol, api_key, start_date, end_date):
    """Request company news for ``symbol`` within a date window from Finnhub.

    Args:
        symbol: Ticker symbol to look up.
        api_key: Finnhub API token.
        start_date: Value sent as the ``from`` query parameter (this notebook
            passes ``YYYY-MM-DD`` strings).
        end_date: Value sent as the ``to`` query parameter (same format).

    Returns:
        Whatever ``fetch_data_from_api`` returns for the request.
    """
    news_url = "https://finnhub.io/api/v1/company-news"
    # Pass the values as query parameters so the HTTP layer URL-encodes them,
    # instead of interpolating raw text into the query string.
    query = {
        "symbol": symbol,
        "from": start_date,
        "to": end_date,
        "token": api_key,
    }
    return fetch_data_from_api(news_url, params=query)


In [9]:
# extract financial data from json response
def extract_financial_data(data, section, concepts=None):
    """Collect the line items of one statement section across all reports.

    Args:
        data: Decoded API response. Reports are read from its ``'data'`` key;
            each report's statements are read from its ``'report'`` key.
        section: Key of the statement section to pull from each report
            (the notebook uses ``'bs'``).
        concepts: Optional collection of concept names. When given (and not
            empty), only items whose ``'concept'`` is in it are kept.

    Returns:
        A flat list of the matching item dicts, in report order. Reports
        without the requested section are skipped. The returned dicts are the
        same objects as in ``data`` (not copies).
    """
    if not isinstance(data, dict):
        return []

    financial_data = []
    # `or []` tolerates a present-but-null 'data' value.
    for report in data.get('data') or []:
        statements = report.get('report') if isinstance(report, dict) else None
        if not statements or section not in statements:
            continue

        section_data = statements[section] or []
        if concepts:
            # .get() so an item without a 'concept' key is filtered out
            # instead of aborting the whole extraction with a KeyError.
            section_data = [
                item for item in section_data
                if isinstance(item, dict) and item.get('concept') in concepts
            ]
        financial_data.extend(section_data)
    return financial_data


In [10]:
# process and flatten basic_financials data
def process_basic_financials_data(data, symbol):
    """Flatten a basic-financials response into a one-row DataFrame.

    Args:
        data: Decoded API response; the metrics are read from its ``'metric'`` key.
        symbol: Ticker symbol stored in the row's ``'symbol'`` column.

    Returns:
        A single-row DataFrame holding the selected metrics (``None`` where the
        response lacks one) followed by ``'symbol'``. An empty DataFrame when
        ``data`` has no ``'metric'`` key; a message is printed in that case.
    """
    if 'metric' in data:
        source = data['metric']
        # metrics kept from the response, in output column order
        wanted = (
            'quickRatioAnnual', 'currentRatioAnnual', 'netProfitMarginAnnual',
            'inventoryTurnoverAnnual', 'grossMarginAnnual', 'totalDebt/totalEquityAnnual',
            'assetTurnoverAnnual', 'receivablesTurnoverAnnual', 'roeTTM', 'epsAnnual',
            'roiAnnual', 'cashFlowPerShareAnnual', 'ebitdPerShareAnnual', 'ebitdaCagr5Y',
            'revenueGrowthTTMYoy',
        )
        row = dict(zip(wanted, map(source.get, wanted)))
        row['symbol'] = symbol
        return pd.DataFrame([row])

    print(f"No 'metric' data available for {symbol}.")
    return pd.DataFrame()

In [11]:
# process and flatten company news data
def process_news_data(news_data, symbol):
    """Turn raw news items into flat records tagged with ``symbol``.

    Args:
        news_data: Iterable of news item dicts; each must provide
            ``'datetime'``, ``'headline'``, ``'source'``, ``'summary'`` and ``'url'``.
        symbol: Ticker symbol stored on every record.

    Returns:
        A list with one dict per item, keyed ``symbol``, ``datetime``,
        ``headline``, ``source``, ``summary``, ``url``. ``datetime`` is the
        item's value converted with ``pd.to_datetime(..., unit='s')``.
    """
    # fields copied through unchanged, in output order
    passthrough = ('headline', 'source', 'summary', 'url')
    return [
        {
            'symbol': symbol,
            # the item's 'datetime' is interpreted as seconds since the epoch
            'datetime': pd.to_datetime(article['datetime'], unit='s'),
            **{field: article[field] for field in passthrough},
        }
        for article in news_data
    ]

In [None]:
# load environment variables from .env file
load_dotenv()

# connection parameters
# NOTE(review): generic lowercase names such as 'username' can collide with
# variables the OS already defines (e.g. USERNAME on Windows), and load_dotenv()
# does not override existing variables by default -- confirm these values really
# come from .env.
username = os.getenv('username')
password = os.getenv('password')
host = os.getenv('host')
port = os.getenv('port')
database = os.getenv('database')

api_key = os.getenv('FINNHUB_API_KEY')
if not api_key:
    # fail before issuing any request instead of sending them without a token
    raise RuntimeError("FINNHUB_API_KEY is not set; add it to the .env file before running this cell.")

# engine for connecting to db
# URL.create escapes special characters in the credentials; interpolating them
# into a URL string breaks when the password contains e.g. '@', ':' or '/'.
db_url = URL.create(
    'mysql+pymysql',
    username=username,
    password=password,
    host=host,
    port=int(port) if port else None,
    database=database,
)
engine = create_engine(db_url)

# define the start and end dates for the data fetch
end_date = datetime.now() - timedelta(days=1)
start_date = end_date - timedelta(days=364)
# the strftime/strptime round trip drops the time-of-day component
formatted_start_date = datetime.strptime(start_date.strftime('%Y-%m-%d'), '%Y-%m-%d')
formatted_end_date = datetime.strptime(end_date.strftime('%Y-%m-%d'), '%Y-%m-%d')
# the news window is the same for every symbol, so format it once
news_from = formatted_start_date.strftime('%Y-%m-%d')
news_to = formatted_end_date.strftime('%Y-%m-%d')

# list of companies and concepts to fetch
company_symbols = ["JPM", "GS", "C", "JLL", "DIS", "TPR", "F", "XOM", "AAPL", "AMZN", "PFE", "MRK"]
concepts = ['us-gaap_AssetsCurrent', 'us-gaap_AssetsNoncurrent', 'us-gaap_Assets',
            'us-gaap_LiabilitiesCurrent', 'us-gaap_LiabilitiesNoncurrent', 'us-gaap_Liabilities',
            'us-gaap_StockholdersEquity', 'us-gaap_CommonStocksIncludingAdditionalPaidInCapital',
            'us-gaap_RetainedEarningsAccumulatedDeficit']

# create lists to store data
quotes_data = []
financials_reported_data = []
basic_financials_data = []
symbols_data = []
news_data = []

# loop through each company symbol
for symbol in company_symbols:

    # fetch and process company profile
    profile_data = get_company_profile(symbol, api_key)
    description = profile_data.get('name', 'N/A')
    symbols_data.append({'symbol': symbol, 'description': description})

    # fetch and process quotes
    quote = get_quotes(symbol, api_key)
    if quote:
        quote['symbol'] = symbol
        quotes_data.append(quote)

    # fetch and process financials reported
    financial_data = get_financials_reported(symbol, api_key)
    bs_data = extract_financial_data(financial_data, 'bs', concepts)
    for item in bs_data:
        item['symbol'] = symbol
    financials_reported_data.extend(bs_data)

    # fetch and process basic financials
    basic_financials_response = get_company_basic_financials(symbol, api_key)
    if basic_financials_response:
        processed_data = process_basic_financials_data(basic_financials_response, symbol)
        # an empty frame means the response had no metrics; keep it out of the concat
        if not processed_data.empty:
            basic_financials_data.append(processed_data)

    # fetch and process news
    news_response = get_company_news(symbol, api_key, news_from, news_to)
    if news_response:
        processed_news = process_news_data(news_response, symbol)
        news_data.extend(processed_news)

    # pause between symbols (presumably to stay under the API rate limit)
    time.sleep(1)

# convert lists to dfs
# symbols_df keeps the descriptions gathered from the profile endpoint; it is
# no longer overwritten afterwards by a symbol-only frame.
symbols_df = pd.DataFrame(symbols_data)
quotes_df = pd.DataFrame(quotes_data)
financials_reported_df = pd.DataFrame(financials_reported_data)
news_df = pd.DataFrame(news_data)

if basic_financials_data:
    basic_financials_with_median_df = pd.concat(basic_financials_data, ignore_index=True)

    # handle missing values by imputing with median, excluding 'symbol'
    numerical_cols = basic_financials_with_median_df.columns.drop('symbol')
    # metrics that were None arrive as object dtype; convert so the median is
    # well defined and the columns are uploaded as numbers
    numeric_metrics = basic_financials_with_median_df[numerical_cols].apply(pd.to_numeric)
    medians = numeric_metrics.median()
    basic_financials_with_median_df[numerical_cols] = numeric_metrics.fillna(medians)
else:
    # pd.concat raises on an empty list, so fall back to an empty frame
    basic_financials_with_median_df = pd.DataFrame()

# upload each df to db
tables_to_upload = {
    'symbols': symbols_df,
    'quotes': quotes_df,
    'financials_reported': financials_reported_df,
    'basic_financials': basic_financials_with_median_df,
    'company_news': news_df,
}
skipped_tables = []
for table_name, table_df in tables_to_upload.items():
    if table_df.empty:
        # if_exists='replace' would drop the existing table; do not destroy
        # previously loaded data just because this run fetched nothing
        print(f"Skipping '{table_name}': no rows fetched, existing table left untouched.")
        skipped_tables.append(table_name)
        continue
    table_df.to_sql(name=table_name, con=engine, if_exists='replace', index=False)

if skipped_tables:
    print(f"Data uploaded with {len(skipped_tables)} table(s) skipped: {', '.join(skipped_tables)}.")
else:
    print("Data uploaded successfully.")

In [13]:
# load environment variables from .env file
load_dotenv()

# connection parameters
username = os.getenv('username')
password = os.getenv('password')
host = os.getenv('host')
port = os.getenv('port')
database = os.getenv('database')

# URL.create escapes special characters in the credentials; interpolating them
# into a URL string breaks when the password contains e.g. '@', ':' or '/'.
server_url = URL.create(
    'mysql+pymysql',
    username=username,
    password=password,
    host=host,
    port=int(port) if port else None,
)

# instance connection for testing
test_engine = create_engine(server_url)

instance_reachable = False
try:
    with test_engine.connect():
        print("RDS instance connection successful!")
        instance_reachable = True
except Exception as e:
    print(f"Failed to connect to RDS instance: {e}")
finally:
    # the test engine is only needed for this check; release its pooled connections
    test_engine.dispose()

# engine bound to the project database (creating it does not open a connection)
engine = create_engine(server_url.set(database=database))

# if successful, connect to database
if instance_reachable:
    try:
        with engine.connect():
            print(f"Connected successfully to the database '{database}'!")
    except Exception as e:
        print(f"Failed to connect to the database '{database}': {e}")

RDS instance connection successful!
Connected successfully to the database 'mayah_bosworth_sql_project'!
