In [None]:
# Necessary Libraries.
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import yfinance as yf
import os
from dotenv import load_dotenv

In [None]:
# Specify the stock that you want to analyze.
stock = ''

# Load variables from a local .env file (if any) so FINNHUB_API_KEY can be read below.
load_dotenv()

api_key = os.getenv("FINNHUB_API_KEY")
if not api_key:
    # Fail early with a clear message instead of sending requests without a token.
    raise RuntimeError("FINNHUB_API_KEY is not set; add it to the environment or a .env file.")

# Columns kept from every response, including the derived ones computed in the loop.
columns = ['change', 'filingDate', 'name', 'share', 'source', 'symbol', 'transactionCode',
           'transactionDate', 'transactionPrice', 'dollarAmount', 'insiderPortfolioChange', 'buyOrSale']

# One DataFrame per date window; they are concatenated once after the loop,
# which avoids repeatedly copying a growing DataFrame.
chunks = []

# Define a start date
start_date = pd.to_datetime("2020-01-01")
today = pd.to_datetime("today")

while start_date < today:
    # Define the end date for this chunk (100 days later)
    end_date = start_date + pd.Timedelta(days=100)

    # Request data from Finnhub.io; `params` URL-encodes the query values.
    r = requests.get(
        'https://finnhub.io/api/v1/stock/insider-transactions',
        params={
            'symbol': stock,
            'from': start_date.strftime("%Y-%m-%d"),
            'to': end_date.strftime("%Y-%m-%d"),
            'token': api_key,
        },
        timeout=30,
    )
    # Surface HTTP errors (bad token, rate limit, ...) here rather than as a
    # confusing KeyError further down.
    r.raise_for_status()

    # Convert the data into a dataframe. A window without transactions may have
    # an empty (or missing) 'data' list, which yields an empty frame.
    df = pd.DataFrame(data=r.json().get('data') or [])

    if not df.empty:
        # Derived attributes from the data.
        df['dollarAmount'] = df['change'] * df['transactionPrice']
        df['insiderPortfolioChange'] = df['change'] / (df['share'] - df['change'])
        # Conditions are checked in order, so a zero change with a positive price is labelled 'Buy'.
        conditions = [
            (df['change'] >= 0) & (df['transactionPrice'] > 0),
            (df['change'] <= 0) & (df['transactionPrice'] > 0),
            (df['transactionPrice'] == 0)
        ]
        values = ['Buy', 'Sale', 'Gift']
        # An explicit string default is required: the implicit default (integer 0)
        # has no common dtype with the string labels. Rows that match no
        # condition (e.g. missing or negative price) are labelled 'Unknown'.
        df['buyOrSale'] = np.select(conditions, values, default='Unknown')

        chunks.append(df[columns])

    # Move the start date to the next chunk
    start_date = end_date + pd.Timedelta(days=1)

    # Sleep for a short duration to avoid hitting rate limits
    time.sleep(1)

# Build the combined table with a fresh 0..n-1 index; keep the expected columns
# even when no window returned any rows.
if chunks:
    all_data = pd.concat(chunks, ignore_index=True)
else:
    all_data = pd.DataFrame(columns=columns)
all_data.head()


In [None]:
# Number of records
num_records = all_data.shape[0]
print(f"Number of Records: {num_records}")

# Number of fully duplicated rows
num_duplicates = all_data.duplicated().sum()
print(f"Number of Duplicates: {num_duplicates}")

# Number of null values for each column
null_values = all_data.isnull().sum()
print("\nNumber of Null Values for Each Column:")
print(null_values)

# Summary statistics
print("\nSummary Statistics:")
print(all_data.describe())

# More detailed information.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
print("\nDataFrame Info:")
all_data.info()



In [None]:
# Start from the raw table: remove duplicate rows, drop rows where 'share' is NaN,
# and renumber the index. Each call returns a new frame, so `all_data` is not modified.
df = (
    all_data
    .drop_duplicates()
    .dropna(subset=['share'])
    .reset_index(drop=True)
)

# Verify duplicates are removed
print(f"Number of Duplicates After Removal: {df.duplicated().sum()}")

# Filter out rows with future transaction dates.
# This compares the raw values against a 'YYYY-MM-DD' string, which assumes the
# column holds ISO-formatted date strings at this point — TODO confirm.
today_str = pd.Timestamp.today().strftime('%Y-%m-%d')
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of another frame (avoids SettingWithCopyWarning).
df = df[df['transactionDate'] <= today_str].copy()

# Convert the 'transactionDate' column to datetime format
df['transactionDate'] = pd.to_datetime(df['transactionDate'])

# Filter out transactions that happened within the last week
one_week_ago = pd.Timestamp.today() - pd.Timedelta(days=7)
# Copy again so later cells can add columns to `df` without slice warnings.
df = df[df['transactionDate'] <= one_week_ago].copy()


In [None]:
def fetch_prices(symbol, start_date, end_date):
    """Return the first and last closing price of `symbol` in the requested window.

    Parameters:
        symbol: ticker passed to ``yf.download``.
        start_date: passed to ``yf.download`` as ``start``.
        end_date: passed to ``yf.download`` as ``end``.

    Returns:
        A ``(first_close, last_close)`` tuple of floats, or ``(nan, nan)`` when the
        download fails or contains no rows. Errors are reported with ``print``
        and never propagated.
    """
    print(f"{symbol}: looking for data between {start_date} and {end_date}")
    try:
        stock_data = yf.download(symbol, start=start_date, end=end_date)
        close = stock_data['Close']
        # Depending on the yfinance version the columns may be a MultiIndex
        # (field, ticker); 'Close' is then a one-column DataFrame, not a Series.
        if isinstance(close, pd.DataFrame):
            close = close.iloc[:, 0]
        if close.empty:
            # No rows for this window (e.g. no trading days or unknown ticker).
            print(f"Failed to fetch data for {symbol}. Reason: no price data returned")
            return np.nan, np.nan
        # Return plain scalars so callers can do ordinary arithmetic on them.
        return float(close.iloc[0]), float(close.iloc[-1])
    except Exception as e:
        # Best effort: one failed lookup must not abort processing of the other rows.
        print(f"Failed to fetch data for {symbol}. Reason: {e}")
        return np.nan, np.nan

# Successful price lookups keyed by (symbol, window start, window end). Rows that
# share a symbol and transaction date, and every S&P 500 lookup for that date,
# reuse one download instead of repeating it.
_PRICE_CACHE = {}


def _cached_prices(symbol, start, end):
    """Return fetch_prices(symbol, start, end), reusing an earlier successful result."""
    key = (symbol, start, end)
    if key not in _PRICE_CACHE:
        prices = fetch_prices(symbol, start, end)
        # NaN is the only float that is not equal to itself. Failed lookups are
        # not cached so that a later row can retry the download.
        if any(isinstance(p, float) and p != p for p in prices):
            return prices
        _PRICE_CACHE[key] = prices
    return _PRICE_CACHE[key]


def calculate_relative_performance(row):
    """Return the stock's performance minus the S&P 500's over the week after a transaction.

    Parameters:
        row: mapping/Series with a 'symbol' entry and a 'transactionDate' entry
            (anything ``pd.to_datetime`` accepts).

    Returns:
        ``(stock_end - stock_start) / stock_start`` minus the same ratio for
        '^GSPC', where start/end are the prices returned by ``fetch_prices`` for
        the window from the transaction date to 7 days later. The result is NaN
        when either lookup returns NaN.
    """
    # Convert the transactionDate to a timestamp (no-op if it already is one)
    transaction_date = pd.to_datetime(row['transactionDate'])
    window_end = transaction_date + pd.Timedelta(days=7)

    # Fetch stock and S&P 500 prices for the transaction date and 1 week later
    stock_start, stock_end = _cached_prices(row['symbol'], transaction_date, window_end)
    sp500_start, sp500_end = _cached_prices('^GSPC', transaction_date, window_end)

    # Calculate relative performance
    stock_performance = (stock_end - stock_start) / stock_start
    sp500_performance = (sp500_end - sp500_start) / sp500_start

    return stock_performance - sp500_performance

# Compute the relative performance for every row of the dataframe.
# The values are collected into an explicit float Series instead of using
# df.apply(..., axis=1): on an empty frame, apply returns a DataFrame rather
# than a Series, which cannot be assigned to a single column.
df['relativePerformance'] = pd.Series(
    [calculate_relative_performance(row) for _, row in df.iterrows()],
    index=df.index,
    dtype='float64',
)


In [None]:
# Drop rows where 'relativePerformance' or 'insiderPortfolioChange' is NaN,
# then reset the index. `df` is rebound to the result instead of using
# inplace=True, so a frame that is a filtered slice is never mutated in place.
# NOTE: dropna only removes NaN/None; infinite values are kept.
df = (
    df
    .dropna(subset=['relativePerformance', 'insiderPortfolioChange'])
    .reset_index(drop=True)
)

# Number of null values for each column
null_values = df.isnull().sum()
print("\nNumber of Null Values for Each Column:")
print(null_values)

In [None]:
# When no ticker symbol was chosen, use the generic name 'all' for the output file.
stock = 'all' if stock == '' else stock

# Write the table to ../data/<stock>.csv, leaving out the index column.
df.to_csv(f'../data/{stock}.csv', index=False)