In [86]:
import pandas as pd 
import requests 
import os
import requests
import numpy as np
import pickle
from sklearn.preprocessing import OneHotEncoder
#Helper function to make directory
def make_directory(path):
    if not os.path.exists(path):
        os.makedirs(path)

In [87]:
# Getting available symbols & informations in NASDAQ
def fetch_stock_symbols():
    url = 'https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=25&offset=0&exchange=NASDAQ&download=true'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36', "Upgrade-Insecure-Requests": "1","DNT": "1","Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8","Accept-Language": "en-US,en;q=0.5","Accept-Encoding": "gzip, deflate"}
    dataString = requests.get(url,headers=headers).content
    json_data = json.loads(dataString)['data']
    tickersRawData = pd.DataFrame(json_data['rows'],columns=json_data['headers'])
    tickersRawData.columns = tickersRawData.columns.str.capitalize()
    # Hard to assume missing value -> Drop 
    tickersRawData.replace('', np.nan, inplace=True)
    tickersRawData.dropna(inplace=True,how='any')
    tickersRawData.reset_index(inplace=True)
    path ='data'
    filepath = f'data/NASDAQ.csv'
    make_directory(path)
    tickersRawData.to_csv(filepath, index=False)
    print(tickersRawData.columns)
    return tickersRawData
    
df = fetch_stock_symbols()


Index(['index', 'Symbol', 'Name', 'Lastsale', 'Netchange', 'Pctchange',
       'Marketcap', 'Country', 'Ipoyear', 'Volume', 'Sector', 'Industry',
       'Url'],
      dtype='object')


In [88]:
# Features to preprocess: Marketcap, Country, Ipoyear, Sector, Industry , Volume
# Onehot Encoding: Country, Industry, Sector
# Ordinal Encoding: Ipoyear, Marketcap, Volume
oe_target = ['Country','Industry','Sector']
oe = OneHotEncoder(sparse_output=False)
oe.fit(df[oe_target])

#Save onehotencoder for later usage
with open('data/onehot_encoder.pkl', 'wb') as to_write:
    pickle.dump(oe, to_write)

transformed_data = oe.transform(df[oe_target])
transformed_df = pd.DataFrame(transformed_data, columns=oe.get_feature_names_out(oe_target))

df = df.drop(columns=oe_target)
df = pd.concat([df, transformed_df], axis=1)

In [89]:
df.columns

Index(['index', 'Symbol', 'Name', 'Lastsale', 'Netchange', 'Pctchange',
       'Marketcap', 'Ipoyear', 'Volume', 'Url',
       ...
       'Sector_Consumer Staples', 'Sector_Energy', 'Sector_Finance',
       'Sector_Health Care', 'Sector_Industrials', 'Sector_Miscellaneous',
       'Sector_Real Estate', 'Sector_Technology', 'Sector_Telecommunications',
       'Sector_Utilities'],
      dtype='object', length=192)

In [90]:
# Ordinal: arrange by percentile cut off (4 Groups)
# Ipoyear, Marketcap 
ordinal_target = ['Ipoyear', 'Marketcap', 'Volume']

# How far till now of Ipoyear
df['Ipoyear'] = pd.to_numeric(df['Ipoyear'])
Ipoyear = - (df['Ipoyear'] - 2024)

df['Marketcap'] = pd.to_numeric(df['Marketcap'])
df['Volume'] = pd.to_numeric(df['Volume'])
for target in ordinal_target:
    print(target)
    df[f'{target}_label'] = pd.qcut(df[target], 4, labels=False)

Ipoyear
Marketcap
Volume


In [91]:
display(df[['Ipoyear_label','Marketcap_label','Volume_label']])

Unnamed: 0,Ipoyear_label,Marketcap_label,Volume_label
0,0,1,1
1,2,0,2
2,2,0,2
3,2,1,2
4,2,0,0
...,...,...,...
2276,1,3,3
2277,1,2,2
2278,1,3,3
2279,0,2,3


In [92]:
df.to_csv('data/NASDAQ_preprocess.csv',index=False)