In [12]:
import pandas as pd
import json
import os 
import sys

In [14]:
# Directory that is scanned for the input files.
# NOTE: `dir` shadows the builtin of the same name; the name is kept because
# later cells read it when opening the files.
dir = os.path.join('dataset', 'kaggle-dse')

# Alphabetically sorted JSON files, excluding 'securities.json'.
# Entries without a .json suffix (OS/editor artefacts such as .DS_Store or a
# checkpoint directory) are skipped so the loading step only ever sees JSON.
files = sorted(
    name
    for name in os.listdir(dir)
    if name.endswith('.json') and name != 'securities.json'
)
print(files)

['prices_2008.json', 'prices_2009.json', 'prices_2010.json', 'prices_2011.json', 'prices_2012.json', 'prices_2013.json', 'prices_2014.json', 'prices_2015.json', 'prices_2016.json', 'prices_2017.json', 'prices_2018.json', 'prices_2019.json', 'prices_2020.json', 'prices_2021.json', 'prices_2022.json']


In [33]:
# Parse every selected file; data[i] holds the decoded JSON of files[i].
data = []
for name in files:
    # JSON text is UTF-8 by specification, so state it explicitly instead of
    # relying on the platform's default text encoding.
    with open(os.path.join(dir, name), encoding='utf-8') as fh:
        data.append(json.load(fh))
len(data)

15

In [34]:
# Build one DataFrame per loaded file and stack them into a single frame.
# NOTE(review): data[0] is deliberately left out here (slice starts at 1);
# with the sorted file list that is the first file — confirm this is intended.
dfs = [pd.DataFrame(records) for records in data[1:]]
df = pd.concat(dfs)

In [35]:
# Quick size check: (number of rows, number of columns) of df.
df.shape

(1715852, 11)

In [36]:
# Count rows per trading code once (value_counts sorts by count, descending)
# and keep the labels of the 30 most frequent codes.
# NOTE(review): if several codes tie at the cutoff, which of them land in the
# top 30 depends on value_counts' ordering of equal counts.
code_counts = df['trading_code'].value_counts()
top_30 = code_counts.head(30).index
top_30

Index(['STANCERAM', 'JAMUNABANK', 'JANATAINS', 'JUTESPINN', 'KARNAPHULI',
       'KAY&QUE', 'KEYACOSMET', 'KOHINOOR', 'LANKABAFIN', 'LEGACYFOOT',
       'LIBRAINFU', 'LINDEBD', 'MAKSONSPIN', 'MEGCONMILK', 'MEGHNACEM',
       'MEGHNALIFE', 'MEGHNAPET', 'MERCANBANK', 'MERCINS', 'METROSPIN',
       'MIDASFIN', 'MIRACLEIND', 'MONNOCERA', 'MPETROLEUM', 'NATLIFEINS',
       'NBL', 'NCCBANK', 'JAMUNAOIL', 'ISNLTD', 'EXIMBANK'],
      dtype='object', name='trading_code')

In [37]:
# Restrict the frame to the selected trading codes, then keep only the first
# row for every (date, trading_code) pair.
in_top_30 = df['trading_code'].isin(top_30)
df = df.loc[in_top_30].drop_duplicates(subset=['date', 'trading_code'])
# Rows per date after the clean-up.
df['date'].value_counts()

date
2009-12-30 00:00:00    30
2017-04-20 00:00:00    30
2018-10-09 00:00:00    30
2018-10-08 00:00:00    30
2018-10-07 00:00:00    30
                       ..
2013-04-24 00:00:00    30
2013-04-23 00:00:00    30
2013-04-22 00:00:00    30
2013-04-21 00:00:00    30
2022-01-02 00:00:00    30
Name: count, Length: 3333, dtype: int64

In [25]:
# Number of rows per trading code in the current df.
df['trading_code'].value_counts()

trading_code
EXIMBANK      3333
ISNLTD        3333
NCCBANK       3333
NBL           3333
NATLIFEINS    3333
MPETROLEUM    3333
MONNOCERA     3333
MIRACLEIND    3333
MIDASFIN      3333
METROSPIN     3333
MERCINS       3333
MERCANBANK    3333
MEGHNAPET     3333
MEGHNALIFE    3333
MEGHNACEM     3333
MEGCONMILK    3333
MAKSONSPIN    3333
LINDEBD       3333
LIBRAINFU     3333
LEGACYFOOT    3333
LANKABAFIN    3333
KOHINOOR      3333
KEYACOSMET    3333
KAY&QUE       3333
KARNAPHULI    3333
JUTESPINN     3333
JANATAINS     3333
JAMUNAOIL     3333
JAMUNABANK    3333
STANCERAM     3333
Name: count, dtype: int64

In [38]:
# Parse the dates, derive a calendar-year column, and report the smallest
# number of rows (not distinct days) that any single year contains.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['year'] = parsed_dates.dt.year
year_sizes = df.groupby('year').size()
year_sizes.min()

6240

In [40]:
# Balance the years: truncate every year to the row count of the shortest
# year, keeping the chronologically earliest rows of each year.
# The limit is derived from the data instead of being hard-coded, so it stays
# correct when the inputs change (it evaluates to 6240 on the recorded run).
min_rows_per_year = int(df.groupby('year').size().min())
new_df = df.sort_values('date').groupby('year').head(min_rows_per_year)
# Every year should now report the same number of rows.
new_df['year'].value_counts()

year
2009    6240
2010    6240
2011    6240
2012    6240
2013    6240
2014    6240
2015    6240
2016    6240
2017    6240
2018    6240
2019    6240
2020    6240
2021    6240
2022    6240
Name: count, dtype: int64