# JPX Tokyo Stock Exchange

## Reading data paths

In [1]:
import numpy as np
import pandas as pd

import os
# List every file available under the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)


/kaggle/input/jpx-tokyo-stock-exchange-prediction/stock_list.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/options.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/financials.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/secondary_stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/trades.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/stock_prices.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/jpx-tokyo-stock-exchange-prediction/jpx_tokyo_market_prediction/__init__.py
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/stock_fin_spec.csv
/kaggle/input/jpx-tokyo-stock-exchange-prediction/data_specifications/trades_spec.csv
/kaggle/input/jpx-tokyo-stock-

## Reading the Sample Submission file for clarity

**Observations - Features :** 
- Date
- SecuritiesCode
- Rank

In [2]:
# Load the sample submission so the expected output schema can be inspected.
sample_submission_csv = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/example_test_files/sample_submission.csv"
df_example = pd.read_csv(sample_submission_csv)
df_example

Unnamed: 0,Date,SecuritiesCode,Rank
0,2021-12-06,1301,0
1,2021-12-06,1332,1
2,2021-12-06,1333,2
3,2021-12-06,1375,3
4,2021-12-06,1376,4
...,...,...,...
111995,2022-02-28,9990,1995
111996,2022-02-28,9991,1996
111997,2022-02-28,9993,1997
111998,2022-02-28,9994,1998


In [3]:
# Print column dtypes, non-null counts and memory usage of the sample submission.
df_example.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112000 entries, 0 to 111999
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   Date            112000 non-null  object
 1   SecuritiesCode  112000 non-null  int64 
 2   Rank            112000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 2.6+ MB


## Time-Series API test

The `iter_test` method creates a generator.

Each generated tuple holds the data samples for one test date: `(prices, options, financials, trades, secondary_prices, sample_prediction)`.

In [4]:
# Set up the competition's time-series API and obtain the test-data generator.
import jpx_tokyo_market_prediction
env = jpx_tokyo_market_prediction.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files


# Template of the full submission loop (kept commented out; the following cells
# step through a single batch by hand instead):
# for (prices, options, financials, trades, secondary_prices, sample_prediction) in iter_test:
#    sample_prediction['Rank'] = np.arange(len(sample_prediction))  # make your predictions here
#    env.predict(sample_prediction)   # register your predictions

In [5]:
# Pull the first batch off the generator by hand. Per the commented template in the
# previous cell, the tuple is presumably ordered as
# (prices, options, financials, trades, secondary_prices, sample_prediction).
a = next(iter_test)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [6]:
# Show element 0 of the batch; its columns (Open/High/Low/Close/Volume, ...) indicate
# that this is the stock-prices frame.
a[0]

Unnamed: 0,Date,RowId,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag
0,2021-12-06,20211206_1301,1301,2982.0,2982.0,2965.0,2971.0,8900,1.0,,False
1,2021-12-06,20211206_1332,1332,592.0,599.0,588.0,589.0,1360800,1.0,,False
2,2021-12-06,20211206_1333,1333,2368.0,2388.0,2360.0,2377.0,125900,1.0,,False
3,2021-12-06,20211206_1375,1375,1230.0,1239.0,1224.0,1224.0,81100,1.0,,False
4,2021-12-06,20211206_1376,1376,1339.0,1372.0,1339.0,1351.0,6200,1.0,,False
...,...,...,...,...,...,...,...,...,...,...,...
1995,2021-12-06,20211206_9990,9990,528.0,531.0,516.0,517.0,65300,1.0,,False
1996,2021-12-06,20211206_9991,9991,796.0,800.0,785.0,785.0,29100,1.0,,False
1997,2021-12-06,20211206_9993,9993,1645.0,1653.0,1627.0,1627.0,6200,1.0,,False
1998,2021-12-06,20211206_9994,9994,2394.0,2433.0,2393.0,2418.0,7800,1.0,,False


In [7]:
# Register the predictions for the current batch. The complete sample_prediction
# frame (a[5], the last element of the batch tuple per the template loop) is passed
# so that every Rank stays paired with its Date and SecuritiesCode; wrapping only
# the Rank column in a new DataFrame would drop those key columns.
env.predict(a[5])

## Data observations

### Stock prices (training set)

- Stocks are identified by their SecuritiesCode
- `Target` is the value to predict for a given date $t$: the rate of change of the adjusted closing price between $t+1$ and $t+2$, i.e. $(C_{t+2} - C_{t+1}) / C_{t+1}$



In [8]:
# Load the training stock prices and display every row after the first 20.
stock_prices_csv = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/stock_prices.csv"
df_stock_prices = pd.read_csv(stock_prices_csv)
# To look at a single security instead:
# df_stock_prices[df_stock_prices['SecuritiesCode'] == 1301]
df_stock_prices.iloc[20:]  # same rows as .tail(-20): everything except the first 20

Unnamed: 0,RowId,Date,SecuritiesCode,Open,High,Low,Close,Volume,AdjustmentFactor,ExpectedDividend,SupervisionFlag,Target
20,20170104_1719,2017-01-04,1719,783.0,797.0,779.0,795.0,2039100,1.0,,False,-0.005051
21,20170104_1720,2017-01-04,1720,937.0,952.0,937.0,950.0,634400,1.0,,False,0.005219
22,20170104_1721,2017-01-04,1721,2154.0,2198.0,2147.0,2187.0,510500,1.0,,False,0.012582
23,20170104_1723,2017-01-04,1723,2033.0,2120.0,2021.0,2088.0,48000,1.0,,False,-0.006226
24,20170104_1726,2017-01-04,1726,254.0,261.0,251.0,261.0,539500,1.0,,False,0.011194
...,...,...,...,...,...,...,...,...,...,...,...,...
2332526,20211203_9990,2021-12-03,9990,514.0,528.0,513.0,528.0,44200,1.0,,False,0.034816
2332527,20211203_9991,2021-12-03,9991,782.0,794.0,782.0,794.0,35900,1.0,,False,0.025478
2332528,20211203_9993,2021-12-03,9993,1690.0,1690.0,1645.0,1645.0,7200,1.0,,False,-0.004302
2332529,20211203_9994,2021-12-03,9994,2388.0,2396.0,2380.0,2389.0,6500,1.0,,False,0.009098


In [9]:
# Load the training options data and show the history of one option contract.
# NOTE(review): the cell output shows the tail of a warning raised by this read —
# presumably pandas' mixed-dtype warning; confirm and consider passing explicit dtypes.
options_csv = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/options.csv"
df_options = pd.read_csv(options_csv)
is_selected_option = df_options["OptionsCode"].eq(132010018)
df_options.loc[is_selected_option]


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,DateCode,Date,OptionsCode,WholeDayOpen,WholeDayHigh,WholeDayLow,WholeDayClose,NightSessionOpen,NightSessionHigh,NightSessionLow,...,Putcall,LastTradingDay,SpecialQuotationDay,SettlementPrice,TheoreticalPrice,BaseVolatility,ImpliedVolatility,InterestRate,DividendRate,Dividend
0,20170104_132010018,2017-01-04,132010018,650.0,650.0,480.0,480.0,0.0,0.0,0.0,...,1,20170112,20170113,480.0,478.4587,17.4736,17.5865,0.0091,0.0,0.0
2174,20170105_132010018,2017-01-05,132010018,480.0,500.0,455.0,500.0,480.0,480.0,455.0,...,1,20170112,20170113,520.0,517.4457,15.82165,16.1341,0.0091,0.0,0.0
4354,20170106_132010018,2017-01-06,132010018,570.0,570.0,570.0,570.0,0.0,0.0,0.0,...,1,20170112,20170113,570.0,569.9998,15.006,16.4274,0.0091,0.0,0.0
6534,20170110_132010018,2017-01-10,132010018,580.0,580.0,425.0,545.0,580.0,580.0,425.0,...,1,20170112,20170113,700.0,699.6336,18.2758,17.0031,0.0091,0.0,0.0
8714,20170111_132010018,2017-01-11,132010018,615.0,615.0,615.0,615.0,615.0,615.0,615.0,...,1,20170112,20170113,645.0,641.4201,17.9209,25.1,0.0091,0.0,0.0
10894,20170112_132010018,2017-01-12,132010018,620.0,620.0,620.0,620.0,620.0,620.0,620.0,...,1,20170112,20170113,870.0,865.7822,17.783,32.5,0.0091,0.0,0.0


In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric option columns.
df_options.describe()

Unnamed: 0,OptionsCode,WholeDayOpen,WholeDayHigh,WholeDayLow,WholeDayClose,DaySessionOpen,DaySessionHigh,DaySessionLow,DaySessionClose,TradingVolume,...,Putcall,LastTradingDay,SpecialQuotationDay,SettlementPrice,TheoreticalPrice,BaseVolatility,ImpliedVolatility,InterestRate,DividendRate,Dividend
count,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,...,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0,3567694.0
mean,154725900.0,24.5556,26.19204,23.02928,24.4509,20.0308,20.97014,19.16712,20.02161,41.83561,...,1.5,20215390.0,20215400.0,3285.953,3294.785,18.07872,22.41038,0.1236648,1.664867,0.0
std,23608560.0,188.1537,192.7197,184.5273,188.503,167.9464,170.6218,165.6358,168.096,331.3341,...,0.5,28388.69,28388.69,3326.602,3345.038,7.588283,14.92312,0.03697845,1.239988,0.0
min,130060000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,20170110.0,20170110.0,1.0,0.0,8.28535,1.0,-0.0318,0.0,0.0
25%,136103000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,20191210.0,20191210.0,640.0,634.831,13.9833,17.5498,0.1264,1.3975,0.0
50%,144110700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.5,20210710.0,20210710.0,2300.0,2307.775,16.66065,19.3904,0.1364,1.7185,0.0
75%,184037700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,20231210.0,20231210.0,4910.0,4912.884,19.3584,23.19487,0.1464,1.9897,0.0
max,199248000.0,22850.0,22850.0,22850.0,22850.0,15270.0,15350.0,15270.0,15350.0,28496.0,...,2.0,20290610.0,20290610.0,36350.0,36160.8,75.8263,320.0,0.1564,24.8994,0.0


In [11]:
# Load the training-period trades table and display it.
trades_csv = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/trades.csv"
df_trades = pd.read_csv(trades_csv)
df_trades

Unnamed: 0,Date,StartDate,EndDate,Section,TotalSales,TotalPurchases,TotalTotal,TotalBalance,ProprietarySales,ProprietaryPurchases,...,CityBKsRegionalBKsEtcTotal,CityBKsRegionalBKsEtcBalance,TrustBanksSales,TrustBanksPurchases,TrustBanksTotal,TrustBanksBalance,OtherFinancialInstitutionsSales,OtherFinancialInstitutionsPurchases,OtherFinancialInstitutionsTotal,OtherFinancialInstitutionsBalance
0,2017-01-04,,,,,,,,,,...,,,,,,,,,,
1,2017-01-05,,,,,,,,,,...,,,,,,,,,,
2,2017-01-06,,,,,,,,,,...,,,,,,,,,,
3,2017-01-10,,,,,,,,,,...,,,,,,,,,,
4,2017-01-11,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1707,2021-12-01,,,,,,,,,,...,,,,,,,,,,
1708,2021-12-02,2021-11-22,2021-11-26,Growth Market (Mothers/JASDAQ),1.143466e+09,1.143923e+09,2.287389e+09,456677.0,3.663919e+07,3.496068e+07,...,396230.0,-275608.0,6696755.0,6886122.0,13582877.0,189367.0,234653.0,298525.0,533178.0,63872.0
1709,2021-12-02,2021-11-22,2021-11-26,Prime Market (First Section),1.138343e+10,1.137621e+10,2.275964e+10,-7214179.0,1.499660e+09,1.230944e+09,...,35957940.0,-17510292.0,254580089.0,261919512.0,516499601.0,7339423.0,11959898.0,16368287.0,28328185.0,4408389.0
1710,2021-12-02,2021-11-22,2021-11-26,Standard Market (Second Section),1.069969e+08,1.075036e+08,2.145004e+08,506702.0,2.811025e+06,3.273163e+06,...,42127.0,-42127.0,438928.0,243817.0,682745.0,-195111.0,60291.0,6985.0,67276.0,-53306.0


In [12]:
# Print column dtypes and non-null counts; in the visible output most columns are
# populated for only 765 of the 1712 rows.
df_trades.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1712 entries, 0 to 1711
Data columns (total 56 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Date                                 1712 non-null   object 
 1   StartDate                            765 non-null    object 
 2   EndDate                              765 non-null    object 
 3   Section                              765 non-null    object 
 4   TotalSales                           765 non-null    float64
 5   TotalPurchases                       765 non-null    float64
 6   TotalTotal                           765 non-null    float64
 7   TotalBalance                         765 non-null    float64
 8   ProprietarySales                     765 non-null    float64
 9   ProprietaryPurchases                 765 non-null    float64
 10  ProprietaryTotal                     765 non-null    float64
 11  ProprietaryBalance            

## Financials

**TODO: Data browsing**
- Inspect how security codes relate across the dataframes

In [13]:
# Load the training financial disclosures and display them.
# NOTE(review): the cell output shows the tail of a warning raised by this read —
# presumably pandas' mixed-dtype warning (some numeric-looking columns contain "－");
# confirm and consider passing explicit dtypes / na_values.
financials_csv = "/kaggle/input/jpx-tokyo-stock-exchange-prediction/train_files/financials.csv"
df_financials = pd.read_csv(financials_csv)
df_financials

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,DisclosureNumber,DateCode,Date,SecuritiesCode,DisclosedDate,DisclosedTime,DisclosedUnixTime,TypeOfDocument,CurrentPeriodEndDate,TypeOfCurrentPeriod,...,ForecastEarningsPerShare,ApplyingOfSpecificAccountingOfTheQuarterlyFinancialStatements,MaterialChangesInSubsidiaries,ChangesBasedOnRevisionsOfAccountingStandard,ChangesOtherThanOnesBasedOnRevisionsOfAccountingStandard,ChangesInAccountingEstimates,RetrospectiveRestatement,NumberOfIssuedAndOutstandingSharesAtTheEndOfFiscalYearIncludingTreasuryStock,NumberOfTreasuryStockAtTheEndOfFiscalYear,AverageNumberOfShares
0,2.016121e+13,20170104_2753,2017-01-04,2753.0,2017-01-04,07:30:00,1.483483e+09,3QFinancialStatements_Consolidated_JP,2016-12-31,3Q,...,319.76,,False,True,False,False,False,6848800.0,－,6848800.0
1,2.017010e+13,20170104_3353,2017-01-04,3353.0,2017-01-04,15:00:00,1.483510e+09,3QFinancialStatements_Consolidated_JP,2016-11-30,3Q,...,485.36,,False,True,False,False,False,2035000.0,118917,1916083.0
2,2.016123e+13,20170104_4575,2017-01-04,4575.0,2017-01-04,12:00:00,1.483499e+09,ForecastRevision,2016-12-31,2Q,...,-93.11,,,,,,,,,
3,2.017010e+13,20170105_2659,2017-01-05,2659.0,2017-01-05,15:00:00,1.483596e+09,3QFinancialStatements_Consolidated_JP,2016-11-30,3Q,...,285.05,,False,True,False,False,False,31981654.0,18257,31963405.0
4,2.017011e+13,20170105_3050,2017-01-05,3050.0,2017-01-05,15:30:00,1.483598e+09,ForecastRevision,2017-02-28,FY,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92951,2.021112e+13,20211203_6040,2021-12-03,6040.0,2021-12-03,15:00:00,1.638511e+09,1QFinancialStatements_Consolidated_JP,2021-10-31,1Q,...,－,,False,True,False,False,False,16000400.0,836400,15164000.0
92952,2.021120e+13,20211203_6898,2021-12-03,6898.0,2021-12-03,16:00:00,1.638515e+09,3QFinancialStatements_Consolidated_JP,2021-10-31,3Q,...,142.01,,False,False,False,False,False,816979.0,157541,659486.0
92953,2.021120e+13,20211203_6969,2021-12-03,6969.0,2021-12-03,15:00:00,1.638511e+09,ForecastRevision,2022-03-31,FY,...,-147.87,,,,,,,,,
92954,2.021112e+13,20211203_8057,2021-12-03,8057.0,2021-12-03,17:00:00,1.638518e+09,1QFinancialStatements_Consolidated_JP,2021-10-20,1Q,...,330.92,,False,True,False,False,False,10419371.0,614032,9805339.0


In [14]:
# Print column dtypes and non-null counts of the financials frame.
df_financials.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92956 entries, 0 to 92955
Data columns (total 45 columns):
 #   Column                                                                        Non-Null Count  Dtype  
---  ------                                                                        --------------  -----  
 0   DisclosureNumber                                                              92954 non-null  float64
 1   DateCode                                                                      92954 non-null  object 
 2   Date                                                                          92956 non-null  object 
 3   SecuritiesCode                                                                92954 non-null  float64
 4   DisclosedDate                                                                 92954 non-null  object 
 5   DisclosedTime                                                                 92954 non-null  object 
 6   DisclosedUnixTime             