In [1]:
import pandas as pd
import pathlib
from sklearn import preprocessing
import numpy as np
import random
import tensorflow as tf
from IPython.display import clear_output


In [2]:
prices = pd.read_csv('../../data/854_1575_bundle_archive/prices-split-adjusted.csv')
prices

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05,WLTW,123.430000,125.839996,122.309998,126.250000,2163600.0
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07,WLTW,116.379997,114.949997,114.930000,119.739998,2489500.0
3,2016-01-08,WLTW,115.480003,116.620003,113.500000,117.440002,2006300.0
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0
...,...,...,...,...,...,...,...
851259,2016-12-30,ZBH,103.309998,103.199997,102.849998,103.930000,973800.0
851260,2016-12-30,ZION,43.070000,43.040001,42.689999,43.310001,1938100.0
851261,2016-12-30,ZTS,53.639999,53.529999,53.270000,53.740002,1701200.0
851262,2016-12-30,AIV,44.730000,45.450001,44.410000,45.590000,1380900.0


In [3]:
len(np.unique(prices.symbol))

501

In [4]:
prices[prices.symbol=="WLTW"].close.to_numpy()

array([125.839996, 119.980003, 114.949997, 116.620003, 114.970001,
       115.550003, 112.849998, 114.379997, 112.529999, 110.379997,
       109.300003, 110.      , 111.949997, 110.120003, 111.      ,
       110.709999, 112.580002, 114.470001, 114.5     , 110.559998,
       114.050003, 115.709999, 114.019997, 111.160004, 110.650002,
       107.519997, 107.129997, 107.839996, 110.769997, 111.239998,
       111.599998, 110.330002, 113.040001, 111.889999, 111.559998,
       112.879997, 112.75    , 113.32    , 115.510002, 116.779999,
       117.      , 117.190002, 116.949997, 116.709999, 116.489998,
       116.82    , 120.620003, 120.629997, 120.699997, 120.82    ,
       124.029999, 121.269997, 121.449997, 121.470001, 119.379997,
       119.410004, 118.720001, 117.980003, 118.739998, 118.660004,
       119.93    , 117.650002, 114.07    , 115.470001, 113.540001,
       113.410004, 114.639999, 115.790001, 117.989998, 117.720001,
       118.449997, 121.260002, 121.68    , 122.309998, 123.010

In [5]:
unique_symbols = pd.unique(prices.symbol)
sum_change = 0
count = 0
for symbol in unique_symbols:
    selected_data = prices[prices.symbol==symbol].close.to_numpy()
    for i in range(len(selected_data)-1):
        sum_change += (selected_data[i+1]-selected_data[i])/selected_data[i]
        count += 1
    clear_output(True)
    print('%{}'.format(count/len(prices)*100))
    print('{} / {}'.format(count, len(prices)))
average_change = sum_change / count * 100

%76.77770938275317
653581 / 851264


KeyboardInterrupt: 

In [None]:
average_change * 100

In [None]:
unique_symbols = pd.unique(prices.symbol)
count = 0
positive_sum_change = 0
positive_count = 0
negative_sum_change = 0
negative_count = 0
for symbol in unique_symbols:
    selected_data = prices[prices.symbol==symbol].close.to_numpy()
    for i in range(len(selected_data)-1):
        change = (selected_data[i+1]-selected_data[i])
        if change>0:
            positive_sum_change += (selected_data[i+1]-selected_data[i])/selected_data[i]
            positive_count += 1
        elif change<0:
            negative_sum_change += (selected_data[i+1]-selected_data[i])/selected_data[i]
            negative_count += 1
        count += 1
    clear_output(True)
    print('%{}'.format(count/len(prices)*100))
    print('{} / {}'.format(count, len(prices)))
average_positive_change = positive_sum_change / positive_count * 100
average_negative_change = negative_sum_change / negative_count * 100

In [None]:
average_positive_change

In [None]:
average_negative_change

In [None]:
prices.date[0]

In [3]:
prices["year"] = prices.date.str[:4]

In [4]:
prices.year

0         2016
1         2016
2         2016
3         2016
4         2016
          ... 
851259    2016
851260    2016
851261    2016
851262    2016
851263    2016
Name: year, Length: 851264, dtype: object

In [5]:
min(prices.date)

'2010-01-04'

In [6]:
details = pd.read_csv('../../data/854_1575_bundle_archive/securities.csv')
details[["Ticker symbol","GICS Sector","GICS Sub Industry"]]

Unnamed: 0,Ticker symbol,GICS Sector,GICS Sub Industry
0,MMM,Industrials,Industrial Conglomerates
1,ABT,Health Care,Health Care Equipment
2,ABBV,Health Care,Pharmaceuticals
3,ACN,Information Technology,IT Consulting & Other Services
4,ATVI,Information Technology,Home Entertainment Software
...,...,...,...
500,YHOO,Information Technology,Internet Software & Services
501,YUM,Consumer Discretionary,Restaurants
502,ZBH,Health Care,Health Care Equipment
503,ZION,Financials,Regional Banks


In [7]:
en_tokenizer = tf.keras.preprocessing.text.Tokenizer()
en_tokenizer.fit_on_texts(details["GICS Sub Industry"])

data_en = en_tokenizer.texts_to_sequences(details["GICS Sub Industry"])
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en,padding='post')
data_en

array([[  8,  13,   0,   0,   0],
       [  2,   3,   6,   0,   0],
       [ 31,   0,   0,   0,   0],
       ...,
       [  2,   3,   6,   0,   0],
       [114,  12,   0,   0,   0],
       [ 31,   0,   0,   0,   0]], dtype=int32)

In [8]:
en_tokenizer.get_config()

{'num_words': None,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'lower': True,
 'split': ' ',
 'char_level': False,
 'oov_token': None,
 'document_count': 505,
 'word_counts': '{"industrial": 30, "conglomerates": 21, "health": 43, "care": 40, "equipment": 35, "pharmaceuticals": 10, "it": 5, "consulting": 9, "other": 5, "services": 60, "home": 5, "entertainment": 2, "software": 31, "electrical": 2, "components": 4, "application": 5, "automotive": 1, "retail": 23, "independent": 2, "power": 2, "producers": 2, "energy": 2, "traders": 2, "managed": 6, "asset": 6, "management": 6, "custody": 6, "banks": 22, "life": 5, "insurance": 17, "gases": 2, "internet": 24, "airlines": 5, "specialty": 14, "chemicals": 13, "biotechnology": 7, "building": 5, "products": 15, "data": 4, "processing": 3, "outsourced": 3, "electric": 12, "utilities": 13, "property": 8, "casualty": 8, "tobacco": 3, "direct": 4, "marketing": 10, "multiutilities": 13, "consumer": 8, "finance": 7, "specialized": 3, "rei

In [9]:
import json 
len(json.loads(en_tokenizer.get_config()["word_index"]))

184

In [10]:
ts = np.append(np.array(details["Ticker symbol"]).reshape((505,1)),data_en,axis=-1)

In [11]:
ts[:,0]

array(['MMM', 'ABT', 'ABBV', 'ACN', 'ATVI', 'AYI', 'ADBE', 'AAP', 'AES',
       'AET', 'AMG', 'AFL', 'A', 'APD', 'AKAM', 'ALK', 'ALB', 'ALXN',
       'ALLE', 'AGN', 'ADS', 'LNT', 'ALL', 'GOOGL', 'GOOG', 'MO', 'AMZN',
       'AEE', 'AAL', 'AEP', 'AXP', 'AIG', 'AMT', 'AWK', 'AMP', 'ABC',
       'AME', 'AMGN', 'APH', 'APC', 'ADI', 'ANTM', 'AON', 'APA', 'AIV',
       'AAPL', 'AMAT', 'ADM', 'ARNC', 'AJG', 'AIZ', 'T', 'ADSK', 'ADP',
       'AN', 'AZO', 'AVB', 'AVY', 'BHI', 'BLL', 'BAC', 'BCR', 'BAX',
       'BBT', 'BDX', 'BBBY', 'BRK.B', 'BBY', 'BIIB', 'BLK', 'HRB', 'BA',
       'BWA', 'BXP', 'BSX', 'BMY', 'AVGO', 'BF.B', 'CHRW', 'CA', 'COG',
       'CPB', 'COF', 'CAH', 'KMX', 'CCL', 'CAT', 'CBG', 'CBS', 'CELG',
       'CNC', 'CNP', 'CTL', 'CERN', 'CF', 'SCHW', 'CHTR', 'CHK', 'CVX',
       'CMG', 'CB', 'CHD', 'CI', 'XEC', 'CINF', 'CTAS', 'CSCO', 'C',
       'CFG', 'CTXS', 'CME', 'CMS', 'COH', 'KO', 'CTSH', 'CL', 'CMCSA',
       'CMA', 'CAG', 'CXO', 'COP', 'ED', 'STZ', 'GLW', 'COST', 'COTY',


In [12]:
i = np.argwhere(ts[:,0]=='HIG')[0,0]

In [13]:
data_en.shape

(505, 5)

In [14]:
a = np.empty((2,data_en.shape[1]))

In [15]:
a[0] = ts[i][1:]

In [16]:
a[1] = ts[i][1:]

In [17]:
a

array([[41., 42., 17.,  0.,  0.],
       [41., 42., 17.,  0.,  0.]])

In [18]:
ts[i][1:]

array([41, 42, 17, 0, 0], dtype=object)

In [19]:
len(details["GICS Sector"].unique())

11

In [20]:
len(details["GICS Sub Industry"].unique())

124

In [21]:
fundamentals = pd.read_csv('../../data/854_1575_bundle_archive/fundamentals.csv')
fundamentals = fundamentals.drop(columns=fundamentals.columns[0]).dropna()
fundamentals["Key"] = fundamentals["Ticker Symbol"]+fundamentals["For Year"].astype(int).astype(str)
fundamentals = fundamentals.drop(columns="Ticker Symbol")
fundamentals = fundamentals.drop(columns="Period Ending")
fundamentals = fundamentals.drop(columns="For Year")
fundamentals

Unnamed: 0,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,Capital Surplus,Cash Ratio,Cash and Cash Equivalents,Changes in Inventories,Common Stocks,...,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,Earnings Per Share,Estimated Shares Outstanding,Key
0,3.068000e+09,-222000000.0,-1.961000e+09,23.0,-1.888000e+09,4.695000e+09,53.0,1.330000e+09,0.0,127000000.0,...,7.072000e+09,9.011000e+09,-7.987000e+09,2.489100e+10,1.690400e+10,2.485500e+10,-3.670000e+08,-5.60,3.350000e+08,AAL2012
1,4.975000e+09,-93000000.0,-2.723000e+09,67.0,-3.114000e+09,1.059200e+10,75.0,2.175000e+09,0.0,5000000.0,...,1.432300e+10,1.380600e+10,-2.731000e+09,4.500900e+10,4.227800e+10,2.674300e+10,0.000000e+00,-11.25,1.630222e+08,AAL2013
2,4.668000e+09,-160000000.0,-1.500000e+08,143.0,-5.311000e+09,1.513500e+10,60.0,1.768000e+09,0.0,7000000.0,...,1.175000e+10,1.340400e+10,2.021000e+09,4.120400e+10,4.322500e+10,4.265000e+10,0.000000e+00,4.02,7.169154e+08,AAL2014
3,5.102000e+09,352000000.0,-7.080000e+08,135.0,-6.151000e+09,1.159100e+10,51.0,1.085000e+09,0.0,6000000.0,...,9.985000e+09,1.360500e+10,5.635000e+09,4.278000e+10,4.841500e+10,4.099000e+10,0.000000e+00,11.39,6.681299e+08,AAL2015
4,2.409453e+09,-89482000.0,6.000000e+05,32.0,-2.711820e+08,5.202150e+08,23.0,5.981110e+08,-260298000.0,7000.0,...,3.184200e+09,2.559638e+09,1.210694e+09,3.403120e+09,4.613814e+09,6.205003e+09,-2.709500e+07,5.29,7.328355e+07,AAP2012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1771,2.255000e+08,-40400000.0,-3.480000e+07,11.0,-3.423000e+08,4.330700e+09,166.0,1.083300e+09,-164600000.0,2700000.0,...,4.313300e+09,1.024000e+09,6.549900e+09,3.108100e+09,9.658000e+09,4.673300e+09,-6.183700e+09,4.26,1.690845e+08,ZBH2014
1772,4.320000e+08,-56100000.0,-2.750000e+07,1.0,-4.341000e+08,8.195300e+09,100.0,1.459300e+09,-205400000.0,3000000.0,...,5.862900e+09,1.617900e+09,9.887900e+09,1.733160e+10,2.721950e+10,5.997800e+09,-6.329100e+09,0.78,1.884615e+08,ZBH2015
1777,1.381000e+09,-99000000.0,9.000000e+06,54.0,-1.840000e+08,8.780000e+08,43.0,6.100000e+08,-178000000.0,5000000.0,...,3.357000e+09,1.415000e+09,9.400000e+08,5.618000e+09,6.558000e+09,4.561000e+09,0.000000e+00,1.01,4.990099e+08,ZTS2013
1778,1.071000e+09,69000000.0,-7.000000e+06,44.0,-1.800000e+08,9.580000e+08,81.0,8.820000e+08,-110000000.0,5000000.0,...,3.465000e+09,1.086000e+09,1.311000e+09,5.277000e+09,6.588000e+09,4.785000e+09,0.000000e+00,1.16,5.025862e+08,ZTS2014


In [22]:
fundamentals["Key"]

0       AAL2012
1       AAL2013
2       AAL2014
3       AAL2015
4       AAP2012
         ...   
1771    ZBH2014
1772    ZBH2015
1777    ZTS2013
1778    ZTS2014
1779    ZTS2015
Name: Key, Length: 1299, dtype: object

In [23]:
prices["Key"] = prices.symbol+prices.year
#prices = prices.drop(columns="symbol")
#prices = prices.drop(columns="date")


In [24]:
prices["Key"]

0         WLTW2016
1         WLTW2016
2         WLTW2016
3         WLTW2016
4         WLTW2016
            ...   
851259     ZBH2016
851260    ZION2016
851261     ZTS2016
851262     AIV2016
851263     FTV2016
Name: Key, Length: 851264, dtype: object

In [25]:
prices

Unnamed: 0,date,symbol,open,close,low,high,volume,year,Key
0,2016-01-05,WLTW,123.430000,125.839996,122.309998,126.250000,2163600.0,2016,WLTW2016
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0,2016,WLTW2016
2,2016-01-07,WLTW,116.379997,114.949997,114.930000,119.739998,2489500.0,2016,WLTW2016
3,2016-01-08,WLTW,115.480003,116.620003,113.500000,117.440002,2006300.0,2016,WLTW2016
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0,2016,WLTW2016
...,...,...,...,...,...,...,...,...,...
851259,2016-12-30,ZBH,103.309998,103.199997,102.849998,103.930000,973800.0,2016,ZBH2016
851260,2016-12-30,ZION,43.070000,43.040001,42.689999,43.310001,1938100.0,2016,ZION2016
851261,2016-12-30,ZTS,53.639999,53.529999,53.270000,53.740002,1701200.0,2016,ZTS2016
851262,2016-12-30,AIV,44.730000,45.450001,44.410000,45.590000,1380900.0,2016,AIV2016


In [26]:
data = prices.merge(fundamentals, on='Key')
data

Unnamed: 0,date,symbol,open,close,low,high,volume,year,Key,Accounts Payable,...,Total Assets,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,Earnings Per Share,Estimated Shares Outstanding
0,2012-01-03,AAL,5.200000,5.120000,5.070000,5.220000,6105900.0,2012,AAL2012,3.068000e+09,...,2.351000e+10,7.072000e+09,9.011000e+09,-7.987000e+09,2.489100e+10,1.690400e+10,2.485500e+10,-367000000.0,-5.60,3.350000e+08
1,2012-01-04,AAL,5.090000,5.030000,4.970000,5.180000,5268700.0,2012,AAL2012,3.068000e+09,...,2.351000e+10,7.072000e+09,9.011000e+09,-7.987000e+09,2.489100e+10,1.690400e+10,2.485500e+10,-367000000.0,-5.60,3.350000e+08
2,2012-01-05,AAL,5.150000,5.470000,5.070000,5.480000,8216100.0,2012,AAL2012,3.068000e+09,...,2.351000e+10,7.072000e+09,9.011000e+09,-7.987000e+09,2.489100e+10,1.690400e+10,2.485500e+10,-367000000.0,-5.60,3.350000e+08
3,2012-01-06,AAL,5.440000,5.600000,5.400000,5.670000,8323000.0,2012,AAL2012,3.068000e+09,...,2.351000e+10,7.072000e+09,9.011000e+09,-7.987000e+09,2.489100e+10,1.690400e+10,2.485500e+10,-367000000.0,-5.60,3.350000e+08
4,2012-01-09,AAL,5.560000,5.720000,5.550000,5.800000,8029900.0,2012,AAL2012,3.068000e+09,...,2.351000e+10,7.072000e+09,9.011000e+09,-7.987000e+09,2.489100e+10,1.690400e+10,2.485500e+10,-367000000.0,-5.60,3.350000e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323947,2016-12-23,XLNX,60.380001,60.560001,60.080002,60.570000,1105700.0,2016,XLNX2016,3.072220e+08,...,4.823154e+09,3.918933e+09,9.460860e+08,2.589893e+09,2.233261e+09,4.823154e+09,2.213881e+09,0.0,2.14,2.574145e+08
323948,2016-12-27,XLNX,61.070000,61.240002,60.889999,61.580002,1594400.0,2016,XLNX2016,3.072220e+08,...,4.823154e+09,3.918933e+09,9.460860e+08,2.589893e+09,2.233261e+09,4.823154e+09,2.213881e+09,0.0,2.14,2.574145e+08
323949,2016-12-28,XLNX,62.139999,60.630001,60.590000,62.240002,2141000.0,2016,XLNX2016,3.072220e+08,...,4.823154e+09,3.918933e+09,9.460860e+08,2.589893e+09,2.233261e+09,4.823154e+09,2.213881e+09,0.0,2.14,2.574145e+08
323950,2016-12-29,XLNX,60.599998,60.740002,60.360001,61.119999,1398100.0,2016,XLNX2016,3.072220e+08,...,4.823154e+09,3.918933e+09,9.460860e+08,2.589893e+09,2.233261e+09,4.823154e+09,2.213881e+09,0.0,2.14,2.574145e+08


In [27]:
window_size = 32
run_lenght = 32
sampled_symbols = random.sample(list(data.symbol.unique()), 3)
data[data.symbol.isin(sampled_symbols)]
batch_data = []
for symbol in sampled_symbols:
    index = random.randint(window_size+run_lenght, len(data[data.symbol==symbol]))
    selected_data = data[data.symbol==symbol][index-(window_size+run_lenght):index]
    selected_data = selected_data.drop(["symbol","date","year","Key"],axis=1)
    batch_data.append(selected_data)   
batch_data[2]

Unnamed: 0,open,close,low,high,volume,Accounts Payable,Accounts Receivable,Add'l income/expense items,After Tax ROE,Capital Expenditures,...,Total Assets,Total Current Assets,Total Current Liabilities,Total Equity,Total Liabilities,Total Liabilities & Equity,Total Revenue,Treasury Stock,Earnings Per Share,Estimated Shares Outstanding
187977,23.240000,23.400000,22.910000,23.480000,4684200.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
187978,23.500000,23.160000,23.070000,23.520000,4283300.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
187979,22.990000,23.309999,22.889999,23.430000,4414800.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
187980,23.330000,23.740000,23.230000,23.740000,5886600.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
187981,24.070000,24.540001,23.910000,24.590000,12448000.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188036,25.860001,25.290001,25.030001,25.889999,6349300.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
188037,25.459999,25.540001,25.209999,25.610001,5863800.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
188038,25.389999,25.370001,25.180000,25.559999,6760700.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08
188039,25.370001,25.590000,25.100000,25.620001,6588700.0,787000000.0,0.0,157000000.0,5.0,-1.110000e+09,...,2.491600e+10,5.439000e+09,2.198000e+09,1.027400e+10,1.464200e+10,2.491600e+10,7.292000e+09,0.0,1.02,4.980392e+08


In [36]:
sampled_symbols = list(data.symbol.unique())
lens = []
for symbol in sampled_symbols:
    lens.append(len(data[data.symbol==symbol]))

NameError: name 'data' is not defined

In [37]:
min(lens)

NameError: name 'lens' is not defined

In [None]:
fit_data = data.drop(["symbol","date","year","Key"], axis=1)
scaler = preprocessing.StandardScaler().fit(fit_data)
selected_data = scaler.transform(batch_data[0])
batch_data[0]

In [None]:
batch_data[0].close.to_numpy()[2]

In [None]:
returndata = list(map(lambda x: scaler.transform(x[0:4]),batch_data))

In [None]:
batch_data[0].iloc[[0]].close

In [None]:
unique_symbols = pd.unique(data.symbol)
sampled_symbols = np.random.choice(unique_symbols, int(len(unique_symbols)*0.2))
test_batch_data = []
window_size = 32
run_lenght = 32
for symbol in sampled_symbols:
    end_index = random.randint(window_size+run_lenght, len(data[data.symbol==symbol]))
    selected_data = data[data.symbol==symbol][end_index-(window_size+run_lenght):end_index]
    # selected_data = selected_data.drop(["symbol","date","year","Key"], axis=1)
    test_batch_data.append(selected_data)
test_batch_data    


In [None]:
test_batch_data[3]

In [None]:
unique_symbols = pd.unique(data.symbol)
sampled_symbols = np.random.choice(unique_symbols, int(len(unique_symbols)*0.2), replace=False)

In [None]:
len(unique_symbols)

In [None]:
len(np.array([i for i in unique_symbols if i not in sampled_symbols]))

In [None]:
len(np.setdiff1d(unique_symbols, sampled_symbols))+len(sampled_symbols)

In [None]:
len(np.setdiff1d(unique_symbols, sampled_symbols))

In [None]:
np.random.choice(np.setdiff1d(unique_symbols, sampled_symbols), 32, replace=False)

In [None]:
len(sampled_symbols)

In [None]:
batch_data[0].to_numpy()