In [476]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import set_config; set_config(display='diagram')
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_validate

In [477]:
df = pd.read_csv('../data/stocks_quarterly.csv')

In [478]:
df.drop_duplicates(inplace=True)

In [479]:
cols_to_drop = df.isnull().mean().sort_values(ascending = False).head(45).index

In [480]:
cols_to_drop = list(cols_to_drop) + ['fiscalDateEnding','reportedDate','price','nasd_price','next_year_date','next_year_price','nasd_ny_price','Nasdaq_Performance', 'Stock_Performance']

In [481]:
df.drop(columns = cols_to_drop, inplace = True)

In [482]:
df = df.fillna(0)

In [485]:
columns = df.columns[0:-2]

In [486]:
column_transformer = make_column_transformer((MinMaxScaler(),columns),
                                              remainder="passthrough")

pipe = make_pipeline(column_transformer, KNNImputer())

In [487]:
X = column_transformer .fit_transform(df)


In [488]:
X.shape

(8776, 48)

In [489]:
df

Unnamed: 0,grossProfit,totalRevenue,costOfRevenue,costofGoodsAndServicesSold,operatingIncome,sellingGeneralAndAdministrative,operatingExpenses,interestExpense,depreciationAndAmortization,incomeBeforeTax,...,cashflowFromFinancing,proceedsFromRepurchaseOfEquity,changeInCashAndCashEquivalents,netIncome_y,reportedEPS,estimatedEPS,surprise,surprisePercentage,symbol,Label
0,4.027000e+09,4.027000e+09,4.485000e+09,1.240000e+09,-2.515000e+09,186000000.0,1.127000e+09,376000000.0,483000000.0,-2.841000e+09,...,1.269000e+09,1.443000e+09,-2.515000e+09,-2.178000e+09,-3.86,-4.1212,0.2612,6.3380,AAL,0
1,-2.310000e+09,2.833000e+09,5.143000e+09,1.156000e+09,-2.871000e+09,70000000.0,1.941000e+09,340000000.0,498000000.0,-3.095000e+09,...,1.511000e+09,0.000000e+00,-2.399000e+09,-2.399000e+09,-5.54,-5.7547,0.2147,3.7309,AAL,1
2,-2.601000e+09,1.368000e+09,3.969000e+09,8.660000e+08,-2.486000e+09,43000000.0,1.133000e+09,254000000.0,499000000.0,-2.659000e+09,...,7.688000e+09,1.525000e+09,-2.067000e+09,-2.067000e+09,-7.82,-7.8956,0.0756,0.9575,AAL,1
3,1.241000e+09,8.258000e+09,7.017000e+09,2.197000e+09,-2.549000e+09,305000000.0,2.857000e+09,257000000.0,560000000.0,-2.890000e+09,...,5.260000e+08,-1.710000e+08,1.960000e+08,-2.241000e+09,-2.65,-2.3551,-0.2949,-12.5218,AAL,1
4,1.118900e+10,1.131300e+10,1.240000e+08,2.633000e+09,7.290000e+08,503000000.0,1.240000e+08,265000000.0,513000000.0,5.710000e+08,...,-1.272000e+09,-2.720000e+08,7.710000e+08,4.140000e+08,1.15,1.1559,-0.0059,-0.5104,AAL,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9103,2.969290e+08,7.005270e+08,4.035980e+08,4.035980e+08,-3.611350e+08,166667000.0,8.724380e+08,127513000.0,191590000.0,-4.899980e+08,...,-2.341000e+07,0.000000e+00,9.212000e+06,-5.054890e+08,0.67,0.5457,0.1243,22.7781,ENDP,1
9104,2.629950e+08,7.686400e+08,5.056450e+08,5.056450e+08,-3.035760e+08,161199000.0,4.413000e+08,129215000.0,240829000.0,-5.211930e+08,...,-3.164000e+07,0.000000e+00,-1.519050e+08,-3.684170e+08,0.77,0.6119,0.1581,25.8376,ENDP,1
9105,2.723650e+08,7.868870e+08,5.145220e+08,1.619700e+07,-2.372000e+06,135880000.0,4.559370e+08,128672000.0,243280000.0,-1.247790e+08,...,-3.577000e+07,0.000000e+00,4.103700e+07,-9.667000e+07,0.91,0.8504,0.0596,7.0085,ENDP,1
9106,3.363300e+08,8.757310e+08,5.394010e+08,5.428100e+07,-5.867280e+08,155555000.0,1.114758e+09,123354000.0,212801000.0,-1.453998e+09,...,-4.638900e+07,0.000000e+00,8.359200e+07,-1.396518e+09,0.93,0.7302,0.1998,27.3624,ENDP,1


In [490]:
df = pd.DataFrame(X,columns = df.columns)

In [492]:
y = np.array(df.groupby('symbol').head(1)['Label'])

In [493]:
tickers  = df['symbol'].unique()

In [494]:
new_data = []

for ticker in tickers:
    
    temp = df[df['symbol'] == ticker]
    temp.drop(columns = 'symbol', inplace = True)

    if len(temp)<17:
        add = pd.DataFrame(data=np.full((17 - len(temp),temp.shape[1]), -100_000),columns=temp.columns)
        temp = pd.concat([temp,add])
    
    new_data.append(np.array(temp))
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [495]:
X_new = np.array(new_data).astype('float32')

In [496]:
X_new.shape

(775, 17, 47)

In [497]:
y = y.astype('float32')

In [498]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new,y,test_size=0.2)

In [499]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Flatten,Dropout,Masking

def init_model():

    model = Sequential()
    model.add(Masking(mask_value=-100_000, input_shape=(17,47)))
    model.add(LSTM(50, activation = 'relu'))
    model.add(Dense(50, activation = 'relu'))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation = 'sigmoid'))
    
    model.compile(loss = 'BinaryCrossentropy',
                  optimizer = 'rmsprop',
                  metrics =['accuracy'])
    
    
    return model
    

In [500]:
from tensorflow.keras.callbacks import EarlyStopping

model = init_model()
es = EarlyStopping(patience=1)

history = model.fit(X_train, y_train, 
                      batch_size=16, 
                      epochs=40, 
                      validation_split=0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [502]:
model.evaluate(X_test, y_test)



[1.4515619568555849e-06, 1.0]

In [504]:
y_train

array([0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 1.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
       1., 0., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1., 1., 0.,
       1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 1.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
       1., 0., 0., 1., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 1., 0.,
       1., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 1.,
       0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1.,
       0., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 0., 0., 1.,
       0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 1., 0., 1., 1.