In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Activation, Dense, Conv1D, MaxPooling1D, MaxPooling2D, AveragePooling2D, Conv2D, Flatten, Dropout, Input, concatenate, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from scipy import stats
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import random


#The following is needed in order to display every column of a given row inside a database
pd.options.display.max_columns = None

The idea of this notebook is to play around with a dataset obtained from Kaggle at the following link:
https://www.kaggle.com/datasets/dgawlik/nyse

Inside of the link there will be four .csv files:
- prices;
- fundamentals;
- securities;
- prices adjusted;

The description for each of these is found at the link above. The notebook is devoted to try and apply a very simple 1-feature linear regression between two columns of the dataset "fundamentals", here named "dataset_fundamentals".

The first thing I've done is to load the datasets:

In [2]:
dataset_prices = pd.read_csv(r'NewYorkStockExchange/prices.csv')
dataset_fundamentals = pd.read_csv(r'NewYorkStockExchange/fundamentals.csv')
dataset_securities = pd.read_csv(r'NewYorkStockExchange/securities.csv')
dataset_adjusted = pd.read_csv(r'NewYorkStockExchange/prices-split-adjusted.csv')

Check if there are any columns inside the database that contain string-like values; if this happens then store those inside the "col_to_numeric" list for later use.

In [3]:
col_to_numeric = []

for column in dataset_fundamentals:
    if type(dataset_fundamentals[column][0]) == type('stringa'):
        col_to_numeric = np.append(col_to_numeric, column)
        print(column)

Ticker Symbol
Period Ending


Using the above list, it is possible to employ the "to_numeric()" method to parse these string values into numeric values; the "errors = coerce" argument makes sure that if the parsing cannot be done, the value employed is a NaN, to be treated later.

In [4]:
for col in col_to_numeric:
    dataset_fundamentals[col] = pd.to_numeric(dataset_fundamentals[col], errors = 'coerce')

Knowing which columns contain string values, after parsing the values into numericals which result in most of them being NaN, these are filled with 0's. This doesn't represent a problem in the dataset, since the training via a ML model will be done on other columns.

In [5]:
for col in col_to_numeric:
    dataset_fundamentals[col] = dataset_fundamentals[col].fillna(0)

Check whether there are any NaN values left over.

In [6]:
dataset_fundamentals.isna().sum()

Unnamed: 0                        0
Ticker Symbol                     0
Period Ending                     0
Accounts Payable                  0
Accounts Receivable               0
                               ... 
Total Revenue                     0
Treasury Stock                    0
For Year                        173
Earnings Per Share              219
Estimated Shares Outstanding    219
Length: 79, dtype: int64

Here is the implementation of seaborn via the construction of a correlation matrix, which gives an idea of how much correlated the different columns inside the dataset are. The values range from 0 to 1 for correlation, 0 meaning no correlation at all and 1 meaning very strong correlation. To help the search of closely correlated columns, from the correlation matrix I only kept the correlation values higher than or equal to 0.8.

In [7]:
%matplotlib qt

corr_matrix = dataset_fundamentals.corr()
above_corr = corr_matrix[corr_matrix >= 0.8]
plt.figure(figsize = (20, 9))
sns.heatmap(above_corr, cmap = 'Blues', center = 0, annot = False, square = True)

<AxesSubplot:>

I picked two columns that were strongly correlated and decided to saw how this correlation looked like. Since the idea was to go for a ML model for a linear regression, I was looking for two columns that were linearly correlated with one another.

In [8]:
sns.pairplot(dataset_fundamentals[['Net Cash Flow-Operating', 'Earnings Before Interest and Tax']], diag_kind = 'kde')

<seaborn.axisgrid.PairGrid at 0x18e578c05e0>

The following is a part of preprocessing, both for the features as well as for the labels. Noting that the values to train the model on weren't many, I've decided to generate more. But in order to do so I needed to find the distribution according to which the values were distributed, to then sample from there.

The following cell details how I did this, while the one after this one takes all of these same steps but applies them to the labels.
First of all I took note of the distribution of values in the column of interest for the features, here 'Net Cash Flow-Operating'; then I normalized the heights of the bins under "len_trial"; after that I took out the last value of the edges of the bins, in "arr_hist_trial"; these steps are needed for the np.random.choice function, which generates samples that are stored in "features_distrib".
The plot resulting from this cell shows how distributed the original values (in blue) and the generated values (in orange) are.

In [9]:
k = 250
size = 1500

arr_hist = plt.hist(dataset_fundamentals['Net Cash Flow-Operating'], bins = k)
len_trial = arr_hist[0] / np.sum(arr_hist[0])
arr_hist_trial = arr_hist[1][:-1]

features_distrib = np.random.choice(a = arr_hist_trial, p = len_trial, size = size)

plt.hist(features_distrib, bins = k)

(array([  3.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   3.,   4.,  10.,  17.,  67.,
        376., 309., 151.,  64.,  92.,  75.,  50.,  34.,  36.,  19.,  21.,
         13.,  21.,  11.,  10.,   9.,   6.,   6.,   8.,   7.,   8.,   0.,
          1.,   0.,   7.,   5.,   2.,   2.,   0.,   4.,   0.,   1.,   3.,
          2.,   1.,   0.,   0.,   0.,   3.,   0.,   1.,   0.,   0.,   1.,
          0.,   0.,   2.,   0.,   0.,   2.,   1.,   0.,   0.,   0.,   2.,
          0.,   2.,   2.,   0.,   1.,   1.,   3.,   1.,   0.,   0.,   0.,
          0.,   0.,   0.,   3.,   0.,   1.,   0.,   3.,   0.,   0.,   0.,
          1.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   1.,   0.,   3.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0., 

The same is done here, but for the labels which are taken as the column of the dataset, named 'Earnings Before Interest and Tax'.

In [10]:
arr_hist = plt.hist(dataset_fundamentals['Earnings Before Interest and Tax'], bins = k)
len_trial = arr_hist[0] / np.sum(arr_hist[0])
arr_hist_trial = arr_hist[1][:-1]

labels_distrib = np.random.choice(a = arr_hist_trial, p = len_trial, size = size)

plt.hist(labels_distrib, bins = k)

(array([  1.,   0.,   0.,   0.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   4.,   0.,   0.,
          0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,
          0.,   0.,   1.,   1.,   0.,   0.,   0.,   4.,   0.,   3.,   2.,
          0.,   4.,   0.,   2.,   4.,   3.,   7.,  11., 122., 335., 280.,
        171.,  77.,  68.,   0.,  46.,  38.,  46.,  35.,  26.,  20.,   9.,
         18.,  14.,  14.,   7.,   4.,  11.,  10.,   0.,   2.,   5.,   8.,
          3.,   4.,   0.,   4.,   3.,   6.,   1.,   3.,   1.,   5.,   1.,
          0.,   0.,   4.,   3.,   0.,   0.,   0.,   4.,   1.,   0.,   1.,
          0.,   0.,   0.,   0.,   0.,   4.,   0.,   0.,   0.,   2.,   0.,
          1.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   0.,   3.,
          1.,   2.,   5.,   0.,   0.,   0.,   0.,   0.,   0.,   2.,   1.,
          0.,   2.,   0.,   1.,   0.,   0.,   0.,   0.,   0.,   1.,   2.,
          1.,   0.,   0.,   0.,   0., 

The scatter plot shows how the generated and the original values fare one against the other.

In [11]:
plt.scatter(features_distrib, labels_distrib)
plt.scatter(dataset_fundamentals['Net Cash Flow-Operating'], dataset_fundamentals['Earnings Before Interest and Tax'])

<matplotlib.collections.PathCollection at 0x18e63462a60>

Add the generated data to the original one, both for the features and for the labels.

In [12]:
features_noisy = np.concatenate((dataset_fundamentals['Net Cash Flow-Operating'], features_distrib), axis = 0)
labels_noisy = np.concatenate((dataset_fundamentals['Earnings Before Interest and Tax'], labels_distrib), axis = 0)

Another scatter plot that shows how the final data looks like with respect to the original one.

In [13]:
plt.scatter(dataset_fundamentals['Earnings Before Interest and Tax'], dataset_fundamentals['Net Cash Flow-Operating'])
plt.scatter(features_noisy, labels_noisy)

<matplotlib.collections.PathCollection at 0x18e676afdc0>

Now, the data is prepared for the ML model.
First, shuffle randomly but with the same seed the two vectors prepared earlier.
From the "features_noisy" numpy array, both the train as well as the test features are obtained via the train_test_split function and a test_size set to 0.3. The same is repeated for the train and test labels, starting from the "labels_noisy" array. Later, since the feature and the label train arrays slightly differ in terms of scales, a normalization process starts, with the employment of the StandardScaler() function for both the x and the y.


In [14]:
k = 99

random.Random(k).shuffle(features_noisy)
random.Random(k).shuffle(labels_noisy)


train_features, test_features = train_test_split(features_noisy, test_size = 0.3, random_state = 0)
train_labels, test_labels = train_test_split(labels_noisy, test_size = 0.3, random_state = 0)

scaler_X = StandardScaler()
x_train = scaler_X.fit_transform(train_features.reshape(-1, 1)).flatten()

scaler_Y = StandardScaler()
y_train = scaler_Y.fit_transform(train_labels.reshape(-1, 1)).flatten()

Build the model, here called "model". This consists in a very simple model containing only 1 Dense node and 1 Dropout node, which goes to eliminate 40% of the data fed to the model each time. I included this since it let the model avoid overfitting and actually learn something. As it can be seen by the model summary, the total number of parameters equals 2, which is necessary for a linear regression to a straight line. The optimizer chosen is Adam, while the quantity to be monitored for the loss is the mean squared error.
A brief prediction on how the model handles the first ten values of the "train_features" array is also shown, in order to be sure that the shape of the output is the correct one.

In [32]:
model = tf.keras.Sequential([
    layers.Dense(units = 1, input_shape = (1,)),
    Dropout(0.4)
])

model.summary()
model.compile(optimizer = 'Adam', loss = 'mse')
keras.utils.plot_model(model)
model.predict(train_features[:10])

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 1)                 2         
_________________________________________________________________
dropout_3 (Dropout)          (None, 1)                 0         
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________


array([[8.3276211e+09],
       [4.8843505e+09],
       [4.5204490e+08],
       [4.8251725e+09],
       [4.0318523e+09],
       [1.2189226e+10],
       [1.2210601e+09],
       [2.9127124e+09],
       [7.7075743e+09],
       [1.3673005e+09]], dtype=float32)

In [33]:
epochs = 15
history = model.fit(x_train, y_train, epochs = epochs, verbose = 1, batch_size = 3, validation_split = 0.5)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [34]:
train_loss_fn = history.history["loss"]
eval_loss_fn = history.history["val_loss"]
epochs_range = range(epochs)

plt.figure(figsize=(8,8))
plt.subplot(1, 1, 1)
plt.plot(epochs_range, train_loss_fn, label = "Train Loss function")
plt.plot(epochs_range, eval_loss_fn, label = "Eval Loss function")
plt.legend(loc = "upper right")
plt.title("Loss function")
plt.xlabel("Epochs")
plt.grid()

plt.yscale('log')
plt.ylabel("Loss function")
plt.show()

In [35]:
preds = scaler_Y.inverse_transform((model.predict(scaler_X.transform(test_features.reshape(-1, 1)).flatten())))

In [36]:
x = np.linspace(np.min(dataset_fundamentals['Net Cash Flow-Operating']), np.max(dataset_fundamentals['Net Cash Flow-Operating']), len(test_labels))
y = scaler_Y.inverse_transform(model.predict(scaler_X.transform(x.reshape(-1, 1)).flatten()))

plt.scatter(dataset_fundamentals['Net Cash Flow-Operating'], dataset_fundamentals['Earnings Before Interest and Tax'],
           label = 'Data')
plt.plot(x, y, color = 'k', label = 'Predictions')
plt.xlabel('Net Cash Flow-Operating')
plt.ylabel('Earnings Before Interest and Tax')
plt.legend()

<matplotlib.legend.Legend at 0x18e72c8a7f0>

In [38]:
from sklearn.metrics import r2_score
r2_score(test_labels, preds)

0.2855159048107925