In [1]:
# imports
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import math
import json
import matplotlib.pyplot as plt
from preprocessing_tools import check_for_nan_columns, drop_nan_columns
from ml_tools import stratified_clf_cv_accuracy
from functools import partial
from scipy import stats
from scipy.stats import randint

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import (train_test_split, cross_val_score, 
                                     GridSearchCV, RandomizedSearchCV,
                                     cross_val_predict)
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import mean_squared_error, make_scorer, confusion_matrix

In [2]:
# Name of the label column; the 1Q/3Q/4Q variants are discarded below so that
# only the 2Q horizon is modelled.
TARGET_COL = 'priceRatioRelativeToS&P_2Q'

df = pd.read_parquet('NYSE_dataset_v2_linregress.parquet')
df = df.drop(['priceRatioRelativeToS&P_1Q', 'priceRatioRelativeToS&P_3Q', 
              'priceRatioRelativeToS&P_4Q'], axis=1)

# making the output binary: 1 when the ratio is >= 1, otherwise 0.
# A vectorized comparison gives the same labels as a row-wise apply without
# calling a Python lambda once per row.
# NOTE(review): a NaN ratio compares as False and is therefore labelled 0
# (this was also the case with the original lambda) -- confirm the target
# column has no missing values.
df[TARGET_COL] = (df[TARGET_COL] >= 1).astype('int64')

# Features are every remaining column; the label is kept as a one-column frame.
input_df = df.drop(TARGET_COL, axis=1).copy()
output_df = df[TARGET_COL].copy().to_frame()

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(input_df, 
                                                    output_df,
                                                    random_state=42,
                                                    test_size=0.2
                                                   )

# scaling data: the scaler is fitted on the training features only and then
# applied unchanged to the test features.
input_scaler = StandardScaler()
X_train_scaled = input_scaler.fit_transform(X_train)
X_test_scaled = input_scaler.transform(X_test)

# Flatten the one-column label frames into 1-D arrays.
y_train = np.array(y_train).ravel()
y_test = np.array(y_test).ravel()
# categorical outputs don't need to be scaled

## Stochastic Gradient Descent
We first inspect the distribution of outcomes in the y_train set to determine if there is any imbalance in the classes. We anticipate each class to represent around 50% of the total. This is intuitive: the metric measures whether a company's stock price has done better than the market average. We would expect half of the companies to have done better than the market, and half worse.

In [3]:
# Share of training samples labelled 0 (mean of the boolean mask).
(y_train == 0).mean()

0.5128573648228714

We thus see that our intuition is correct: class 0 makes up about 51.3% of the training set, so the two classes are close to evenly split. **Since the outcomes in the dataset are balanced, we can safely use accuracy as a performance metric on this data.** We now proceed to train the SGDClassifier.

In [4]:
# Linear classifier trained with stochastic gradient descent on the
# standardized training features; the seed is fixed for reproducibility.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train_scaled, y_train)

# 5-fold cross-validated accuracy on the training set, one score per fold.
sgd_cv_accuracy = cross_val_score(
    estimator=sgd_clf,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
sgd_cv_accuracy

array([0.52631579, 0.53777433, 0.49873762, 0.53311323, 0.53496503])

In [6]:
# Sanity check: the project's own stratified CV helper, run with the same
# classifier, data and number of splits, reproduces the fold accuracies
# reported by cross_val_score.
custom_cv_accuracy = stratified_clf_cv_accuracy(sgd_clf, X_train_scaled, y_train, n_splits=5)
custom_cv_accuracy

array([0.52631579, 0.53777433, 0.49873762, 0.53311323, 0.53496503])

In [7]:
# comparison to a dummy classifier: a baseline that ignores the feature
# values, so any useful model should beat its accuracy.
dummy_clf = DummyClassifier()
# Fit on the same scaled array that is used for cross-validation (and for the
# SGD classifier), so the fitted baseline matches the data it is evaluated on.
# The baseline's scores do not depend on the feature values.
dummy_clf.fit(X_train_scaled, y_train)
cross_val_score(dummy_clf, X_train_scaled, y_train, cv=5, scoring='accuracy')

array([0.51272092, 0.51291513, 0.51291513, 0.51291513, 0.51282051])

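We see that our SGDClassifier is only marginally better than a dummy classifier (a mean cross-validated accuracy of roughly 0.53 versus 0.51). This is not very impressive! We now inspect the confusion matrix. We see that we are getting many false positives and many false negatives. The model seems to be slightly better at predicting the negative outcomes: it correctly classifies about 57% of the class 0 samples (7531 of 13203) but only about 48% of the class 1 samples (6015 of 12541).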

In [11]:
# Out-of-fold predictions: every training sample is predicted by a model
# fitted on the folds that did not contain it.
y_train_pred = cross_val_predict(
    estimator=sgd_clf,
    X=X_train_scaled,
    y=y_train,
    cv=5,
)

# Confusion matrix of true labels against the out-of-fold predictions
# (scikit-learn convention: rows are true classes, columns are predictions).
cm = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
cm

array([[7531, 5672],
       [6526, 6015]], dtype=int64)