In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (accuracy_score, confusion_matrix, roc_curve, auc, classification_report, 
ConfusionMatrixDisplay, RocCurveDisplay, precision_recall_curve, PrecisionRecallDisplay, PredictionErrorDisplay, make_scorer)
from sklearn.utils.discovery import all_displays
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import LearningCurveDisplay, learning_curve
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample


from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from pyrcn.echo_state_network import ESNClassifier

from keras.models import Sequential
from keras.layers import LSTM, Dense

import torch

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

2024-04-21 22:21:45.454681: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Read the full IPO dataset from disk into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
data = pd.read_csv(
    filepath_or_buffer='/Users/michael/Documents/MA Stats/STAT 8090/final dataset/full dataset.csv'
)

# Preview the first five rows.
data.head(n=5)

Unnamed: 0,Symbol,Company_Name,Exchange,Orig_Range_Low,Orig_Range_High,Price,Shares,Offer_Amount,ipo_date,IPO_Year,...,plustwo_volume_pct,plusthree_change_pct,plusthree_volume_pct,plus90_change_pct,plus90_volume_pct,plus180_change_pct,plus180_volume_pct,d1_d180,d1_pop,d1_d180_pre_price
0,KVYO,Klaviyo,NYSE,25.0,27.0,30.0,19200000,576000000,9/20/23,2023,...,0.093943,0.007383,0.130651,-0.048713,0.053359,0.02205,0.084599,0,1,1
1,CART,Maplebear,Nasdaq,26.0,28.0,30.0,22000000,660000000,9/19/23,2023,...,0.330195,-0.04943,0.2027,0.014551,0.063364,0.039052,0.193695,1,1,1
2,NMRA,Neumora Therapeutics,Nasdaq,16.0,18.0,17.0,14710000,250070000,9/15/23,2023,...,0.064895,0.063768,0.067886,-0.000683,0.01361,-0.073217,0.260952,0,0,0
3,ARM,Arm Holdings,Nasdaq,47.0,51.0,51.0,95500000,4870500000,9/14/23,2023,...,0.191713,-0.040964,0.171408,0.003058,0.07355,0.036,0.190792,1,1,1
4,SRFM,Surf Air Mobility,NYSE,20.0,20.0,20.0,20423622,408472440,7/27/23,2023,...,0.171561,-0.204082,0.153038,0.05042,0.001763,-0.015504,0.001312,0,0,0


In [3]:
# Replace each categorical text column with integer codes.
# The single encoder is re-fitted for every column, so after this cell it
# only remembers the mapping of the last column ('Industry').
label_encoder = LabelEncoder()
for categorical_column in ('Exchange', 'Sector', 'Industry'):
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

In [4]:
# Columns removed before modelling. The list mixes identifier/descriptive
# fields with the open/close/volume and percentage-change measurements and the
# alternative outcome flags.
# NOTE(review): judging by their names, the price/volume columns are observed
# on or after the IPO date and would leak the outcome - confirm with the data dictionary.
columns_to_drop = [
    'Symbol', 'Company_Name', 'ipo_date', 'Country_HQ', 'Pitchbook_Number',
    'Orig_Range_Low', 'Orig_Range_High', 'Year_Founded', 'Raised_to_IPO',
    'ipo_date_open', 'ipo_date_close', 'ipo_date_volume',
    'plusone_open', 'plusone_close', 'plusone_volume',
    'plustwo_open', 'plustwo_close', 'plustwo_volume',
    'plusthree_open', 'plusthree_close', 'plusthree_volume',
    'plus90_open', 'plus90_close', 'plus90_volume',
    'plus180_open', 'plus180_close', 'plus180_volume',
    'day_one_bump',
    'ipo_date_change_pct', 'ipo_date_volume_pct',
    'plusone_change_pct', 'plusone_volume_pct',
    'plustwo_change_pct', 'plustwo_volume_pct',
    'plusthree_change_pct', 'plusthree_volume_pct',
    'plus90_change_pct', 'plus90_volume_pct',
    'plus180_change_pct', 'plus180_volume_pct',
    'd1_d180', 'd1_d180_pre_price',
]
data = data.drop(columns=columns_to_drop)

In [5]:
# Preview the first five rows of the reduced frame to confirm which columns remain.
data.head(n=5)

Unnamed: 0,Exchange,Price,Shares,Offer_Amount,IPO_Year,Years_to_IPO,Price_Low_Delta,Price_High_Delta,Sector,Industry,...,market_min5,market_min4,market_min3,market_min2,market_min1,market_min0,sent_negative,sent_neutral,sent_positive,d1_pop
0,0,30.0,19200000,576000000,2023,11,0.2,0.111111,8,79,...,15507.15789,15230.52727,15560.37143,16104.995,15980.94783,15786.03,1,0,0,1
1,1,30.0,22000000,660000000,2023,11,0.153846,0.071429,1,53,...,12073.46421,12446.53,13463.7781,14033.091,13739.49652,13585.84,1,1,0,1
2,1,17.0,14710000,250070000,2023,4,0.0625,-0.055556,5,12,...,12073.46421,12446.53,13463.7781,14033.091,13739.49652,13585.84,0,0,0,0
3,1,51.0,95500000,4870500000,2023,33,0.085106,0.0,8,77,...,12073.46421,12446.53,13463.7781,14033.091,13739.49652,13585.84,1,1,0,1
4,0,20.0,20423622,408472440,2023,12,0.0,0.0,6,1,...,15827.17895,15069.23478,15507.15789,15230.52727,15560.37143,16104.995,0,0,0,0


#### Echo State Network

In [6]:
# Seed NumPy's global random generator so draws made through it are repeatable
# when the notebook is run top to bottom.
np.random.seed(seed=8090)

In [7]:
# Separate the predictors from the binary target `d1_pop`.
# 'Unnamed: 0' is the row index that was written into the CSV; it is not a
# real predictor, so it is dropped together with the target
# (errors='ignore' keeps the cell working if the column is absent).
X = data.drop(columns=['d1_pop', 'Unnamed: 0'], errors='ignore')
y = data['d1_pop']

# Hold out 30% of the rows for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8090)

# Balance the classes in the training portion only. The sampler is seeded
# explicitly so the result does not depend on which cells ran before this one.
under_sampler = RandomUnderSampler(random_state=8090)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)

# Report Industry codes that exist in the data but not in the training set.
# The previous check compared the training set with itself, so it could never
# trigger, and its fallback (re-splitting with the same seed) would only have
# reproduced the same split while discarding the undersampling.
missing_industries = sorted(set(X['Industry']) - set(X_train['Industry']))
if missing_industries:
    print(f"{len(missing_industries)} Industry code(s) absent from the training set: {missing_industries}")

In [8]:
num_bootstrap_samples = 1000

# Draw `num_bootstrap_samples` replicates of the training set, each sampled
# with replacement; every call yields the redrawn (X, y) pair.
# NOTE(review): these replicates are concatenated later and then fed to
# cross-validation, so copies of the same row can sit in both the fitting and
# the validation folds - confirm this is intended.
bootstrap_draws = [
    resample(X_train, y_train, replace=True)
    for _ in range(num_bootstrap_samples)
]

# Split the pairs into one list of feature frames and one list of targets.
bootstrap_samples_X = [draw[0] for draw in bootstrap_draws]
bootstrap_samples_y = [draw[1] for draw in bootstrap_draws]

In [9]:
# Stack all bootstrap replicates row-wise into one training set.
# This overwrites X_train / y_train, so re-running the bootstrap cell after
# this one would resample the already enlarged data.
X_train = pd.concat(objs=bootstrap_samples_X, axis=0)
y_train = pd.concat(objs=bootstrap_samples_y, axis=0)

In [10]:
# Standardise the features: the scaler is fitted on the training matrix only
# and then applied unchanged to the held-out test matrix.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate hyper-parameter values for the grid search.
# NOTE(review): confirm every key below is a parameter that ESNClassifier
# actually accepts; the per-key descriptions are the author's.
param_grid = dict(
    n_reservoir=[1, 2, 3],              # number of reservoir units
    spectral_radius=[0.4, 0.5, 0.6],    # spectral radius
    leakage=[0.9, 0.95, 1.0],           # leakage rate
    solver=['lsqr', 'ridge'],           # solver for the linear read-out
    alpha=[0.8, 0.9, 1.0],              # ridge regression parameter
)

# Echo state network classifier with default settings; the grid search
# overrides the values listed above.
esn = ESNClassifier()

In [11]:
# Perform grid search with cross-validation
# Exhaustive search over `param_grid`, scored by accuracy with 10-fold
# cross-validation on the scaled training data.
grid_search = GridSearchCV(
    esn,
    param_grid,
    scoring='accuracy',
    cv=10,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

Fitting 10 folds for each of 162 candidates, totalling 1620 fits


KeyboardInterrupt: 

In [None]:
# Show the hyper-parameter combination with the best cross-validated accuracy.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Take the estimator selected by the grid search and predict class labels
# for the scaled hold-out set.
best_estimator = grid_search.best_estimator_
test_features = X_test_scaled
y_pred = best_estimator.predict(test_features)

In [None]:
# Share of hold-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix of true versus predicted labels on the hold-out set;
# the bare `cm` on the last line makes the notebook display it.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:")
cm

In [None]:
# Text summary of per-class metrics for the hold-out predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report:")
print(report)

In [None]:
# Tick labels for the two target classes.
class_labels = ['Class 0', 'Class 1']

# Draw the previously computed confusion matrix as a heat map and save it.
# NOTE(review): the output path is absolute and machine-specific.
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.savefig(
    '/Users/michael/Documents/MA Stats/STAT 8090/cm_reservoir_1.png',
    dpi=600,
    bbox_inches="tight",
)
plt.show()

In [None]:
# roc_curve needs ONE score per sample. predict_proba may return a matrix with
# one column per class (which roc_curve rejects), a single column, or a flat
# vector, so reduce it to the positive-class scores first.
# NOTE(review): assumes the last column belongs to the positive class (label 1),
# as in the scikit-learn convention - confirm for ESNClassifier.
proba = np.asarray(best_estimator.predict_proba(X_test_scaled))
y_score = proba[:, -1] if proba.ndim == 2 else proba

fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
# Diagonal reference line: the curve of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/roc_reservoir_1.png', dpi=600, bbox_inches="tight")
plt.show()

In [None]:
# precision_recall_curve needs ONE score per sample. predict_proba may return
# a matrix with one column per class (which is rejected), a single column, or
# a flat vector, so reduce it to the positive-class scores first.
# NOTE(review): assumes the last column belongs to the positive class (label 1),
# as in the scikit-learn convention - confirm for ESNClassifier.
proba = np.asarray(best_estimator.predict_proba(X_test_scaled))
y_score = proba[:, -1] if proba.ndim == 2 else proba

precision, recall, _ = precision_recall_curve(y_test, y_score)

plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve')
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/prc_reservoir_1.png', dpi=600, bbox_inches="tight")
plt.show()

## Feature Importance Models

### Reservoir Computing

In [None]:
# Rank the features with a random forest.
# The forest is fitted only on the training portion of the same 70/30 split
# (same test_size and random_state) that is used for the model below, so the
# hold-out rows do not influence which features are selected. The forest is
# seeded so the ranking is repeatable.
X_select, X_select_holdout, y_select, y_select_holdout = train_test_split(
    X, y, test_size=0.3, random_state=8090
)

model = RandomForestClassifier(random_state=8090)
model.fit(X_select, y_select)

# One row per feature, most important first.
feature_importance = pd.DataFrame(model.feature_importances_,
                                   index=X.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)

In [None]:
# Keep the names of the features whose importance is at least 0.01.
is_important = feature_importance['importance'] >= 0.01
fi = feature_importance.loc[is_important].index.tolist()

In [None]:
# Restrict the predictor frame to the selected feature columns.
fi_data = X.loc[:, fi]

In [None]:
# Preview the first five rows of the reduced feature set.
fi_data.head(n=5)

In [None]:
# Hold out 30% of the rows of the reduced feature set for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(fi_data, y, test_size=0.3, random_state=8090)

# Balance the classes in the training portion only. The sampler is seeded
# explicitly so the result does not depend on which cells ran before this one.
under_sampler = RandomUnderSampler(random_state=8090)
X_train, y_train = under_sampler.fit_resample(X_train, y_train)

# Report Industry codes that exist in the data but not in the training set.
# 'Industry' is only present if it passed the importance threshold, so guard
# the lookup instead of raising KeyError. The previous check compared the
# training set with itself (it could never trigger) and its fallback re-split
# the full feature frame X, which would have silently undone the selection.
if 'Industry' in fi_data.columns:
    missing_industries = sorted(set(fi_data['Industry']) - set(X_train['Industry']))
    if missing_industries:
        print(f"{len(missing_industries)} Industry code(s) absent from the training set: {missing_industries}")

In [None]:
num_bootstrap_samples = 1000

# Draw `num_bootstrap_samples` replicates of the training set, each sampled
# with replacement; every call yields the redrawn (X, y) pair.
# NOTE(review): these replicates are concatenated later and then fed to
# cross-validation, so copies of the same row can sit in both the fitting and
# the validation folds - confirm this is intended.
bootstrap_draws = [
    resample(X_train, y_train, replace=True)
    for _ in range(num_bootstrap_samples)
]

# Split the pairs into one list of feature frames and one list of targets.
bootstrap_samples_X = [draw[0] for draw in bootstrap_draws]
bootstrap_samples_y = [draw[1] for draw in bootstrap_draws]

In [None]:
# Stack all bootstrap replicates row-wise into one training set.
# This overwrites X_train / y_train, so re-running the bootstrap cell after
# this one would resample the already enlarged data.
X_train = pd.concat(objs=bootstrap_samples_X, axis=0)
y_train = pd.concat(objs=bootstrap_samples_y, axis=0)

In [None]:
# Standardise the features: the scaler is fitted on the training matrix only
# and then applied unchanged to the held-out test matrix.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate hyper-parameter values for the grid search (a wider grid than the
# one used for the full feature set).
# NOTE(review): confirm every key below is a parameter that ESNClassifier
# actually accepts.
param_grid = dict(
    n_reservoir=[1, 2, 3, 4, 5],
    spectral_radius=[0.45, 0.5, 0.55, 0.6],
    leakage=[0.9, 0.95, 1.0],
    solver=['lsqr', 'ridge'],
    alpha=[0.8, 0.85, 0.9, 0.95, 1.0],
)

# Echo state network classifier with default settings; the grid search
# overrides the values listed above.
esn = ESNClassifier()

# Exhaustive search over `param_grid`, scored by accuracy with 10-fold
# cross-validation on the scaled training data.
grid_search = GridSearchCV(
    esn,
    param_grid,
    scoring='accuracy',
    cv=10,
    verbose=1,
)
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Show the hyper-parameter combination with the best cross-validated accuracy.
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [None]:
# Take the estimator selected by the grid search and predict class labels
# for the scaled hold-out set.
best_estimator = grid_search.best_estimator_
test_features = X_test_scaled
y_pred = best_estimator.predict(test_features)

In [None]:
# Share of hold-out rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix of true versus predicted labels on the hold-out set;
# the bare `cm` on the last line makes the notebook display it.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:")
cm

In [None]:
# Text summary of per-class metrics for the hold-out predictions.
report = classification_report(y_true=y_test, y_pred=y_pred)

print("Classification Report:")
print(report)

In [None]:
# Tick labels for the two target classes.
class_labels = ['Class 0', 'Class 1']

# Draw the previously computed confusion matrix as a heat map and save it.
# NOTE(review): the output path is absolute and machine-specific.
disp = ConfusionMatrixDisplay(cm, display_labels=class_labels)
disp.plot(cmap='Blues')
plt.title('Confusion Matrix')
plt.savefig(
    '/Users/michael/Documents/MA Stats/STAT 8090/cm_reservoir_1_fi.png',
    dpi=600,
    bbox_inches="tight",
)
plt.show()

In [None]:
# roc_curve needs ONE score per sample. predict_proba may return a matrix with
# one column per class (which roc_curve rejects), a single column, or a flat
# vector, so reduce it to the positive-class scores first.
# NOTE(review): assumes the last column belongs to the positive class (label 1),
# as in the scikit-learn convention - confirm for ESNClassifier.
proba = np.asarray(best_estimator.predict_proba(X_test_scaled))
y_score = proba[:, -1] if proba.ndim == 2 else proba

fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
# Diagonal reference line: the curve of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/roc_reservoir_1_fi.png', dpi=600, bbox_inches="tight")
plt.show()

In [None]:
# precision_recall_curve needs ONE score per sample. predict_proba may return
# a matrix with one column per class (which is rejected), a single column, or
# a flat vector, so reduce it to the positive-class scores first.
# NOTE(review): assumes the last column belongs to the positive class (label 1),
# as in the scikit-learn convention - confirm for ESNClassifier.
proba = np.asarray(best_estimator.predict_proba(X_test_scaled))
y_score = proba[:, -1] if proba.ndim == 2 else proba

precision, recall, _ = precision_recall_curve(y_test, y_score)

plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Precision-Recall Curve')
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/prc_reservoir_1_fi.png', dpi=600, bbox_inches="tight")
plt.show()

---
### End of Notebook