In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LassoCV, LogisticRegression, LogisticRegressionCV 
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import (confusion_matrix, roc_curve, auc, classification_report, 
ConfusionMatrixDisplay, RocCurveDisplay, precision_recall_curve, PrecisionRecallDisplay, PredictionErrorDisplay)
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.utils.discovery import all_displays
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import LearningCurveDisplay, learning_curve

from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.inspection import permutation_importance

from xgboost import XGBClassifier

from scipy.stats import fisher_exact

from yellowbrick.classifier import DiscriminationThreshold
from yellowbrick.classifier.threshold import discrimination_threshold

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import json

import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
np.random.seed(8090)

In [3]:
data = pd.read_csv('/Users/michael/Documents/MA Stats/STAT 8090/final dataset/full dataset.csv')

data.head()

Unnamed: 0,Symbol,Company_Name,Exchange,Orig_Range_Low,Orig_Range_High,Price,Shares,Offer_Amount,ipo_date,is_Friday,...,plustwo_volume_pct,plusthree_change_pct,plusthree_volume_pct,plus90_change_pct,plus90_volume_pct,plus180_change_pct,plus180_volume_pct,d1_d180,d1_pop,d1_d180_pre_price
0,CRGX,Cargo Therapeutics,Nasdaq,15.0,17.0,15.0,18750000,281250000,11/10/23,1,...,0.015489,0.015478,0.008329,0.004261,1e-06,0.028773,1.6e-09,1,0,1
1,HG,Hamilton Insurance,NYSE,16.0,18.0,15.0,15000000,225000000,11/10/23,1,...,0.050263,-0.010451,0.004091,0.024725,1e-06,0.032742,1.13333e-09,1,0,1
2,LXEO,Lexeo Therapeutics,Nasdaq,13.0,15.0,11.0,9090910,100000010,11/3/23,1,...,0.013554,-0.005607,0.027481,-0.007088,0.008493,-0.010459,0.008693079,1,0,1
3,MNR,Mach Resources,NYSE,19.0,21.0,19.0,10000000,190000000,10/25/23,0,...,0.014016,-0.060652,0.014147,0.015936,0.004482,-0.021266,0.0077789,1,0,1
4,BIRK,Birkenstock,NYSE,44.0,49.0,46.0,32258064,1483870944,10/11/23,0,...,1e-06,0.037633,0.094204,-0.012492,0.017129,-0.002016,0.01296451,1,0,0


In [4]:
with open('/Users/michael/Documents/MA Stats/STAT 8090/Code Files/exchange.json','r') as f:
    exchange = json.load(f)

In [5]:
with open('/Users/michael/Documents/MA Stats/STAT 8090/Code Files/sector.json','r') as f:
    sector = json.load(f)

In [6]:
with open('/Users/michael/Documents/MA Stats/STAT 8090/Code Files/industry.json','r') as f:
    industry = json.load(f)

In [7]:
data['Exchange'] = data['Exchange'].replace(exchange)
data['Sector'] = data['Sector'].replace(sector)
data['Industry'] = data['Industry'].replace(industry)

In [8]:
# label_encoder = LabelEncoder()
# data['Exchange'] = label_encoder.fit_transform(data['Exchange'])
# data['Sector'] = label_encoder.fit_transform(data['Sector'])
# data['Industry'] = label_encoder.fit_transform(data['Industry'])

In [9]:
data = data.drop(columns=['Symbol', 'Company_Name', 'ipo_date', 'Country_HQ', 'Pitchbook_Number',
                          'Orig_Range_Low', 'Orig_Range_High', 'Year_Founded', 'Raised_to_IPO',
                          'ipo_date_open','ipo_date_close','ipo_date_volume','plusone_open','plusone_close',
                          'plusone_volume','plustwo_open','plustwo_close','plustwo_volume','plusthree_open','plusthree_close',
                          'plusthree_volume','plus90_open','plus90_close','plus90_volume','plus180_open','plus180_close',
                          'plus180_volume','day_one_bump','ipo_date_change_pct','ipo_date_volume_pct','plusone_change_pct',
                          'plusone_volume_pct','plustwo_change_pct','plustwo_volume_pct','plusthree_change_pct',
                          'plusthree_volume_pct','plus90_change_pct','plus90_volume_pct','plus180_change_pct','plus180_volume_pct',
                          'd1_pop', 'd1_d180_pre_price'])

In [10]:
data.head()

Unnamed: 0,Exchange,Price,Shares,Offer_Amount,is_Friday,IPO_Year,Years_to_IPO,Price_Low_Delta,Price_High_Delta,Sector,...,vix_min5,vix_min4,vix_min3,vix_min2,vix_min1,vix_min0,sent_negative,sent_neutral,sent_positive,d1_d180
0,1,15.0,18750000,281250000,1,2023,4,0.0,-0.117647,5,...,-0.206291,-0.004106,0.145605,-0.040704,0.236158,-0.193135,0,1,0,1
1,0,15.0,15000000,225000000,1,2023,11,-0.0625,-0.166667,4,...,-0.206291,-0.004106,0.145605,-0.040704,0.236158,-0.193135,0,2,0,1
2,1,11.0,9090910,100000010,1,2023,5,-0.153846,-0.266667,5,...,-0.206291,-0.004106,0.145605,-0.040704,0.236158,-0.140479,0,0,0,1
3,0,19.0,10000000,190000000,0,2023,8,0.0,-0.095238,3,...,-0.012837,-0.206291,-0.004106,0.145605,-0.040704,0.212347,0,0,0,1
4,0,46.0,32258064,1483870944,0,2023,249,0.045455,-0.061224,1,...,-0.012837,-0.206291,-0.004106,0.145605,-0.040704,0.181814,2,1,0,1


In [11]:
X = data.drop(columns=['d1_d180'])
y = data['d1_d180']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8090)

unique_values = set(X_train['Industry'])

# Check if at least one sample from each unique value is present in the training set
for value in unique_values:
    if value not in X_train['Industry'].unique():
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8090)
        break

## Full Model

### Logistic Regression

In [12]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('sampler', RandomUnderSampler()),
    ('classifier', LogisticRegressionCV(max_iter=100000))
])

In [None]:
param_grid = {
    'classifier__solver': ['liblinear', 'lbfgs', 'newton-cg', 'saga'],
    'classifier__penalty': ['l1', 'l2'],
}

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8090)

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold, verbose=2)

grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 8 candidates, totalling 80 fits
[CV] END classifier__penalty=l1, classifier__solver=liblinear; total time=513.2min
[CV] END classifier__penalty=l1, classifier__solver=liblinear; total time=65.7min
[CV] END classifier__penalty=l1, classifier__solver=liblinear; total time=129.1min
[CV] END classifier__penalty=l1, classifier__solver=liblinear; total time=876.7min


In [None]:
ranks = pd.DataFrame(
    {
        'Model': grid_search.cv_results_['params'],
        'Mean Test Score': grid_search.cv_results_['mean_test_score'],
        'Std Test Score': grid_search.cv_results_['std_test_score'],
        'Rank': grid_search.cv_results_['rank_test_score']
    }
)

pd.concat([ranks.drop(['Model'], axis=1), ranks['Model'].apply(pd.Series)], axis=1).sort_values(by="Rank")

In [None]:
best_estimator = grid_search.best_estimator_

print("Best Estimator:", best_estimator)

In [None]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
cm

In [None]:
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TP = cm[1, 1]

odds_ratio, p_value = fisher_exact([[TP, FP], [FN, TN]])

log_odds_ratio = np.log(odds_ratio)

std_error_log_odds_ratio = np.sqrt(1/TP + 1/FP + 1/FN + 1/TN)

z_score = 1.96  # 95% confidence interval
lower_bound = log_odds_ratio - z_score * std_error_log_odds_ratio
upper_bound = log_odds_ratio + z_score * std_error_log_odds_ratio


ci_lower = np.exp(lower_bound)
ci_upper = np.exp(upper_bound)

print("95% Confidence Interval for Odds Ratio:", (ci_lower, ci_upper))
print("p-value for Odds Ratio:", p_value)

In [None]:
report = classification_report(y_test, y_pred)

print("Classification Report:")
print(report)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_estimator.classes_)
disp.plot(cmap=plt.cm.Blues)
# disp.plot()
plt.title('All Variables', fontsize=16)
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/cm_lr_180_full.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, best_estimator.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('All Variables', fontsize=16)
plt.legend(loc="lower right")
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/roc_lr_180_full.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Compute precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, best_estimator.predict_proba(X_test)[:, 1])

# Plot Precision-Recall curve
plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('All Variables', fontsize=16)
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/prc_lr_full_180.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# train_sizes, train_scores, test_scores = learning_curve(best_estimator, X, y)
# display = LearningCurveDisplay(train_sizes=train_sizes,
#                                train_scores=train_scores, test_scores=test_scores, score_name="accuracy")
# display.plot()
# plt.ylim([0.0, 1.05])
# plt.xlim([0.0, 1.0])
# plt.title('Learning Curve')
# plt.show()

## Reduced Models

### Lasso Regression and Logistic Regression

Lasso Regression is used to build the reduced model, with fewer predictor variables. This model will compared to the full model. 

In [None]:
lasso_logistic = Pipeline([
    ('scaler', StandardScaler()),
    ('sampler', RandomUnderSampler()),
    ('feature_selection', SelectFromModel(LogisticRegressionCV(penalty="l1", solver='liblinear'))),
    ('classifier', LogisticRegression(max_iter=100000))
])

In [None]:
param_grid = {
    'classifier__solver': ['liblinear', 'lbfgs', 'newton-cg', 'saga'],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.001, 0.01, 0.1]
}

kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8090)

grid_search = GridSearchCV(estimator=lasso_logistic, param_grid=param_grid, cv=kfold, verbose=2)

grid_search.fit(X_train, y_train)

In [None]:
ranks = pd.DataFrame(
    {
        'Model': grid_search.cv_results_['params'],
        'Mean Test Score': grid_search.cv_results_['mean_test_score'],
        'Std Test Score': grid_search.cv_results_['std_test_score'],
        'Rank': grid_search.cv_results_['rank_test_score']
    }
)

pd.concat([ranks.drop(['Model'], axis=1), ranks['Model'].apply(pd.Series)], axis=1).sort_values(by="Rank")

In [None]:
best_estimator = grid_search.best_estimator_

print("Best Estimator:", best_estimator)

In [None]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
cm

In [None]:
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TP = cm[1, 1]

odds_ratio, p_value = fisher_exact([[TP, FP], [FN, TN]])

log_odds_ratio = np.log(odds_ratio)

std_error_log_odds_ratio = np.sqrt(1/TP + 1/FP + 1/FN + 1/TN)

z_score = 1.96  # 95% confidence interval
lower_bound = log_odds_ratio - z_score * std_error_log_odds_ratio
upper_bound = log_odds_ratio + z_score * std_error_log_odds_ratio


ci_lower = np.exp(lower_bound)
ci_upper = np.exp(upper_bound)

print("95% Confidence Interval for Odds Ratio:", (ci_lower, ci_upper))
print("p-value for Odds Ratio:", p_value)

In [None]:
report = classification_report(y_test, y_pred)

print("Classification Report:")
print(report)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_estimator.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Lasso Penalization', fontsize=16)
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/cm_lr_180_red1.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, best_estimator.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Lasso Penalization', fontsize=16)
plt.legend(loc="lower right")
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/roc_lr_180_red1.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Compute precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, best_estimator.predict_proba(X_test)[:, 1])

# Plot Precision-Recall curve
plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Lasso Penalization', fontsize=16)
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/prc_lr_180_red1.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# train_sizes, train_scores, test_scores = learning_curve(best_estimator, X, y)
# display = LearningCurveDisplay(train_sizes=train_sizes,
#                                train_scores=train_scores, test_scores=test_scores, score_name="accuracy")
# display.plot()
# plt.ylim([0.0, 1.05])
# plt.xlim([0.0, 1.0])
# plt.title('Learning Curve')
# plt.show()

## Feature Importance Models and Base Line Models

In [None]:
model = RandomForestClassifier()
model.fit(X, y)

feature_importance = pd.DataFrame(model.feature_importances_,
                                   index=X.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)

In [None]:
fi = feature_importance[feature_importance['importance'] >= 0.01]
fi = fi.index.tolist()

In [None]:
fi_data = X[fi]

In [None]:
fi_data.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(fi_data, y, test_size=0.3, random_state=8090)

unique_values = set(X_train['Industry'])

# Check if at least one sample from each unique value is present in the training set
for value in unique_values:
    if value not in X_train['Industry'].unique():
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8090)
        break

In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('sampler', RandomUnderSampler()),
    ('classifier', LogisticRegressionCV(max_iter=100000))
])

In [None]:
param_grid = {
    'classifier__solver': ['liblinear', 'lbfgs', 'newton-cg', 'saga'],
    'classifier__penalty': ['l1', 'l2'],
}
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=8090)

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=kfold, verbose=2)

grid_search.fit(X_train, y_train)

In [None]:
ranks = pd.DataFrame(
    {
        'Model': grid_search.cv_results_['params'],
        'Mean Test Score': grid_search.cv_results_['mean_test_score'],
        'Std Test Score': grid_search.cv_results_['std_test_score'],
        'Rank': grid_search.cv_results_['rank_test_score']
    }
)

pd.concat([ranks.drop(['Model'], axis=1), ranks['Model'].apply(pd.Series)], axis=1).sort_values(by="Rank")

In [None]:
best_estimator = grid_search.best_estimator_

print("Best Estimator:", best_estimator)

In [None]:
y_pred = grid_search.best_estimator_.predict(X_test)

In [None]:
cm = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
cm

In [None]:
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]
TP = cm[1, 1]

odds_ratio, p_value = fisher_exact([[TP, FP], [FN, TN]])

log_odds_ratio = np.log(odds_ratio)

std_error_log_odds_ratio = np.sqrt(1/TP + 1/FP + 1/FN + 1/TN)

z_score = 1.96  # 95% confidence interval
lower_bound = log_odds_ratio - z_score * std_error_log_odds_ratio
upper_bound = log_odds_ratio + z_score * std_error_log_odds_ratio


ci_lower = np.exp(lower_bound)
ci_upper = np.exp(upper_bound)

print("95% Confidence Interval for Odds Ratio:", (ci_lower, ci_upper))
print("p-value for Odds Ratio:", p_value)

In [None]:
report = classification_report(y_test, y_pred)

print("Classification Report:")
print(report)

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=best_estimator.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title('Feature Importance', fontsize=16)
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/cm_lr_180_fi.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Compute ROC curve and AUC
fpr, tpr, thresholds = roc_curve(y_test, best_estimator.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Feature Importance', fontsize=16)
plt.legend(loc="lower right")
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/roc_lr_180_fi.png', dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Compute precision-recall curve
precision, recall, _ = precision_recall_curve(y_test, best_estimator.predict_proba(X_test)[:, 1])

# Plot Precision-Recall curve
plt.figure()
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Feature Importance', fontsize=16)
plt.savefig('/Users/michael/Documents/MA Stats/STAT 8090/prc_lr_180_fi.png', dpi=300, bbox_inches="tight")
plt.show()

---
### End of Notebook