### Libraries

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split  
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

### Load Dataset

In [5]:
df = pd.read_csv("../Dataset/process data/nasdaq_dataset.csv")
df.head()

Unnamed: 0,Date,Time,Open,High,Low,Close,Volume,NumberOfTrades,BidVolume,AskVolume,...,AROON_UP,ADX,STOCH_K,STOCH_D,MACD,MACD_SIGNAL,HAMMER,INVERTED_HAMMER,ENGULFING,Target
0,2011/6/13,20:00:00,1350.54,1350.54,1350.54,1350.54,0,0,0,0,...,21.428571,43.819322,40.615298,45.541867,-7.932664,-8.560322,0,0,0,0
1,2011/6/14,12:00:00,1363.16,1370.18,1363.16,1368.53,0,0,0,0,...,100.0,42.298061,58.060659,48.180735,-6.232078,-8.094673,0,0,0,0
2,2011/6/14,13:00:00,1368.51,1374.46,1368.06,1373.62,0,0,0,0,...,100.0,41.340922,76.586371,58.420776,-4.42265,-7.360269,0,0,0,0
3,2011/6/14,14:00:00,1373.62,1374.23,1370.97,1372.8,0,0,0,0,...,92.857143,40.45215,93.929978,76.192336,-3.02002,-6.492219,0,0,0,0
4,2011/6/14,15:00:00,1372.79,1373.59,1371.59,1372.15,0,0,0,0,...,85.714286,39.626862,93.361517,87.959288,-1.938528,-5.581481,0,0,0,0


In [6]:
features=['Open', 'High', 'Low', 'Close', 'Volume',
       'NumberOfTrades', 'BidVolume', 'AskVolume', 'ATR', 'RSI', 'AD',
       'AROON_DOWN', 'AROON_UP', 'ADX', 'STOCH_K', 'STOCH_D', 'MACD',
       'MACD_SIGNAL', 'HAMMER', 'INVERTED_HAMMER', 'ENGULFING']

In [7]:
model_scores = {}

Data Preprocess

In [8]:
standard = StandardScaler()
X = standard.fit_transform(df[features])
Y = df['Target']

Split dataset (80 - 20) & create new data points using SMOTE to resolve class imbalance problem

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Create the SMOTE object
smote = SMOTE(random_state=42)

# Oversample the training data using SMOTE
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, Y_train)

Decision Tree

In [10]:
DT = DecisionTreeClassifier(max_depth=3)
DT.fit(X_train_resampled, y_train_resampled)

DecisionTreeClassifier(max_depth=3)

In [11]:
predications = DT.predict(X_test)

model_scores['Decision Tree'] = accuracy_score(predications, Y_test)
accuracy_score(predications, Y_test)

0.5411908646003263

In [12]:
report=classification_report(Y_test,predications)
print(report)

              precision    recall  f1-score   support

          -1       0.11      0.46      0.18       449
           0       0.82      0.64      0.72      3845
           1       0.00      0.00      0.00       610

    accuracy                           0.54      4904
   macro avg       0.31      0.37      0.30      4904
weighted avg       0.65      0.54      0.58      4904



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM

In [11]:
# create the model
SVM = SVC()

# fit the model to the data
SVM.fit(X_train_resampled, y_train_resampled)

SVC()

In [12]:
predications = SVM.predict(X_test)

model_scores['SVM'] = accuracy_score(predications, Y_test)
accuracy_score(predications, Y_test)

0.39416802610114193

In [13]:
report=classification_report(Y_test,predications)
print(report)

              precision    recall  f1-score   support

          -1       0.12      0.39      0.18       449
           0       0.85      0.38      0.53      3845
           1       0.17      0.47      0.25       610

    accuracy                           0.39      4904
   macro avg       0.38      0.41      0.32      4904
weighted avg       0.70      0.39      0.46      4904



Random Forest

In [14]:
SRF = RandomForestClassifier(n_estimators=150, random_state=0)
SRF.fit(X_train_resampled,y_train_resampled)

RandomForestClassifier(n_estimators=150, random_state=0)

In [15]:
predications = SRF.predict(X_test)

model_scores['Random Forest'] = accuracy_score(predications, Y_test)
accuracy_score(predications, Y_test)

0.731647634584013

In [16]:
report=classification_report(Y_test,predications)
print(report)

              precision    recall  f1-score   support

          -1       0.26      0.18      0.21       449
           0       0.81      0.88      0.84      3845
           1       0.30      0.20      0.24       610

    accuracy                           0.73      4904
   macro avg       0.46      0.42      0.43      4904
weighted avg       0.70      0.73      0.71      4904



AdaBoost

In [17]:
base_model = DecisionTreeClassifier(max_depth=2)
ada_model = AdaBoostClassifier(base_estimator=base_model, n_estimators=200)

ada_model.fit(X_train_resampled, y_train_resampled)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2),
                   n_estimators=200)

In [18]:
predications = ada_model.predict(X_test)

model_scores['AdaBoost'] = accuracy_score(predications, Y_test)
accuracy_score(predications, Y_test)

0.7528548123980424

In [19]:
report=classification_report(Y_test,predications)
print(report)

              precision    recall  f1-score   support

          -1       0.16      0.04      0.07       449
           0       0.79      0.95      0.86      3845
           1       0.17      0.04      0.06       610

    accuracy                           0.75      4904
   macro avg       0.37      0.34      0.33      4904
weighted avg       0.65      0.75      0.69      4904



XGBoost

In [20]:
y_train_resampled[y_train_resampled == -1] = 2
Y_test[Y_test == -1] = 2

In [21]:
xg_reg = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss',n_estimators=200)
xg_reg.fit(X_train_resampled,y_train_resampled)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='mlogloss', gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=6,
              max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=200, n_jobs=0,
              num_parallel_tree=1, objective='multi:softprob', predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [22]:
predications = xg_reg.predict(X_test)
# predications = ada_model.predict(X_test)

model_scores['XGBoost'] = accuracy_score(predications, Y_test)
accuracy_score(Y_test, predications)

0.7644779771615008

In [23]:
report=classification_report(Y_test,predications)
print(report)

              precision    recall  f1-score   support

           0       0.79      0.96      0.87      3845
           1       0.31      0.08      0.13       610
           2       0.23      0.06      0.09       449

    accuracy                           0.76      4904
   macro avg       0.44      0.36      0.36      4904
weighted avg       0.68      0.76      0.70      4904



In [24]:
fig = go.Figure()
fig.add_trace(go.Scatter(x = list(model_scores.keys()), y = list(model_scores.values()), mode='markers',
                            marker=dict(size=15, color=['yellow', 'red', 'orange', 'blue','green'])))
# add labels and title
fig.update_layout(title='Model Accuracy Scores', xaxis_title='Models', yaxis_title='Accuracy')
# set the y-axis tickformat to a percentage
fig.update_layout(yaxis=dict(tickformat='0.01%'))

# show the plot
fig.show()


Test

In [28]:
import pickle
 
# Save the model under the cwd
filename = "nasdaq_model.sav"
# filename = 'finalized_model.sav'
pickle.dump(SRF, open(filename, 'wb'))

In [29]:
loaded_model = pickle.load(open(filename, 'rb'))
loaded_model

RandomForestClassifier(n_estimators=150, random_state=0)

In [32]:
loaded_model.score(X,Y)

0.9463273379827889