In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [38]:
df = pd.read_csv('cleaned_data/Cleaned Data.csv', index_col=[0])

In [39]:
#Assigning values
X = df.drop('Signal', axis=1)
y = df['Signal']

In [40]:
# Checking data
y

1        Sell
2         Buy
3         Buy
4         Buy
5         Buy
         ... 
11575     Buy
11576    Sell
11577    Sell
11578    Sell
11579     Buy
Name: Signal, Length: 11579, dtype: object

In [41]:
# Create Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

In [42]:
#scale our data to avoid issues (X_scaled = X_std * (max - min) + min)
from sklearn.preprocessing import MinMaxScaler

In [43]:
#creating an instance
scaler = MinMaxScaler()

#adjusting scaler only based on the training set (without test values to prevent data leakage)
scaler.fit(X_train)

MinMaxScaler()

In [44]:
#perform the transformation
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [45]:
# Import Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(max_depth=4)
tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=4)

In [25]:
# Max depth
maxDepth = np.array([1, 2, 5, 10])

# Minimum number of samples required to split any internal node 
minSamplesNode = np.array([2, 5, 10])

# The minimum number of samples required to be at a leaf/terminal node
minSamplesLeaf = np.array([10, 20, 30])

# Import necessary functions
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Create k-Fold CV object
kFold = StratifiedKFold(n_splits=10)

In [26]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter values to be tested
param_grid = {'criterion': ['gini', 'entropy'],
              'max_depth': maxDepth,
              'min_samples_split': minSamplesNode,
              'min_samples_leaf': minSamplesLeaf}

# Run brute-force grid search
gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=kFold, n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)

0.5576815697736371
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 10, 'min_samples_split': 2}


In [27]:
# Extract best parameter
clf = gs.best_estimator_

# Fit model given best parameter
clf.fit(X_train, y_train)

# Print out score on Test dataset
print('Test accuracy: {0: .4f}'.format(clf.score(X_test, y_test)))

Test accuracy:  0.5489


In [28]:
from sklearn import metrics

y_pred = tree.predict(X_test)
print('Confusion matrix: \n', 
      metrics.confusion_matrix(y_test, y_pred))

Confusion matrix: 
 [[ 470    0  934]
 [  83    0  141]
 [ 466    0 1380]]


In [29]:
y_test

5187      Buy
1490     Hold
9159     Sell
4083      Buy
7318      Buy
         ... 
3141      Buy
3648     Sell
1019     Sell
11245    Sell
9795      Buy
Name: Signal, Length: 3474, dtype: object

In [30]:
y_pred

array(['Sell', 'Buy', 'Sell', ..., 'Sell', 'Buy', 'Sell'], dtype=object)

In [46]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         Buy       0.52      0.41      0.46      1404
        Hold       0.00      0.00      0.00       224
        Sell       0.59      0.76      0.66      1846

    accuracy                           0.57      3474
   macro avg       0.37      0.39      0.37      3474
weighted avg       0.52      0.57      0.54      3474



In [47]:
#checking if our model learned something
6153/len(df)

0.5313930391225494

In [16]:
#We see that our model has similar accuracy as a model which is just guessing on sell
#Without any adjustments, this decision tree would perform as good as a model which is just guessing on Sell

## Random Forest 

In [48]:
from sklearn.ensemble import RandomForestClassifier

# Create classifier object and fit it to data
forest = RandomForestClassifier(criterion='gini', random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)

RandomForestClassifier(n_jobs=-1, random_state=0)

In [49]:
y_pred = forest.predict(X_test)

In [50]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         Buy       0.52      0.41      0.46      1404
        Hold       0.00      0.00      0.00       224
        Sell       0.59      0.76      0.66      1846

    accuracy                           0.57      3474
   macro avg       0.37      0.39      0.37      3474
weighted avg       0.52      0.57      0.54      3474



In [51]:
from sklearn import metrics
print('Confusion matrix: \n', 
      metrics.confusion_matrix(y_test, y_pred))

Confusion matrix: 
 [[ 572    1  831]
 [  77    0  147]
 [ 448    2 1396]]


In [22]:
#With the Random Forest Adjustment, however, we can see that our model is able to increase its accuracy
#We see without any adjustments, our model shows skills

## Optimizing Random Forest with GridSearchCV

In [52]:
# Define the hyperparameter values to be tested
param_grid = {'criterion': ['gini'],
              'max_depth': [1, 2, 5, 10],
              'max_features': ['auto']}

# Run brute-force grid search
gs = GridSearchCV(estimator=RandomForestClassifier(random_state=0),
                  param_grid=param_grid,
                  scoring='accuracy',
                  cv=kFold, n_jobs=-1)
gs = gs.fit(X_train, y_train)
print(gs.best_score_)
print(gs.best_params_)


y_pred = forest.predict(X_test)

print(classification_report(y_test,y_pred))

0.5795188077514425
{'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto'}
              precision    recall  f1-score   support

         Buy       0.52      0.41      0.46      1404
        Hold       0.00      0.00      0.00       224
        Sell       0.59      0.76      0.66      1846

    accuracy                           0.57      3474
   macro avg       0.37      0.39      0.37      3474
weighted avg       0.52      0.57      0.54      3474



In [53]:
print('Confusion matrix: \n', 
      metrics.confusion_matrix(y_test, y_pred))

Confusion matrix: 
 [[ 572    1  831]
 [  77    0  147]
 [ 448    2 1396]]
