In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb

# Loading Data and Training Models

1. Linear Regression
2. Random Forest Regression
3. XG Boost Regressor
4. Random Forest Classification
5. XG Boost Classifier

In [2]:
data = pd.read_csv('/Users/mackdelany/Documents/DSR Notebooks/NLP-stock-price-prediction/final_data.csv')

In [3]:
data = data.dropna() 
data.head()

Unnamed: 0,00,000,000bpd,000ft,000km,000new,000s,000sq,000th,000usd,...,zumas,zurich,zuyevo,zweimal,Label,1st_PC,2nd_PC,dayofweek,dayofmonth,monthofyear
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.296449,0.497935,1.0,0.233333,0.636364
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.263254,0.531826,0.0,0.333333,0.636364
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.262637,0.532455,0.25,0.366667,0.636364
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.236134,0.490468,0.5,0.4,0.636364
4,0.0,0.030702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.224337,0.518278,0.75,0.433333,0.636364


In [4]:
y = data['Label'].values
X = data.drop('Label',axis=1).to_numpy()
#X = X[:,20000:] #uncomment to run with NLP data only
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 0, shuffle=False)

In [5]:
## shift y from 'today' to 'tomorrow' 
""""y = y[1:]
X = X[:-1,:]"""

'"y = y[1:]\nX = X[:-1,:]'

In [6]:
lr = LogisticRegression(random_state=0)
rf_classifier = RandomForestClassifier(1000, verbose=1, n_jobs=-1, max_depth=100)
xg_classifier = xgb.XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 2, alpha = 10, n_estimators = 100)

In [7]:
lr.fit(X, y)
rf_classifier.fit(X_train, y_train)
xg_classifier.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:    2.0s finished


XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=2,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='reg:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [8]:
X.shape

(1507, 20005)

# 1. Logistic Classification

In [9]:
y_lr_pred_train = lr.predict(X_train)
y_lr_pred_test = lr.predict(X_test)

In [10]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_lr_pred_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_lr_pred_test))

Training Accuracy: 0.9233038348082596

Test Accuracy: 0.8940397350993378


# 2. Random Forest Classification

In [11]:
y_rfc_pred_train = rf_classifier.predict(X_train)
y_rfc_pred_test = rf_classifier.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 1000 out of 1000 | elapsed:    0.1s finished


In [12]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_rfc_pred_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_rfc_pred_test))

Training Accuracy: 0.5471976401179941

Test Accuracy: 0.5827814569536424


# 3. XG Boost Classification

In [13]:
y_xgc_pred_train = xg_classifier.predict(X_train)
y_xgc_pred_test = xg_classifier.predict(X_test)

In [14]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_xgc_pred_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_xgc_pred_test))

Training Accuracy: 0.8421828908554573

Test Accuracy: 0.5099337748344371


# 4. Ensemble Classification (Mode)

In [15]:
y_class_pred_ensemble_train = (y_lr_pred_train + y_rfc_pred_train + y_xgc_pred_train) // 3
y_class_pred_ensemble_test = (y_lr_pred_test + y_rfc_pred_test + y_xgc_pred_test) // 3

In [16]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_class_pred_ensemble_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_class_pred_ensemble_test))

Training Accuracy: 0.9373156342182891

Test Accuracy: 0.7350993377483444


In [17]:
def is_positive(row):
    if row > 0:
        return 1
    else :
        return 0


test['label_check'] = test.Close - test.Open
test['label_check'] = test['label_check'].apply(is_positive)

NameError: name 'test' is not defined

In [None]:
(test.label_y - test.label_check).sum()

In [None]:
X_lr = X[:,:20000]

In [None]:
lr.fit(X_lr, y)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_lr, y, test_size = 0.10, random_state = 0, shuffle=False)

In [None]:
y_lr_pred_train = lr.predict(X_train)
y_lr_pred_test = lr.predict(X_test)

In [None]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_lr_pred_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_lr_pred_test))

# 5. Ensemble with different data sets

In [None]:
y_class_pred_ensemble_test = (y_lr_pred_test + y_rfc_pred_test + y_xgc_pred_test) // 3

In [None]:
print("Test Accuracy:",metrics.accuracy_score(y_test, y_class_pred_ensemble_test))