In [1]:
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import xgboost as xgb

# Loading Data and Training Models

1. Linear Regression
2. Random Forest Regression
3. XG Boost Regressor
4. Random Forest Classification
5. XG Boost Classifier

In [2]:
text_path = '/Users/mackdelany/Documents/DSR Notebooks/NLP-stock-price-prediction/data/interim/text_features.csv'
labels_path = '/Users/mackdelany/Documents/DSR Notebooks/NLP-stock-price-prediction/data/interim/prototype_labels.csv'

text = pd.read_csv(text_path)
labels = pd.read_csv(labels_path)                   

In [3]:
data = text.merge(labels, how='inner', left_on='Date', right_on='Date').drop(['Date','Unnamed: 0'],axis=1)
print(data.shape)
data = data.dropna() # dropped 77 NA rows
print(data.shape)

(1584, 10001)
(1507, 10001)


In [4]:
y = data['Label'].values
X = data.drop(['Label'], axis=1).to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 0, shuffle=False)

In [5]:
lr = LogisticRegression(random_state=0)
rf_classifier = RandomForestClassifier(100, verbose=1, n_jobs=-1)
xg_classifier = xgb.XGBClassifier(objective ='reg:logistic', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 100)

In [6]:
lr.fit(X, y)
rf_classifier.fit(X_train, y_train)
xg_classifier.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished


XGBClassifier(alpha=10, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.3, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='reg:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [16]:
y.

array([0., 1., 0., ..., 1., 0., 1.])

# 1. Logistic Classification

In [7]:
y_lr_pred_train = lr.predict(X_train)
y_lr_pred_test = lr.predict(X_test)

In [8]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_lr_pred_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_lr_pred_test))

Training Accuracy: 0.946165191740413

Test Accuracy: 0.9403973509933775


# 2. Random Forest Classification

In [9]:
y_rfc_pred_train = rf_classifier.predict(X_train)
y_rfc_pred_test = rf_classifier.predict(X_test)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


In [10]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_rfc_pred_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_rfc_pred_test))

Training Accuracy: 1.0

Test Accuracy: 0.5298013245033113


# 3. XG Boost Classification

In [11]:
y_xgc_pred_train = xg_classifier.predict(X_train)
y_xgc_pred_test = xg_classifier.predict(X_test)

In [12]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_xgc_pred_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_xgc_pred_test))

Training Accuracy: 0.9933628318584071

Test Accuracy: 0.543046357615894


# 4. Ensemble Classification (Mode)

In [13]:
y_class_pred_ensemble_train = (y_lr_pred_train + y_rfc_pred_train + y_xgc_pred_train) // 3
y_class_pred_ensemble_test = (y_lr_pred_test + y_rfc_pred_test + y_xgc_pred_test) // 3

In [14]:
print("Training Accuracy:",metrics.accuracy_score(y_train, y_class_pred_ensemble_train))
print()
print("Test Accuracy:",metrics.accuracy_score(y_test, y_class_pred_ensemble_test))

Training Accuracy: 0.9985250737463127

Test Accuracy: 0.6821192052980133
