# Long-term Analysis

In [1]:
# Data Manipulation
import numpy as np
import pandas as pd
from datetime import datetime

# Plotting graphs
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import TimeSeriesSplit

from ta import add_all_ta_features #pip install --upgrade ta https://github.com/bukosabino/ta https://medium.datadriveninvestor.com/predicting-the-stock-market-with-python-bba3cf4c56ef
from fastai.tabular.all import add_datepart #pip install fastai https://docs.fast.ai/tabular.core.html https://www.analyticsvidhya.com/blog/2018/10/predicting-stock-price-machine-learningnd-deep-learning-techniques-python/

In [2]:
def convert(date_string):
    year, month, day = [int(i) for i in date_string.split('-')]
    return datetime(year=year, month=month, day=day)

In [3]:
def prepare_dataset(path = 'data/AAPL_data.csv', s_path = 'data/AAPL_byday_RoBERTa.csv', start_date = '2022-01-01', sentiment = False):
    '''
    Load dataset and generate technical features and limit time periods. 

    path(str) : path to the price data file.
    s_path(str) : path to the stocktwits data file. 
    select_feature(boolean) : if true keep selected features
    start_date(str) : start date of interested time period. 
    '''
    df = pd.read_csv(path)
    df.dropna()

    df["Date"] = pd.to_datetime(df.Date, format="%Y-%m-%d")
    df.index = df['Date']
    df = df.sort_index(ascending=True, axis=0)
    add_datepart(df, 'Date', drop=False)
    df.drop('Elapsed', axis=1, inplace=True)

    df = add_all_ta_features(
        df, high="High", low="Low", open="Open", close="Close", volume="Volume")

    if sentiment:
        df.index = np.array(range(len(df)))
        df_sentiment = pd.read_csv(s_path)
        if s_path == 'data/AAPL_byday_RoBERTa.csv':
        # Drop the first empty column for AAPL
            df_sentiment = df_sentiment.iloc[: , 1:]
        df_sentiment.date = df_sentiment.date.apply(convert)
        df_sentiment.rename(columns={'date':'Date'}, inplace=True)
        df = df.merge(df_sentiment, how='inner', on='Date').fillna(0)

    X = df.drop(['Close', 'trend_psar_down', 'trend_psar_up', 'Date', 'Adj Close'], axis=1)
    cols = X.columns
    
    df = df[df['Date'] >= start_date]
    df = df.drop(['trend_psar_down', 'trend_psar_up'], axis=1)
    df = df.dropna()

    y = np.where(df['Close'].shift(-1) > df['Close'], 1, -1)
    X = df.drop(['Close', 'Date', 'Adj Close'], axis=1)
    
    cols = X.columns
    X = np.array(X)
    X = X.astype(np.float32)

    # Split training and validation data
    split = int(0.8 * len(X))
    X_train = X[:split]
    X_test = X[split:]
    y_train = y[:split]
    y_test = y[split:]

    return X_train, X_test, y_train, y_test, cols


In [4]:
def log_reg(X_train, X_test, y_train, y_test):
    model = LogisticRegression(max_iter=float('inf'))
    model = model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print(metrics.classification_report(y_test, y_predict))

## Apple Stock - Long-term

In [5]:
X_train, X_test, y_train, y_test, _ = prepare_dataset(start_date='2017-08-09')
t_scaler = MinMaxScaler()
X_train = t_scaler.fit_transform(X_train)
X_test = t_scaler.transform(X_test)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  self._psar_up = pd.Series(index=self._psar.index)
  self._psar_down = pd.Series(index=self._psar.index)


In [6]:
log_reg(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

          -1       0.69      0.17      0.28       115
           1       0.55      0.93      0.69       123

    accuracy                           0.56       238
   macro avg       0.62      0.55      0.48       238
weighted avg       0.62      0.56      0.49       238



In [7]:
y = np.concatenate((y_train, y_test), axis=0)
print(f'y=1 {len(y[y==1])}, y=-1 {len(y[y==-1])}')

y=1 632, y=-1 557


## Microsoft Stock - Long-term

In [8]:
X_train, X_test, y_train, y_test, _ = prepare_dataset(path = 'data/MSFT_data.csv', start_date='2017-08-09')
t_scaler = MinMaxScaler()
X_train = t_scaler.fit_transform(X_train)
X_test = t_scaler.transform(X_test)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  self._psar_up = pd.Series(index=self._psar.index)
  self._psar_down = pd.Series(index=self._psar.index)


In [9]:
log_reg(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

          -1       0.51      0.74      0.61       117
           1       0.56      0.31      0.40       121

    accuracy                           0.53       238
   macro avg       0.54      0.53      0.50       238
weighted avg       0.54      0.53      0.50       238



In [10]:
y = np.concatenate((y_train, y_test), axis=0)
print(f'y=1 {len(y[y==1])}, y=-1 {len(y[y==-1])}')

y=1 658, y=-1 530


## Apple Stock - Short-term

In [11]:
X_train, X_test, y_train, y_test, _ = prepare_dataset(sentiment=True)
t_scaler = MinMaxScaler()
X_train = t_scaler.fit_transform(X_train)
X_test = t_scaler.transform(X_test)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  self._psar_up = pd.Series(index=self._psar.index)
  self._psar_down = pd.Series(index=self._psar.index)


In [12]:
log_reg(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

          -1       0.67      0.91      0.77        11
           1       0.50      0.17      0.25         6

    accuracy                           0.65        17
   macro avg       0.58      0.54      0.51        17
weighted avg       0.61      0.65      0.59        17



In [13]:
y = np.concatenate((y_train, y_test), axis=0)
print(f'y=1 {len(y[y==1])}, y=-1 {len(y[y==-1])}')

y=1 35, y=-1 46


## Microsoft Stock - Short-term

In [14]:
X_train, X_test, y_train, y_test, _ = prepare_dataset(path = 'data/MSFT_data.csv', s_path='data/MSFT_byday_RoBERTa.csv')
t_scaler = MinMaxScaler()
X_train = t_scaler.fit_transform(X_train)
X_test = t_scaler.transform(X_test)

  dip[idx] = 100 * (self._dip[idx] / value)
  din[idx] = 100 * (self._din[idx] / value)
  self._psar_up = pd.Series(index=self._psar.index)
  self._psar_down = pd.Series(index=self._psar.index)


In [15]:
log_reg(X_train, X_test, y_train, y_test)

              precision    recall  f1-score   support

          -1       0.71      0.56      0.63         9
           1       0.56      0.71      0.63         7

    accuracy                           0.62        16
   macro avg       0.63      0.63      0.63        16
weighted avg       0.64      0.62      0.63        16



In [16]:
y = np.concatenate((y_train, y_test), axis=0)
print(f'y=1 {len(y[y==1])}, y=-1 {len(y[y==-1])}')

y=1 38, y=-1 42
