In [25]:
!pip install --user -q -r requirements.txt

ERROR: Can not perform a '--user' install. User site-packages are not visible in this virtualenv.
You should consider upgrading via the 'C:\Users\lucaa\PycharmProjects\SCCProject\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
MODELS_DIR = 'models/'
DATA_DIR = 'data/'
RESULT_DIR = 'results'
stocks = {'Apple':'AAPL','Microsoft':'MSFT','Ibm':'IBM'}
stocks_metrics = dict()

In [2]:
def finance_dataset(stock):

    app = Ticker(stock)
    dataset = app.history(period='max')
    dataset = dataset.drop(columns=['Dividends', 'Stock Splits'])

    column_list = (dataset.columns.values.tolist())
    column_list.insert(0, column_list.pop())
    dataset = dataset.reindex(columns=column_list)

    return dataset


def series_to_supervised(dataset, n_back=10, n_forward=1, dropnan=True):
    data = dataset.values.astype('float32')
    columns = [x.lower() for x in dataset.columns.values]
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()

    # input sequence (t-n, ... t-1)
    for i in range(n_back, 0, -1):
        cols.append(df.shift(i))
        names += [f'{columns[j]}[t-{i}]' for j in range(n_vars)]

    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_forward + 1):
        cols.append(df.shift(-i))
        names += [f'{columns[j]}[t{"" if i == 0 else f"+{i}"}]' for j in range(n_vars)]

    # put it all together
    time_shifted = concat(cols, axis=1)
    time_shifted.columns = names
    # drop rows with NaN values
    if dropnan:
        time_shifted.dropna(inplace=True)
    time_shifted = time_shifted.drop(time_shifted.columns[-5:-1], axis=1)
    return time_shifted


def split_dataset(dataset, train_percentage=.7):
    dataset = dataset.copy()
    last_column = dataset.pop(dataset.columns[-1])

    y = last_column.values
    X = dataset.values
    return train_test_split(X, y, train_size=train_percentage)

In [3]:
import json
import joblib
import keras
import numpy as np
import xgboost
from pandas import DataFrame, concat
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from yfinance import Ticker
from lightgbm import LGBMRegressor

2022-12-29 17:46:26.280077: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from pandas import MultiIndex, Int64Index


In [4]:
def load_data(stock,out_path,back = 10,forward = 1):

    dataset = finance_dataset(stock)
    supervised_dataset = series_to_supervised(dataset, back, forward)
    X_train, X_test, y_train, y_test = split_dataset(supervised_dataset, train_percentage=.7)

    data = {'X_train': X_train.tolist(),'y_train': y_train.tolist(),
            'X_test': X_test.tolist(),'y_test': y_test.tolist()}

    data_json = json.dumps(data)

    with open(out_path, 'w') as out_file:
        out_file.truncate(0)
        json.dump(data_json, out_file)

In [5]:
def manipulate_data(dir_path,out_path):
    files_path = [str(x) for x in Path(dir_path).rglob('*') if x.is_file()]
    datasets = []

    for file in files_path:
        with open(file) as data_file:
            json_str = json.load(data_file)
            datasets.append(json.loads(json_str))
            
    X_train, X_test, y_train, y_test = [], [], [], []
    for data in datasets:
        X_train.extend(data['X_train'])
        y_train.extend(data['y_train'])
        X_test.extend(data['X_test'])
        y_test.extend(data['y_test'])

    whole_dataset = {'X_train': X_train,'y_train': y_train,
                     'X_test': X_test,'y_test': y_test}

    data_json = json.dumps(whole_dataset)
    with open(out_path, 'w') as out_file:
        out_file.truncate(0)
        json.dump(data_json, out_file)

In [6]:
def decision_tree(data_path, name="whole"):
    with open(data_path) as data_file:
        data = json.load(data_file)
    data = json.loads(data)

    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']

    model = LGBMRegressor()
    model.fit(X_train, y_train)
    
    joblib.dump(model,MODELS_DIR + name + '_LGBMRegressor.joblib')

    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)


In [7]:
from xgboost import XGBRegressor


def decision_tree_2(data_path, name="whole"):
    with open(data_path) as data_file:
        data = json.load(data_file)
    data = json.loads(data)

    X_train = data['X_train']
    y_train = data['y_train']
    X_test = data['X_test']
    y_test = data['y_test']

    model = XGBRegressor()
    model.fit(X_train, y_train)

    joblib.dump(model,MODELS_DIR + name + '_XGBRegressor.joblib')

    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

In [8]:
def show_results(stocks_metrics):
    stocks_metrics = sorted(stocks_metrics.items(), key=lambda x:x[1])
    
    for key,value in stocks_metrics:
        print(f'[{key}] {value}')


In [9]:
stocks = {'Apple':'AAPL','Microsoft':'MSFT','Ibm':'IBM'}
stocks_metrics = dict()

for name,index in stocks.items():
    load_data(index, DATA_DIR+name+'.json')
    rmse = decision_tree(DATA_DIR+name+'.json', index)
    stocks_metrics[name] = rmse

manipulate_data(DATA_DIR, DATA_DIR+'whole_dataset.json')
rmse = decision_tree(DATA_DIR+'whole_dataset.json')
stocks_metrics['Whole'] = rmse

show_results(stocks_metrics)


[Ibm] 0.9649847994712346
[Apple] 1.0474779167412467
[Whole] 2.440151594145422
[Microsoft] 3.1645384937700847


In [10]:
stocks = {'Apple':'AAPL','Microsoft':'MSFT','Ibm':'IBM'}
stocks_metrics = dict()

for name,index in stocks.items():
    load_data(index, DATA_DIR+name+'.json')
    rmse = decision_tree_2(DATA_DIR+name+'.json', index)
    stocks_metrics[name] = rmse

manipulate_data(DATA_DIR, DATA_DIR+'whole_dataset.json')
rmse = decision_tree_2(DATA_DIR+'whole_dataset.json')
stocks_metrics['Whole'] = rmse

show_results(stocks_metrics)


[Whole] 0.6787970361457105
[Apple] 0.8858530172741975
[Ibm] 1.0265390862192958
[Microsoft] 3.8065624451122


In [None]:
model = joblib.load("models/Apple_XGBRegressor.joblib")

app = Ticker('AAPL')
dataset = app.history(start='2022-11-06',end='2022-12-22')
dataset = dataset.drop(columns=['Dividends', 'Stock Splits'])
print(dataset)
column_list = (dataset.columns.values.tolist())
column_list.insert(0, column_list.pop())
dataset = dataset.reindex(columns=column_list)

banale = series_to_supervised(dataset,2,2).drop(columns=['close[t+2]'])

print(model.predict(banale))