In [None]:
# %matplotlib ipympl 
import numpy as np
import pandas as pd
import csv
import datetime
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from scipy.stats import norm
from scipy.stats import boxcox
from scipy.stats import yeojohnson

In [None]:
# load all data into dataframe
def load_data(path, file_names, aliases, value_columns=None):
    """Read one price series per CSV file and merge them into one table by date.

    Parameters
    ----------
    path : str
        Prefix prepended to every file name (a directory ending in a path
        separator).
    file_names : sequence of str
        CSV base names; '.csv' is appended to each.
    aliases : sequence of str
        Column label for each file, in the same order as ``file_names``.
    value_columns : sequence of int, optional
        Zero-based CSV column holding the value for each file. When omitted,
        the files at positions 0 and 2 are read from column 4 and every other
        file from column 1.

    Returns
    -------
    pandas.DataFrame
        A 'Date' column followed by one column per alias, sorted by date.
        Dates missing from a file are NaN in that file's column.

    Raises
    ------
    ValueError
        If ``aliases`` or ``value_columns`` differ in length from
        ``file_names``, or a value cell cannot be parsed as a float.
    """
    if len(file_names) != len(aliases):
        raise ValueError("file_names and aliases must have the same length")
    if value_columns is None:
        value_columns = [4 if position in (0, 2) else 1 for position in range(len(file_names))]
    elif len(value_columns) != len(file_names):
        raise ValueError("value_columns and file_names must have the same length")

    dates = {}
    for file_name, cur_alias, value_column in zip(file_names, aliases, value_columns):
        with open(path + file_name + '.csv', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='|')
            next(reader, None)  # skip the header row; tolerate an empty file
            for row in reader:
                try:
                    cur_date = datetime.datetime.strptime(row[0], '%m/%d/%Y')
                except (ValueError, IndexError):
                    # blank line or a first cell that is not an m/d/Y date
                    continue
                dates.setdefault(cur_date, {})[cur_alias] = float(row[value_column])

    frame = pd.DataFrame.from_dict(dates, orient='index')
    # Select columns by label: from_dict orders columns by first appearance,
    # which differs from `aliases` when an early date lacks some series.
    frame = frame.reindex(columns=aliases)
    frame.reset_index(inplace=True)
    frame = frame.rename(columns = {'index':'Date'})
    frame = frame.sort_values('Date')
    return frame


In [None]:
# inputs here

# Prefix (ending in a path separator) that load_data prepends to each file name.
path = 'C:\\Users\\plant\\OneDrive\\Documents\\Neural Networks Project\\'

# Alias of the asset the other assets are compared against.
baseline_asset = 'sp'

# CSV base names; load_data appends the '.csv' extension.
file_name_SP = 'SandPPrices'
file_name_RE = 'RealEstateIndexDailySince2013'
file_name_BND = 'isharesBondIndexSince2003'
file_name_EU = 'USD_EURHistoricalData'
file_name_JPY = 'USD_JPYHistoricalData'
file_name_GOLD = 'GoldFuturesHistoricalData'

# (alias, file) pairs; their order fixes both lists passed to load_data.
asset_sources = [
    ("sp", file_name_SP),
    ("re", file_name_RE),
    ("bnd", file_name_BND),
    ("eu", file_name_EU),
    ("jp", file_name_JPY),
    ("gld", file_name_GOLD),
]
data_files = [source_file for _, source_file in asset_sources]
aliases = [source_alias for source_alias, _ in asset_sources]

frame = load_data(path, data_files, aliases)

In [None]:
# Display the merged table returned by load_data (a 'Date' column plus one column per alias).
frame

In [None]:
# Keep only the rows where the baseline asset has a value, renumbered from 0
# so that later row arithmetic on the index is contiguous.
has_baseline = frame[baseline_asset].notnull()
not_null = frame[has_baseline].reset_index(drop=True)

In [None]:
# adds correlation metrics to dataframe
def add_correlaries(cor_assets, cor_days_out, pred_distance, frame, assets):
    """Append look-back and look-ahead percent-change columns to ``frame`` in place.

    For every ``(asset, days_out)`` pair taken from ``cor_assets`` and
    ``cor_days_out`` a column ``"<asset>_<days_out>_dys"`` is appended holding
    the relative change of ``asset`` against its value ``days_out + 1`` rows
    earlier. For every asset in ``assets`` a column
    ``"<asset>_fut_<pred_distance>dys"`` is appended holding the relative
    change from the current row to the row ``pred_distance`` rows later,
    measured against that same asset's current value.

    Rows without enough history (or future) get NaN. Rows are addressed by
    position, so ``frame`` should already be in chronological order.

    Parameters
    ----------
    cor_assets : sequence of str
        Column names to compute look-back changes for.
    cor_days_out : sequence of int
        Look-back setting paired element-wise with ``cor_assets``.
    pred_distance : int
        Number of rows to look ahead.
    frame : pandas.DataFrame
        Table holding one column per asset; modified in place.
    assets : sequence of str
        Column names to compute look-ahead changes for.

    Returns
    -------
    None
    """
    import warnings

    if len(cor_assets) != len(cor_days_out):
        # zip() below pairs the lists only up to the shorter one; say so
        # instead of silently dropping the unmatched entries.
        warnings.warn(
            "cor_assets has %d entries but cor_days_out has %d; "
            "unmatched trailing entries are ignored"
            % (len(cor_assets), len(cor_days_out))
        )

    past_columns = []
    for asset, days_out in zip(cor_assets, cor_days_out):
        prices = frame[asset]
        # NOTE(review): the reference row is days_out + 1 rows back, not
        # days_out — kept as found; confirm this is the intended window.
        reference = prices.shift(days_out + 1)
        change = (prices - reference) / reference
        past_columns.append((asset + "_" + str(days_out) + "_dys", change.to_numpy()))

    future_columns = []
    for asset in dict.fromkeys(assets):  # de-duplicate, keep order
        prices = frame[asset]
        # Each asset is compared against its OWN current value.
        change = (prices.shift(-pred_distance) - prices) / prices
        future_columns.append((asset + "_fut_" + str(pred_distance) + "dys", change.to_numpy()))

    # input into data frame
    for name, values in past_columns + future_columns:
        frame.insert(frame.shape[1], name, values, True)

In [None]:
def add_pred_differences(pred_distance, assets, frame):
    """Append pairwise differences of the look-ahead change columns, in place.

    For each pair of assets where ``later`` follows ``earlier`` in ``assets``,
    a column ``"<later>_<earlier>_<pred_distance>dys_diff"`` is appended equal
    to ``"<later>_fut_<pred_distance>dys"`` minus
    ``"<earlier>_fut_<pred_distance>dys"``. Returns None.
    """
    horizon = str(pred_distance) + "dys"
    for position in range(len(assets)):
        earlier = assets[position]
        for later in assets[position + 1:]:
            earlier_change = frame[earlier + "_fut_" + horizon]
            later_change = frame[later + "_fut_" + horizon]
            column_label = later + "_" + earlier + "_" + horizon + "_diff"
            frame.insert(len(frame.columns), column_label, later_change - earlier_change, True)

In [None]:
# input correlaries
assets = ["sp", "re", "bnd", "eu", "jp", "gld"]
pred_distance = 10

# Look-back features: the asset list repeated twice, paired with the windows below.
cor_assets = assets * 2
# NOTE(review): cor_assets has 12 entries but cor_days_out only 10.
# add_correlaries pairs them with zip(), so "gld" is paired with 5 rather than
# 20 and the final "jp"/"gld" entries get no column. Confirm whether six 20s
# followed by six 5s was intended (the predict call would then need 12 inputs).
cor_days_out = [20] * 5 + [5] * 5

add_correlaries(cor_assets, cor_days_out, pred_distance, not_null, assets)
add_pred_differences(pred_distance, assets, not_null)

In [None]:
# Show the column labels of not_null, including those inserted by
# add_correlaries and add_pred_differences.
not_null.columns

In [None]:
# TODO: use sklearn.preprocessing.PowerTransformer instead

# Histogram of the bond-vs-baseline look-ahead difference, first raw and then
# after a Yeo-Johnson transform, each overlaid with a normal pdf that uses the
# sample mean and standard deviation of the plotted values.
column_name = 'bnd_' + baseline_asset + '_' + str(pred_distance) + 'dys_diff'
column = not_null[column_name].dropna()
print(column.describe())

raw_values = column.to_numpy()
transformed_values = yeojohnson(raw_values)[0]  # [0] is the transformed data, [1] the fitted lambda

for values, x_axis, title in [
    (raw_values, np.arange(-.15, .15, 0.01), column_name),
    (transformed_values, np.arange(-.3, .3, 0.01), column_name + ' (Yeo-Johnson)'),
]:
    mean = np.mean(values)
    std = np.std(values)
    print(mean)
    print(std)

    fig, ax = plt.subplots()
    ax.hist(values, color='red', bins=500, density=True)
    ax.plot(x_axis, norm.pdf(x_axis, mean, std))
    ax.set_title(title)
    ax.set_xlabel(column_name)
    ax.set_ylabel('density')

plt.show()

In [None]:
# fig = plt.figure()
# ax = fig.add_subplot(projection = '3d')

# ax.scatter(not_null["sp_last_month"], not_null["re_last_month"], not_null["re_sp_2wk_diff"])
# ax.set_xlabel('sp_last_month')
# ax.set_ylabel('re_last_month')
# ax.set_zlabel('re_sp_2wk_diff')
# plt.show()

In [None]:
def predict(asset, baseline, df, aliases, pred_distance, inputs, get_plots=False):
    """Estimate the distribution of an asset-vs-baseline difference given feature values.

    A multivariate normal is fitted (column means and covariance) to the
    feature columns of ``df`` plus the target column
    ``"<asset>_<baseline>_<pred_distance>dys_diff"``. Feature columns are all
    columns that are not in ``aliases``, are not "Date", and do not contain
    "diff" or "fut" in their name. The joint density is then evaluated on a
    grid of target values in [-0.25, 0.25) with the features fixed at
    ``inputs``, and normalised over that grid.

    Parameters
    ----------
    asset, baseline : str
        Aliases used to build the target column name.
    df : pandas.DataFrame
        Table holding the feature columns and the target column.
    aliases : sequence of str
        Raw price column names to exclude from the features.
    pred_distance : int
        Look-ahead distance used in the target column name.
    inputs : sequence of float
        One value per feature column, in ``df`` column order. Not modified.
    get_plots : bool, optional
        When True, plot the normalised pdf and the cdf.

    Returns
    -------
    (float, float)
        The grid value at which the cumulative probability first reaches 0.5,
        and the expected value over the grid.

    Raises
    ------
    ValueError
        If ``inputs`` does not have one value per feature column, or the
        density is zero / not finite everywhere on the grid.
    """
    START = -.25
    STOP = .25
    INCREMENT = .001

    # feature columns followed by the target column
    columns = [
        col for col in df.columns
        if col not in aliases and col != "Date" and "diff" not in col and "fut" not in col
    ]
    columns.append(asset + "_" + baseline + "_" + str(pred_distance) + "dys_diff")

    if len(inputs) != len(columns) - 1:
        raise ValueError(
            "expected %d input values (one per feature column), got %d"
            % (len(columns) - 1, len(inputs))
        )

    selected = df[columns]
    cov_mat = selected.cov().to_numpy()
    means = selected.mean().to_numpy()

    rv = multivariate_normal(mean=means, cov=cov_mat, allow_singular=True)

    # One evaluation point per grid value: features fixed, target varying.
    # Built in a fresh array so the caller's `inputs` is left untouched.
    x = np.arange(START, STOP, INCREMENT)
    points = np.empty((x.size, len(columns)))
    points[:, :-1] = np.asarray(inputs, dtype=float)
    points[:, -1] = x
    probs = np.atleast_1d(rv.pdf(points))

    # Trapezoid rule: probability mass of each interval [x[i], x[i + 1]].
    masses = (probs[:-1] + probs[1:]) / 2 * INCREMENT
    total = masses.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError("density is zero or not finite over the whole grid for these inputs")
    masses = masses / total
    cdf = np.cumsum(masses)  # cdf[i] is the mass up to x[i + 1]

    if get_plots:
        fig1, ax = plt.subplots()
        ax.set_title("pdf")
        ax.set_xlabel("difference between performance of " + asset + " and " + baseline)
        ax.set_ylabel("probability")
        ax.plot(x, probs / total)
        plt.show()

        fig2, ax = plt.subplots()
        ax.set_title("cdf")
        ax.set_xlabel("difference between performance of " + asset + " and " + baseline)
        ax.set_ylabel("probability")
        ax.plot(x[1:], cdf)
        plt.show()

    # find 50% point: right edge of the first interval where the cdf reaches 0.5
    half_idx = min(int(np.searchsorted(cdf, .5, side="left")), cdf.size - 1)
    fiftyfiftypt = float(x[half_idx + 1])
    print("50 50 chance to be above or below")
    print(fiftyfiftypt)

    # find expected value: interval masses weighted by interval midpoints
    midpoints = (x[:-1] + x[1:]) / 2
    expected_value = float(np.sum(masses * midpoints))
    print("Expected Value")
    print(expected_value)
    return fiftyfiftypt, expected_value

In [None]:
# Evaluate gold against the baseline asset with all ten feature inputs set to 0.05.
feature_inputs = [.05] * 10
predict("gld", baseline_asset, not_null, aliases, pred_distance, feature_inputs, get_plots=False)