* Wrangle your data. Get it into the notebook in the best form possible for your analysis and model building.

* Explore your data. Make visualizations and conduct statistical analyses to explain what’s happening with your data, why it’s interesting, and what features you intend to take advantage of for your modeling.

* Build a modeling pipeline. Your model should be built in a coherent pipeline of linked stages that is efficient and easy to implement.

* Evaluate your models. You should have built multiple models, which you should thoroughly evaluate and compare via a robust analysis of residuals and failures.

* Present and thoroughly explain your product. Describe your model in detail: why you chose it, why it works, what problem it solves, and how it will run in a production-like environment. What would you need to do to maintain it going forward?

In [4]:
import pandas as pd
import numpy as np
import sys
import time
import matplotlib.pyplot as plt
import ccxt
import os
import statistics
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
%matplotlib inline

In [8]:
path = 'C:/Users/Carter/Documents/Github/Thinkful__Projects/Final Capstone/'

# DataFrames we've created
# NOTE(review): the historical prices are read from data/ while the other
# three CSVs are read from the project root -- confirm that layout is intended.
historical_df = pd.read_csv(path + 'data/historical prices.csv')
hodl_df = pd.read_csv(path + 'hodl.csv')
rebalanced_df = pd.read_csv(path + 'rebalanced.csv')
summary_df = pd.read_csv(path + 'summary.csv')

# Date range used for simulations: first and last rows of the price history.
# Positional .iloc is used so this works whatever the index labels are.
# Presumably 'date' holds Unix epoch seconds (it is fed to time.gmtime) --
# TODO confirm against the file that produced the CSV.
start_date = historical_df['date'].iloc[0]
end_date = historical_df['date'].iloc[-1]
start_date = time.strftime('%m/%d/%Y', time.gmtime(start_date))
end_date = time.strftime('%m/%d/%Y', time.gmtime(end_date))


# list of coins used in each portfolio simulation
# (every column of the price history after the first one)
coins = historical_df.columns[1:].tolist()
cols = hodl_df.columns[1:]
# For each simulation, make a list of the coins randomly chosen.
# Each hodl column name is split on '-' to recover that simulation's coins.
coin_lists = [i.split('-') for i in cols]

print('Coins used in analysis', coins)
print('Date range of simulation: {} - {}'.format(start_date, end_date))

In [None]:
# Net end prices: final portfolio value minus the taxes owed, per simulation.
# TODO: explain how taxes were calculated
end_price_HODL = np.array(
    summary_df['end_price_HODL'].sub(summary_df['taxes_HODL'])
)
end_price_rebalanced = np.array(
    summary_df['end_price_rebalanced'].sub(summary_df['taxes_rebalanced'])
)

# Relative gain of rebalancing over HODL for each simulation
# (positive means the rebalanced portfolio finished ahead).
performance = list(
    np.divide(
        np.subtract(end_price_rebalanced, end_price_HODL),
        end_price_HODL,
    )
)

In [None]:
# Dataframe to compare coin impact on outperforming HODL.
# One row per simulation and one boolean column per coin: True when that coin
# was part of the simulation's portfolio. Building the whole matrix up front
# gives real bool columns and avoids cell-by-cell .loc writes.
membership = [
    [coin in chosen for coin in coins]
    for chosen in map(set, coin_lists)
]
df = pd.DataFrame(membership, columns=coins, dtype=bool)

# Target: did the rebalanced portfolio finish ahead of HODL?
# NaN performance compares as False. Assigning an array (not a Series) means
# a length mismatch between coin_lists and performance raises immediately
# instead of silently producing misaligned rows.
df['beat market'] = np.asarray(performance) > 0

In [None]:
# Feature importance analysis: fit a random forest that predicts the
# 'beat market' flag from the per-coin columns, then rank the coins by
# the importance the forest assigns them.
# NOTE: no random_state is passed, so the ranking can differ between runs.
tree = RandomForestClassifier()
X = df[coins]
Y = df['beat market']
tree.fit(X, Y)

# Rescale so the strongest feature reads 100. Skip the rescale when every
# importance is zero, which would otherwise turn the whole array into NaN.
feature_importance = tree.feature_importances_
peak = feature_importance.max()
if peak > 0:
    feature_importance = 100 * (feature_importance / peak)

# Take only top 10 features.
# argsort yields one distinct index per feature, so features with equal
# importance are not collapsed onto the same column (looking values up with
# list.index would return the first match every time). The stable sort keeps
# tied features in their original column order.
sorted_features = np.argsort(-feature_importance, kind='stable')[:10]
top_feats = feature_importance[sorted_features]
pos = np.arange(sorted_features.shape[0]) + .5
plt.barh(pos, top_feats, align='center')
plt.yticks(pos, X.columns[sorted_features])
plt.show()