* Wrangle your data. Get it into the notebook in the best form possible for your analysis and model building.

* Explore your data. Make visualizations and conduct statistical analyses to explain what’s happening with your data, why it’s interesting, and what features you intend to take advantage of for your modeling.

* Build a modeling pipeline. Your model should be built in a coherent pipeline of linked stages that is efficient and easy to implement.

* Evaluate your models. You should have built multiple models, which you should thoroughly evaluate and compare via a robust analysis of residuals and failures.

* Present and thoroughly explain your product. Describe your model in detail: why you chose it, why it works, what problem it solves, and how it will run in a production-like environment. What would you need to do to maintain it going forward?

In [1]:
import pandas as pd
import numpy as np
import sys
import time
import random
import matplotlib.pyplot as plt
import ccxt
import os
import statistics
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
%matplotlib inline

Historical market cap was downloaded from https://coin.dance/stats/marketcaphistorical.

In [None]:
file = 'data/historical_market_cap.csv'
df = pd.read_csv(file)

# The first column holds the calendar date. read_csv produces string column
# labels, so renaming the integer label 0 would silently match nothing;
# rename the first column by position instead.
df.rename(columns={df.columns[0]: 'date'}, inplace=True)

# Convert 'MM/DD/YYYY' strings to epoch seconds. time.mktime interprets the
# parsed date in the machine's local time zone.
# `dates` is kept as a notebook-level name for later cells.
dates = [int(time.mktime(time.strptime(day, '%m/%d/%Y'))) for day in df['date']]
df['date'] = dates

# Total market cap
df['Total Market Cap'] = df['Altcoin Market Cap'] + df['Bitcoin Market Cap']

# Save file, indexed by the epoch date.
# NOTE: this writes back to the same path that was read above, so running
# the cell a second time feeds epoch integers to strptime and fails.
df.set_index('date', drop=True, inplace=True)
df.to_csv(file)

Historical market prices of coins

In [None]:
# Calendar days (MM/DD/YYYY, local time) for every epoch stamp in `dates`,
# the list built in the market-cap cell.
date_range = [time.strftime('%m/%d/%Y', time.localtime(day)) for day in dates]
wanted_days = set(date_range)  # constant-time membership test per candle

# Use coins listed on Bittrex
primary_exchange = ccxt.bittrex({'options': {'adjustForTimeDifference': True}})
market = primary_exchange.load_markets()
tickers = list(market.keys())

# Since the tickers are formatted like 'ETH/BTC', split by the '/' and keep
# each symbol once (BTC itself is handled separately below).
coins = list({
    coin
    for ticker in tickers
    for coin in ticker.split('/')
    if coin != 'BTC'
})

# Now we have to convert all the coins back to coin/BTC to find prices.
# Since we can't pull BTC/BTC, use the BTC/USDT ticker for Bitcoin.
tickers = [coin + '/BTC' for coin in coins]
coins.insert(0, 'BTC')
tickers.insert(0, 'BTC/USDT')

# Collect complete price series first and build the DataFrame in one step.
prices = {}
for ticker in tickers:
    # Unfortunately, some coins may not have a coin/BTC market, so skip any
    # ticker the exchange cannot serve instead of aborting the whole download.
    try:
        candles = primary_exchange.fetch_ohlcv(ticker, '1d')
    except ccxt.BaseError:
        continue

    # The first two fields of each candle are used: the timestamp in
    # milliseconds and the price that follows it.
    coin_prices = [
        candle[1]
        for candle in candles
        if time.strftime('%m/%d/%Y', time.localtime(candle[0] / 1000)) in wanted_days
    ]

    # Only add coin if it has price data for the whole time frame
    if len(coin_prices) == len(date_range):
        prices[ticker.split('/')[0]] = coin_prices

# Dataframe of coin prices, one row per epoch date
df = pd.DataFrame(prices, index=pd.Index(dates, name='date'))

# Since all coins except BTC are still in BTC denomination, multiply each of
# them by the BTC (USDT) price of the same day to get a $ price.
if 'BTC' not in df.columns:
    raise RuntimeError('BTC/USDT price history is incomplete; cannot convert prices to USD')
btc_usd = df['BTC']
for coin in df.columns.drop('BTC'):
    df[coin] = df[coin] * btc_usd

df.to_csv('data/historical prices.csv')

HODL simulation function

In [None]:
def simulate_HODL(hist_prices, num_sims=1000):
    """Simulate buy-and-hold ("HODL") portfolios of randomly chosen coins.

    Every simulation draws ``num_coins`` distinct coins, spends ``amt_each``
    on each of them at the first row's price, and values that fixed basket on
    every row of ``hist_prices``.

    Reads the notebook-level names ``coins``, ``num_coins``, ``amt_each`` and
    ``sim_dates``, and writes the result to ``data/HODL.csv``.

    Parameters
    ----------
    hist_prices : numpy.ndarray
        2-D price array indexed as ``[day, coin]``; column ``i`` is expected
        to hold the prices of ``coins[i]``.
    num_sims : int, optional
        Number of random portfolios to draw (default 1000).

    Returns
    -------
    pandas.DataFrame
        One column per portfolio, labelled with the dash-joined coin symbols
        and indexed by ``sim_dates``. Two draws that produce the same label
        share a single column (the later draw wins).
    """
    portfolios = {}

    for _ in range(num_sims):
        # Randomly select a basket of coins. Sample over *all* coin indices;
        # range(len(coins) - 1) would never pick the last coin.
        picks = random.sample(range(len(coins)), num_coins)

        # Amount of each coin bought on day 0
        coin_amts = amt_each / hist_prices[0, picks]

        # Use the chosen coins as the column name
        label = '-'.join(coins[i] for i in picks)

        # Daily portfolio value: prices of the chosen coins dotted with the
        # fixed coin amounts
        portfolios[label] = hist_prices[:, picks].dot(coin_amts)

    # Build the frame in one go instead of inserting columns one at a time.
    sims = pd.DataFrame(portfolios, index=sim_dates)
    sims.to_csv('data/HODL.csv')
    return sims

Rebalance simulation function

In [None]:
def simulate_rebalance(hodl_df, hist_prices, thresh=0.05, fee_rate=0.0025, tax_rate=0.25):
    """Re-run every HODL portfolio with daily threshold rebalancing.

    For each column of ``hodl_df`` the same coins are bought on day 0 and, on
    every following day, value is moved from the heaviest to the lightest
    holding until the smaller of their deviations from the equal weight drops
    below the threshold. Fees and taxes on the sold coins are accumulated.

    Reads the notebook-level names ``num_coins``, ``coins``, ``tickers``,
    ``start_amt``, ``amt_each``, ``sim_dates`` and ``file_path``. Writes
    ``rebalanced.csv`` (daily totals per portfolio) and ``summary.csv`` (one
    row per portfolio) under ``file_path``.

    Parameters
    ----------
    hodl_df : pandas.DataFrame
        HODL results; each column label is a dash-joined list of coin symbols
        and the last row is that portfolio's final value.
    hist_prices : pandas.DataFrame or array-like
        Prices indexed as ``[day, coin]``. A DataFrame is expected to carry
        the date in its first column, which is dropped. After that, column
        ``i`` is expected to hold the prices of ``coins[i]``.
    thresh : float, optional
        Fraction of the equal weight (``1 / num_coins``) that a deviation must
        reach to trigger a trade. Must be positive.
    fee_rate : float, optional
        Fee per trade as a fraction of the traded amount. Doubled when no
        direct market exists between the two coins.
    tax_rate : float, optional
        Rate applied to gains, both on rebalancing sales and on the HODL
        end value.

    Raises
    ------
    ValueError
        If ``thresh`` is not positive (the rebalancing loop would never end).
    """
    if not thresh > 0:
        raise ValueError('thresh must be positive')

    # Threshold expressed as an absolute portfolio weight
    avg_weight = 1 / num_coins
    weighted_thresh = np.float32(avg_weight * thresh)

    # One column per HODL simulation; use the argument, not a notebook global
    cols = hodl_df.columns.tolist()

    # Convert to numpy for vector arithmetic
    hodl_sims = np.array(hodl_df)
    if isinstance(hist_prices, pd.DataFrame):
        hist_prices = np.array(hist_prices[hist_prices.columns[1:]])  # Exclude date column
    else:
        hist_prices = np.asarray(hist_prices)
    num_days = len(hist_prices)

    # Results to be transformed to CSVs
    sim_summary = []
    rebalance_sims = np.empty(shape=(len(cols), num_days))

    # Loop for each simulation, using the same coin combinations as HODL
    for num, col in enumerate(cols):
        coin_list = col.split('-')
        # Column number of each coin symbol in the price array
        coin_list_index = [coins.index(coin) for coin in coin_list]

        fees, taxes_rebalanced = 0, 0

        # Starting list of our daily totals
        daily_totals = [start_amt]

        # Reduce the price array to the coins of this simulation (improves performance)
        small_hist_prices = hist_prices[:, coin_list_index]

        # Initial purchase prices, tracked as average cost per coin
        avg_prices = small_hist_prices[0].tolist()

        # Starting coin amounts
        coin_amts = amt_each / small_hist_prices[0]

        # Simulate each day (starting at day 1)
        for day in range(1, num_days):
            while True:
                # Dollar value of each coin at that day's prices
                d_vals = small_hist_prices[day] * coin_amts
                d_val_sum = d_vals.sum()
                l_index, h_index = d_vals.argmin(), d_vals.argmax()

                # How far the lightest and heaviest coin deviate from the
                # average weight; only the smaller deviation is moved
                weight_to_move = min(
                    avg_weight - d_vals[l_index] / d_val_sum,
                    d_vals[h_index] / d_val_sum - avg_weight,
                )
                # Written as "not >=" so that a NaN deviation (missing price)
                # also ends the loop instead of spinning forever.
                if not weight_to_move >= weighted_thresh:
                    break

                # Does a ticker for the coin pair exist? (Sometimes it doesn't: e.g. XRP/OMG)
                # If it doesn't, the trade goes through BTC, which takes two trades
                ratios = {
                    coin_list[l_index] + '/' + coin_list[h_index],
                    coin_list[h_index] + '/' + coin_list[l_index],
                }
                rate = fee_rate if ratios & tickers else 2 * fee_rate

                d_amt = weight_to_move * d_val_sum
                fees += d_amt * rate

                # Coin quantities to buy/sell at the current market price;
                # the sold quantity is increased by the fee rate
                l_quantity = d_amt / small_hist_prices[day, l_index]
                h_quantity = d_amt / small_hist_prices[day, h_index] * (1 + rate)

                # Tax on the gain of the sold coin relative to its average cost
                price_diff = small_hist_prices[day, h_index] - avg_prices[h_index]
                taxes_rebalanced += price_diff * h_quantity * tax_rate

                # Adjust average purchase price for the bought coin
                avg_prices[l_index] = (
                    avg_prices[l_index] * coin_amts[l_index]
                    + small_hist_prices[day, l_index] * l_quantity
                ) / (coin_amts[l_index] + l_quantity)

                # Adjust coin quantities
                coin_amts[l_index] += l_quantity
                coin_amts[h_index] -= h_quantity

            # Document total portfolio value on that day
            daily_totals.append(np.dot(small_hist_prices[day], coin_amts))

        # Document important features of the simulation
        end_price_HODL = hodl_sims[-1, num]
        end_price_rebalanced = daily_totals[-1]
        taxes_HODL = (end_price_HODL - start_amt) * tax_rate

        # Save simulation results
        sim_summary.append(
            [col, fees, taxes_HODL, end_price_HODL, taxes_rebalanced, end_price_rebalanced]
        )
        rebalance_sims[num] = daily_totals

    # Convert back to pandas DataFrames to save to CSV
    rebalance_simulations = pd.DataFrame(np.transpose(rebalance_sims), columns=cols, index=sim_dates)
    rebalance_simulations.to_csv(file_path + 'rebalanced.csv')

    simulation_summary = pd.DataFrame(
        sim_summary,
        columns=[
            'portfolio',
            'total_fees',
            'taxes_HODL',
            'end_price_HODL',
            'taxes_rebalanced',
            'end_price_rebalanced',
        ],
    )
    simulation_summary.to_csv(file_path + 'summary.csv', index=False)

Simulations

In [None]:
# NOTE(review): `file_path` is not assigned in this cell; it must already
# exist in the notebook namespace before this cell runs.
hist_prices = pd.read_csv(file_path + 'historical prices.csv')

# The first column of the CSV is the date; every other column is a coin
coins = hist_prices.columns.tolist()[1:]
sim_dates = hist_prices.iloc[:, 0].tolist()

# Exclude date column from historical prices
historical_prices = np.array(hist_prices[coins])

# get date ranges used for simulations
historical_cap = pd.read_csv(file_path + 'historical market cap.csv')
historical_cap = np.array(historical_cap)

if len(historical_cap) <= 365:
    raise ValueError('Need more than 365 rows of market-cap history to pick a one-year window')

start_dates = historical_cap[:len(historical_cap) - 365]
end_dates = historical_cap[365:]

# Change of the value in the 4th column over each 365-row window
cap_diffs = list(end_dates[:, 3] - start_dates[:, 3])

# Make sure there's an odd number of windows, so the median is an actual
# element and can be looked up by value
if len(cap_diffs) % 2 == 0:
    cap_diffs.pop()

# Start row for simulations: the window with the median change
start_date = cap_diffs.index(np.median(cap_diffs))

# Limit prices and dates to that window.
# NOTE(review): assumes the price and market-cap CSVs cover the same days
# row for row — confirm.
window = slice(start_date, start_date + 365)
historical_prices = historical_prices[window]
sim_dates = sim_dates[window]
hist_prices = hist_prices.iloc[window]

# Retrieve all current tickers on exchange
exchange = ccxt.bittrex()
tickers = set(exchange.fetch_tickers())

# Start with $5000, split equally across the coins of each portfolio
start_amt = 5000
num_coins = 5
amt_each = start_amt / num_coins

# simulate_HODL indexes a plain price array; simulate_rebalance takes the
# HODL results plus the price table including its date column.
df = simulate_HODL(historical_prices)
simulate_rebalance(df, hist_prices)

In [None]:
path = 'C:/Users/Carter/Documents/Github/Thinkful__Projects/Final Capstone/'

# DataFrames we've created
historical_df = pd.read_csv(path + 'data/historical prices.csv')
hodl_df = pd.read_csv(path + 'hodl.csv')
rebalanced_df = pd.read_csv(path + 'rebalanced.csv')
summary_df = pd.read_csv(path + 'summary.csv')

# Date range covered by the historical price file: first and last entry of
# its first column, which holds the epoch date. Read by position so the
# lookup does not depend on how the index column was labelled when saved.
date_column = historical_df.iloc[:, 0]
start_date, end_date = int(date_column.iloc[0]), int(date_column.iloc[-1])

# The epoch stamps are created with time.mktime (local time) in the
# market-cap cell, so decode them with localtime; gmtime can shift the
# printed date by a day depending on the machine's UTC offset.
start_date = time.strftime('%m/%d/%Y', time.localtime(start_date))
end_date = time.strftime('%m/%d/%Y', time.localtime(end_date))


# list of coins used in each portfolio simulation
coins = historical_df.columns[1:].tolist()
cols = hodl_df.columns[1:]
# For each simulation, make a list of the coins randomly chosen
coin_lists = [i.split('-') for i in cols]

print('Coins used in analysis', coins)
print('Date range of simulation: {} - {}'.format(start_date, end_date))

In [None]:
# After-tax end value of every portfolio under each strategy
# Note: explain how taxes were calculated
hodl_net = summary_df['end_price_HODL'] - summary_df['taxes_HODL']
rebalanced_net = summary_df['end_price_rebalanced'] - summary_df['taxes_rebalanced']
end_price_HODL = np.array(hodl_net)
end_price_rebalanced = np.array(rebalanced_net)

# Gain of rebalancing over HODL, relative to the HODL result
relative_gain = (end_price_rebalanced - end_price_HODL) / end_price_HODL
performance = list(relative_gain)

In [None]:
# Dataframe to compare coin impact on outperforming HODL
df = pd.DataFrame(columns=coins)
df['beat market'] = performance
df['beat market'] = df['beat market'] > 0
df.fillna(False, inplace=True)

# Fill Dataframe with coins used for each simulation
for i in range(len(coin_lists)):
    for coin in coin_lists[i]:
        df.loc[i, coin] = True

In [None]:
# Feature importance analysis
tree = RandomForestClassifier()
X = df[coins]
Y = df['beat market']
tree.fit(X, Y)

feature_importance = tree.feature_importances_
feature_importance = 100 * (feature_importance / max(feature_importance))
temp = feature_importance.tolist()

# Take only top 10 features
top_feats = sorted(feature_importance,reverse=True)[:10]
sorted_features = np.array([temp.index(feat) for feat in top_feats])
pos = np.arange(sorted_features.shape[0]) + .5
plt.barh(pos, feature_importance[sorted_features], align='center')
plt.yticks(pos, X.columns[sorted_features])
plt.show()