In [1]:
pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.1.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imbalanced-learn>=0.12.0 (from py

In [2]:
pip install ruptures

Collecting ruptures
  Downloading ruptures-1.1.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ruptures
Successfully installed ruptures-1.1.9


In [3]:
import pandas as pd
import numpy as np
import datetime as dt

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.offline

from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.arima.model import ARIMA
from pandas.tseries.offsets import DateOffset

In [4]:
df = pd.read_csv("/Bitcoin_Clean_V1.csv")
df

Unnamed: 0,Date,Price,Transaction_count,Unique_addresses,Total_fees_USD
0,2022-01-01,47155.476562,186834.0,446959.0,4.391292e+05
1,2022-01-02,47257.339844,196865.0,475472.0,4.376038e+05
2,2022-01-03,46817.292969,234793.0,641264.0,5.418452e+05
3,2022-01-04,46417.824219,274544.0,665948.0,6.167535e+05
4,2022-01-05,45880.234375,272155.0,701822.0,7.250674e+05
...,...,...,...,...,...
817,2024-03-28,70390.375000,428670.0,686127.0,2.126428e+06
818,2024-03-29,70047.835938,449421.0,579308.0,2.052532e+06
819,2024-03-30,69993.375000,490072.0,603660.0,2.091845e+06
820,2024-03-31,70440.375000,319675.0,440038.0,1.976385e+06


In [5]:
df.columns

Index(['Date', 'Price', 'Transaction_count', 'Unique_addresses',
       'Total_fees_USD'],
      dtype='object')

Univariate Analysis

In [6]:
fig = make_subplots(rows=2, cols=2, subplot_titles=('<b>Distr. of Price</b>',
                                                    '<b>Distr. of Transaction Count</b>',
                                                   '<b>Distr. of Unique Addresses</b>',
                                                   '<b>Distr. of Total Fees</b>'))

fig.add_trace(go.Histogram(x=df['Price'].dropna()), row=1, col=1)
fig.add_trace(go.Histogram(x=df['Transaction_count'].dropna()), row=1, col=2)
fig.add_trace(go.Histogram(x=df['Unique_addresses'].dropna()), row=2, col=1)
fig.add_trace(go.Histogram(x=df['Total_fees_USD'].dropna()), row=2, col=2)

# Update visual layout
fig.update_layout(
    showlegend=False,
    width=600,
    height=400,
    autosize=False,
    margin=dict(t=15, b=0, l=5, r=5),
    template="plotly_white",
    colorway=px.colors.qualitative.Prism ,
)
# update font size at the axes
fig.update_coloraxes(colorbar_tickfont_size=10)
# Update font in the titles: Apparently subplot titles are annotations (Subplot font size is hardcoded to 16pt · Issue #985)
fig.update_annotations(font_size=12)
# Reduce opacity
fig.update_traces(opacity=0.75)

fig.show()

In [7]:
fig = make_subplots(rows=2, cols=2, subplot_titles=('<b>Summary Stats on Price</b>',
                                                    '<b>Summary Stats on Transaction Count</b>',
                                                   '<b>Summary Stats on Unique Addresses</b>',
                                                   '<b>Summary Stats on Total Fees</b>'))

fig.add_trace(go.Box(x=df['Price'].dropna(), name='Price'), row=1, col=1)
fig.add_trace(go.Box(x=df['Transaction_count'].dropna(), name='Transaction<br>Count'), row=1, col=2)
fig.add_trace(go.Box(x=df['Unique_addresses'].dropna(), name='Unique<br>Addresses'), row=2, col=1)
fig.add_trace(go.Box(x=df['Total_fees_USD'].dropna(), name='Total<br>Fees'), row=2, col=2)

# Update visual layout
fig.update_layout(
    showlegend=False,
    width=800,
    height=400,
    autosize=False,
    margin=dict(t=15, b=0, l=55, r=55),
    template="plotly_white",
    colorway=px.colors.qualitative.Prism ,
)
# update font size at the axes
fig.update_coloraxes(colorbar_tickfont_size=10)
# Update font in the titles: Apparently subplot titles are annotations (Subplot font size is hardcoded to 16pt · Issue #985)
fig.update_annotations(font_size=12)
# Reduce opacity
fig.update_traces(opacity=0.75)

fig.show()

Summary Stats on Price, Transaction Count, Total Fees are:

Skewed to the left

Unique Addresses seem to be an outlier

**Time Series Analysis**

In [8]:
dfg = df[['Date', 'Price']]
 # Set index
dfg = dfg.set_index('Date')
# Resample by month end date, avg value  of observations per day
dfg_agg = dfg.resample(rule= 'M').mean()

fig = px.line(dfg_agg, x=dfg_agg.index, y="Price",
                 width=900, height=400,
                color_discrete_sequence=px.colors.qualitative.Prism)
fig.update_layout(
    title=dict(text='History of Monthly Avg.Price (Bitcoin prices, USD)', font=dict(size=20), yref='paper'),
    yaxis = dict(tickfont = dict(size=10)),
    xaxis = dict(tickfont = dict(size=7))
)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b-%Y"
)
fig.show()

TypeError: Only valid with DatetimeIndex, TimedeltaIndex or PeriodIndex, but got an instance of 'Index'

In [9]:
df['Date'] = pd.to_datetime(df['Date'])
df

Unnamed: 0,Date,Price,Transaction_count,Unique_addresses,Total_fees_USD
0,2022-01-01,47155.476562,186834.0,446959.0,4.391292e+05
1,2022-01-02,47257.339844,196865.0,475472.0,4.376038e+05
2,2022-01-03,46817.292969,234793.0,641264.0,5.418452e+05
3,2022-01-04,46417.824219,274544.0,665948.0,6.167535e+05
4,2022-01-05,45880.234375,272155.0,701822.0,7.250674e+05
...,...,...,...,...,...
817,2024-03-28,70390.375000,428670.0,686127.0,2.126428e+06
818,2024-03-29,70047.835938,449421.0,579308.0,2.052532e+06
819,2024-03-30,69993.375000,490072.0,603660.0,2.091845e+06
820,2024-03-31,70440.375000,319675.0,440038.0,1.976385e+06


In [10]:
df.dtypes

Date                 datetime64[ns]
Price                       float64
Transaction_count           float64
Unique_addresses            float64
Total_fees_USD              float64
dtype: object

In [11]:
dfg = df[['Date', 'Price']]
 # Set index
dfg = dfg.set_index('Date')
# Resample by month end date, avg value  of observations per day
dfg_agg = dfg.resample(rule= 'M').mean()

fig = px.line(dfg_agg, x=dfg_agg.index, y="Price",
                 width=900, height=400,
                color_discrete_sequence=px.colors.qualitative.Prism)
fig.update_layout(
    title=dict(text='History of Monthly Avg.Price (Bitcoin prices, USD)', font=dict(size=20), yref='paper'),
    yaxis = dict(tickfont = dict(size=10)),
    xaxis = dict(tickfont = dict(size=7))
)
fig.update_xaxes(
    dtick="M1",
    tickformat="%b-%Y"
)
fig.show()

In [21]:
fig = px.histogram (dfg_agg, x=dfg_agg.index, y="Price",
                   histfunc="avg",
                   title="Histogram of Average Close Price, USD",
                  color_discrete_sequence=px.colors.qualitative.Prism)
fig.update_traces(xbins_size="M1")
fig.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig.update_layout(bargap=0.1)
fig.add_trace(go.Scatter(mode="markers", x=dfg_agg.index, y=dfg_agg["Price"], name="monthly"))
fig.show()

In [13]:
df.columns


Index(['Date', 'Price', 'Transaction_count', 'Unique_addresses',
       'Total_fees_USD'],
      dtype='object')

In [26]:
dfg2 = df[['Date', 'Transaction_count']]
 # Set index
dfg2 = dfg2.set_index('Date')
# Resample by month end date, avg value  of observations per day
dfg2_agg = dfg2.resample(rule= 'M').mean()

dfg3 = df[['Date', 'Total_fees_USD']]
 # Set index
dfg3 = dfg3.set_index('Date')
# Resample by month end date, avg value  of observations per day
dfg3_agg = dfg3.resample(rule= 'M').mean()

In [41]:
fig1 = px.histogram(dfg_agg, x=dfg_agg.index, y="Price",histfunc="avg")
fig1.update_traces(xbins_size="M1")
fig1.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig1.update_layout(bargap=0.1)
fig1.add_trace(go.Scatter(mode="markers", x=dfg_agg.index, y=dfg_agg["Price"], name="monthly"))


fig2 = px.histogram(dfg2_agg, x=dfg2_agg.index, y="Transaction_count",histfunc="avg")
fig2.update_traces(xbins_size="M1")
fig2.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig2.update_layout(bargap=0.1)
fig2.add_trace(go.Scatter(mode="markers", x=dfg2_agg.index, y=dfg2_agg["Transaction_count"], name="monthly"))

fig3 = px.histogram(dfg3_agg, x=dfg3_agg.index, y="Total_fees_USD",histfunc="avg")
fig3.update_traces(xbins_size="M1")
fig3.update_xaxes(showgrid=True, ticklabelmode="period", dtick="M1", tickformat="%b\n%Y")
fig3.update_layout(bargap=0.1)
fig3.add_trace(go.Scatter(mode="markers", x=dfg3_agg.index, y=dfg3_agg["Total_fees_USD"], name="monthly"))

#I need to combine these 3 graphs into 1.. How do i do that?
