# BTC price prediction

In [11]:
# Importing required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [4]:
# Synthetic stand-in for real Bitcoin prices (no actual market data is loaded here).
# The series is a random walk: the running sum of standard-normal daily steps,
# shifted up by 20000. Seeding the global NumPy RNG keeps it identical across runs.
np.random.seed(42)
dates = pd.date_range('2016-01-01', '2024-01-01', freq='D')  # one timestamp per calendar day
prices = np.random.randn(len(dates)).cumsum() + 20000
btc_data = pd.DataFrame({'Date': dates}).assign(Price=prices)

In [5]:
np.random.seed(42)  # resets the global NumPy RNG; nothing else in this cell draws from it

# Feature engineering: split every timestamp into its calendar components
# (day of month, month number, year) and store each as its own column.
for column_name in ('Day', 'Month', 'Year'):
    btc_data[column_name] = getattr(btc_data['Date'].dt, column_name.lower())

# Predictors are the three calendar columns; the target is the price.
X = btc_data[['Day', 'Month', 'Year']]
y = btc_data['Price']

In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
# train_test_split shuffles by default, so this split is random rather than chronological.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Candidate regressors to compare, keyed by the display name used in the result tables.
# Every estimator that accepts a seed is pinned to 42 so repeated runs give the same scores.
models = {
    'Linear Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'Random Forest': RandomForestRegressor(random_state=42),
    # `use_label_encoder` was dropped: it is a deprecated classifier-only option that
    # has no effect on a regressor and only produces "parameter not used" warnings.
    'XGBoost': XGBRegressor(random_state=42, eval_metric='rmse'),
    # verbose=0 silences CatBoost's per-iteration training log.
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# Cross Validation

#### test_score: 
* neg_mean_squared_error: The mean test score from cross-validation, specifically the negative mean squared error (MSE). A more negative value indicates a higher error, while a closer-to-zero value indicates a better fit.

In [12]:
results = {}


def mean_std_cross_val_scores(model, X_train, y_train, **kwargs):
    """
    Summarise cross-validation results as "mean (+/- std)" strings.

    Runs ``cross_validate`` and collapses the per-fold values of every
    returned column (for example ``fit_time``, ``score_time``,
    ``test_score`` and, when requested, ``train_score``) into a single
    formatted string.

    Parameters
    ----------
    model :
        scikit-learn compatible estimator
    X_train : numpy array or pandas DataFrame
        Feature matrix handed to ``cross_validate``
    y_train :
        Target values handed to ``cross_validate``
    **kwargs :
        Forwarded unchanged to ``cross_validate``
        (e.g. ``cv``, ``scoring``, ``return_train_score``)

    Returns
    ----------
        pandas Series of strings, indexed by the ``cross_validate`` column
        names; each entry reads "mean (+/- std)" with three decimals, where
        std is the sample standard deviation across folds (pandas default).
    """
    # Build the per-fold table once and derive both statistics from it.
    fold_scores = pd.DataFrame(cross_validate(model, X_train, y_train, **kwargs))
    mean_scores = fold_scores.mean()
    std_scores = fold_scores.std()

    # Both Series share the table's column order, so zipping pairs them correctly.
    formatted = [
        f"{mean:0.3f} (+/- {std:0.3f})"
        for mean, std in zip(mean_scores, std_scores)
    ]

    return pd.Series(data=formatted, index=mean_scores.index)

## function inspired by CPSC_320, UBC

In [16]:
# Baseline: 5-fold cross-validation of every untuned model, scored by negative MSE.
# Train scores are requested too, so the train/test gap is visible in the summary.
# NOTE(review): the folds are taken from the full X / y rather than X_train / y_train,
# so rows held out as X_test / y_test take part in these scores - confirm this is intended.
baseline_cv_options = {
    'cv': 5,
    'scoring': 'neg_mean_squared_error',
    'return_train_score': True,
}
initial_cv_results = {}
for model_name, model in models.items():
    initial_cv_results[model_name] = mean_std_cross_val_scores(model, X, y, **baseline_cv_options)

In [17]:
# One row per model, one column per cross-validation statistic.
pd.DataFrame(initial_cv_results).transpose()

Unnamed: 0,fit_time,score_time,test_score,train_score
Linear Regression,0.005 (+/- 0.003),0.006 (+/- 0.007),-304.855 (+/- 184.973),-154.449 (+/- 27.643)
KNN,0.004 (+/- 0.001),0.006 (+/- 0.001),-263.333 (+/- 145.705),-25.762 (+/- 5.507)
Random Forest,0.509 (+/- 0.063),0.012 (+/- 0.001),-192.783 (+/- 162.436),-0.109 (+/- 0.002)
XGBoost,0.132 (+/- 0.047),0.004 (+/- 0.002),-195.474 (+/- 143.235),-0.327 (+/- 0.022)
CatBoost,0.826 (+/- 0.156),0.010 (+/- 0.012),-238.233 (+/- 164.107),-1.144 (+/- 0.077)


* Most of the models are overfitting: their train scores are far closer to zero than their cross-validated test scores.
* RandomForest shows the most promise.

# Hyperparameter Tuning

In [26]:
# Random Forest hyperparameter tuning
# Exhaustive grid (3 x 3 x 3 = 27 combinations), 5-fold CV on the training split.
rf_params = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]}
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), rf_params, cv=5, scoring='neg_mean_squared_error')
rf_grid_search.fit(X_train, y_train)

# XGBoost hyperparameter tuning
# Exhaustive grid (3 x 3 x 3 x 2 = 54 combinations), 5-fold CV on the training split.
# `use_label_encoder` was dropped: it is a deprecated classifier-only option that has
# no effect on a regressor and would only emit a warning on every fit.
xgb_params = {'n_estimators': [50, 100, 200], 'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.1, 0.3], 'subsample': [0.8, 1.0]}
xgb_grid_search = GridSearchCV(XGBRegressor(random_state=42), xgb_params, cv=5, scoring='neg_mean_squared_error')
xgb_grid_search.fit(X_train, y_train)

# CatBoost hyperparameter tuning
# Randomized search: 10 of the 27 possible combinations are sampled (seeded for repeatability).
cat_params = {'iterations': [100, 200, 500], 'learning_rate': [0.01, 0.1, 0.3], 'depth': [3, 5, 7]}
cat_random_search = RandomizedSearchCV(CatBoostRegressor(verbose=0, random_state=42), cat_params, cv=5, n_iter=10, scoring='neg_mean_squared_error', random_state=42)
cat_random_search.fit(X_train, y_train)


In [27]:
# Models to re-evaluate after hyperparameter tuning, keyed by display name.
# Each entry is the best estimator found by the corresponding search; if that search
# object does not exist in the namespace (its cell was not run), an untuned model
# with the same seed is used instead.
tuned_models = {
    'Tuned Random Forest': (
        rf_grid_search.best_estimator_
        if 'rf_grid_search' in locals()
        else RandomForestRegressor(random_state=42)
    ),
    # The fallback no longer passes `use_label_encoder`: it is a deprecated
    # classifier-only option that has no effect on a regressor.
    'Tuned XGBoost': (
        xgb_grid_search.best_estimator_
        if 'xgb_grid_search' in locals()
        else XGBRegressor(random_state=42, eval_metric='rmse')
    ),
    'Tuned CatBoost': (
        cat_random_search.best_estimator_
        if 'cat_random_search' in locals()
        else CatBoostRegressor(verbose=0, random_state=42)
    ),
}

In [28]:
# Re-run the same 5-fold, negative-MSE cross-validation on the tuned models
# (train scores are not requested this time).
# NOTE(review): as in the baseline run, the folds come from the full X / y, while the
# hyperparameters were searched on X_train / y_train - confirm this is intended.
tuned_cv_options = {'cv': 5, 'scoring': 'neg_mean_squared_error'}
tuned_cv_results = {}
for model_name, model in tuned_models.items():
    tuned_cv_results[model_name] = mean_std_cross_val_scores(model, X, y, **tuned_cv_options)

In [29]:
# One row per tuned model, one column per cross-validation statistic.
pd.DataFrame(tuned_cv_results).transpose()

Unnamed: 0,fit_time,score_time,test_score
Tuned Random Forest,0.998 (+/- 0.132),0.024 (+/- 0.005),-192.629 (+/- 162.379)
Tuned XGBoost,0.334 (+/- 0.028),0.003 (+/- 0.000),-190.861 (+/- 146.037)
Tuned CatBoost,0.257 (+/- 0.015),0.002 (+/- 0.000),-177.520 (+/- 124.487)


# Next Steps:
* Given that our cross-validated test scores are still poor (large negative MSE values) even after hyperparameter tuning, we will treat the problem as a time-series problem and model it as such.