# CryptoProphet
## Notebook's Goal
> Train an XGBoost regression model and assess its results

In [1]:
# imports custom packages
from src.paths import LOCAL_RAW_DATA_PATH, LOCAL_PROCESSED_DATA_PATH, LOCAL_MODELS_PATH
from src.visualization.stats import plot_stats_regressor

# imports official packages
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from tqdm._tqdm_notebook import tqdm_notebook
from collections import OrderedDict
from xgboost import plot_importance
import matplotlib.pyplot as plt 
from scipy.stats import skew
import missingno as msno
import pandas as pd
import xgboost
import pickle

# hooks tqdm's progress bars into pandas
tqdm_notebook.pandas()

# single seed reused by the train/test split and by the model
seed = 42

# loads the pre-processed training dataset
df_path = LOCAL_PROCESSED_DATA_PATH / 'pretrain_dataset_20211013_ta.pkl'
df = pd.read_pickle(df_path)

# target variables: every column whose name contains '_change_'
tgt_vars = [col for col in df.columns if '_change_' in str(col)]

# columns kept out of the feature matrix, plus all target variables
drop_cols = [
    'created_at',
    'created_at_trunc_h',
    'id_str',
    'full_text',
    'user_screen_name',
    'ma_120_periods',
    'ma_720_periods',
] + tgt_vars

# feature columns, in their original dataframe order
X_cols = [col for col in df.columns if col not in drop_cols]

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if __name__ == '__main__':


In [5]:
# shows the first 40 column names
# (presumably the text-embedding columns come after these — confirm against the full column list)
df.columns[:40]

Index([                         'created_at',
                        'created_at_trunc_h',
                                    'id_str',
                                 'full_text',
                             'retweet_count',
                            'favorite_count',
                          'user_screen_name',
                                 'user_feat',
                                       'BTC',
                                      'DOGE',
                                       'ETH',
                                       'LTC',
                                      'USDT',
                                       'ADA',
                                    'volume',
                                     'close',
                            'ma_120_periods',
                            'ma_720_periods',
                'close_change_1periods_perc',
                'close_change_2periods_perc',
                'close_change_6periods_perc',
               'close_change_12per

In [6]:
# shows the first rows of the dataset (head() defaults to 5 rows)
df.head()

Unnamed: 0,created_at,created_at_trunc_h,id_str,full_text,retweet_count,favorite_count,user_screen_name,user_feat,BTC,DOGE,...,758,759,760,761,762,763,764,765,766,767
67894,2019-06-24 00:48:17,2019-06-24 00:00:00,1142957583340380160,@roysebag @mikejcasey https://t.co/HAdE9hivHb,1.0,5.0,BarrySilbert,3,0,0,...,0.211892,0.090323,-0.033169,-0.074099,0.049208,0.008833,0.092621,-0.013973,-0.091649,-0.009876
109535,2019-06-24 01:45:59,2019-06-24 01:00:00,1142972105182601216,@SoroushG_ @hasufl Perhaps expanding to non-ec...,0.0,4.0,FEhrsam,14,0,0,...,0.159535,0.046459,-0.129009,-0.05173,0.037167,0.01258,0.112651,-0.005474,0.065391,0.004129
89071,2019-06-24 03:39:16,2019-06-24 03:00:00,1143000611329388544,@NPO_SPS_2013 @Emurgo_Japan 投稿ありがとうございました！是非、 ...,1.0,30.0,Cardano,5,0,0,...,-0.211426,-0.008754,0.002088,-0.10703,0.079004,0.251885,0.013369,-0.043283,0.211008,0.042472
89070,2019-06-24 04:04:05,2019-06-24 04:00:00,1143006856761266176,"On June 22, #CardanoFoundation attended the fi...",50.0,235.0,Cardano,5,0,0,...,-0.063294,0.127559,0.099624,-0.001857,-0.06896,0.112617,-0.000934,0.009449,0.031449,0.055679
89069,2019-06-24 04:23:13,2019-06-24 04:00:00,1143011672560427008,ご注意ください：ダイダロスのサポートをすると言ってデータを抜き取り#ADA を盗まれるという...,42.0,123.0,Cardano,5,0,0,...,-0.196956,-0.077403,0.03381,-0.18143,0.158272,0.118863,-0.029405,-0.13752,0.256907,-0.011696


In [7]:
# defines the feature matrix X (all non-dropped columns) and the target y
# (the 'close_change_2periods_perc' column)
X, y = df[X_cols], df['close_change_2periods_perc']

In [8]:
# sanity check on null values: total NaN cells in X, and NaN entries in y
# NOTE(review): the recorded output is non-zero for both, and no later cell
# visibly drops or imputes these rows — confirm that the model and the metrics
# tolerate NaN targets, or filter those rows before splitting.
X.isna().sum().sum(), y.isna().sum()

(80, 4)

In [9]:
# splits dataset into train and test (80% / 20%, reproducible through `seed`)
# NOTE(review): this is a random split; rows are timestamped tweets and several
# tweets can share the same `created_at_trunc_h` bucket, so presumably near-identical
# targets can land on both sides of the split — consider a chronological split; confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

In [10]:
# sanity check on resulting shapes: features first, then targets
print(f"{X_train.shape} {X_test.shape}")
print(f"{y_train.shape} {y_test.shape}")
# both partitions together must account for every row of the dataset
len(df) == len(X_train) + len(X_test)

(73656, 785) (18415, 785)
(73656,) (18415,)


True

In [None]:
# inits and fits model
xgboost.set_config(use_rmm=False, verbosity=1)

# hyper-parameters are passed as plain keyword arguments;
# `seed` makes the subsampling reproducible
model = xgboost.XGBRegressor(
    n_estimators=1200,
    learning_rate=0.01,
    max_depth=5,
    min_child_weight=1.5,
    gamma=0,
    subsample=0.9,
    colsample_bytree=0.8,
    reg_alpha=0.75,
    reg_lambda=0.45,
    seed=seed,
)

# kept as the last expression so the fitted estimator is displayed
model.fit(X_train, y_train)

In [None]:
# exports model: pickles the fitted regressor under LOCAL_MODELS_PATH
with open(LOCAL_MODELS_PATH / 'xgboost_20211013_ta.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# plots results on the training partition and on the held-out testing partition
# (plot_stats_regressor is the project helper imported from src.visualization.stats)
plot_stats_regressor(model, X_train, y_train, test_name='training')
plot_stats_regressor(model, X_test, y_test, test_name='testing')

In [None]:
# plots results broken down per influencer
#
# `user_feat` is the encoded influencer id used as a model feature. The readable
# handle is looked up from the `user_screen_name` column of `df`, so this cell
# runs on a fresh kernel without needing a fitted label encoder left in memory.
# Assumes each `user_feat` value maps to a single screen name — TODO confirm.
user_names = dict(zip(df['user_feat'], df['user_screen_name']))

for u in X_train.user_feat.unique():
    mask = X_train.user_feat == u
    mask_test = X_test.user_feat == u
    user_screen_name = user_names[u]
    print(f"{user_screen_name} - {mask.sum() + mask_test.sum()} tweets")

    try:
        # titles name the partition actually plotted (train subset, then test subset)
        plot_stats_regressor(model, X_train[mask], y_train[mask], test_name=f"{user_screen_name} - train data")
        plot_stats_regressor(model, X_test[mask_test], y_test[mask_test], test_name=f"{user_screen_name} - test data")
    except Exception as e:
        # best-effort: an influencer whose plots fail (e.g. too few tweets in a
        # partition) is reported and skipped instead of aborting the whole loop
        print(f'error {e}')

    print(20*'=')

# Conclusion
> Overall, the model is too noisy, but some influencers show promising results
>
> (e.g. PeterMcCormack, with an R² score of ~0.37).