# CryptoProphet
## Notebook's Goal
> To test the hypothesis that influencers' tweets have an effect on the price of crypto, we take the tweets and try to solve a classification problem using XGBoost. For this approach we experimented with two parameters: the number of classes we want to predict and the time horizon of the future price prediction. In this notebook 5 classes are used, namely **'STONG_DECREASE', 'DECREASE', 'NEUTRAL', 'INCREASE', 'STRONG_INCREASE'** (spelled exactly as the labels generated by the code below), and the price is predicted **1 hour** after the tweet was published.



In [1]:
# imports custom packages

from src.paths import LOCAL_RAW_DATA_PATH, LOCAL_PROCESSED_DATA_PATH, LOCAL_MODELS_PATH

# imports official packages
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
# `tqdm._tqdm_notebook` is deprecated (tqdm emits a warning asking for
# `tqdm.notebook` instead), so import from the supported module.
from tqdm.notebook import tqdm_notebook
from collections import OrderedDict
from xgboost import plot_importance
import matplotlib.pyplot as plt 
from scipy.stats import skew
import missingno as msno
import pandas as pd
import xgboost
import pickle


# registers tqdm's progress-bar helpers (e.g. `progress_apply`) with pandas
tqdm_notebook.pandas()

# single seed reused for every randomised step in this notebook
seed = 42

# loads the pre-processed tweet dataset as a pandas dataframe
df_path = LOCAL_PROCESSED_DATA_PATH / 'pretrain_dataset_20211013.pkl'
df = pd.read_pickle(df_path)
# every column whose name contains '_change_' is treated as a target variable
tgt_vars = [c for c in df.columns if '_change_' in str(c)]

# some columns are unnecessary as model inputs, so they are excluded from the
# feature set together with all target variables

drop_cols = ['created_at', 'created_at_trunc_h', 'id_str',
             'full_text', 'user_screen_name',
             'ma_120_periods', 'ma_720_periods'] + tgt_vars
X_cols = [c for c in df.columns if c not in drop_cols]

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if __name__ == '__main__':


In [2]:
# Bin the target column into 5 equal-frequency quantile intervals; every
# interval becomes one class of the classification problem.
df['close_change_2periods_perc_5Q'] = pd.qcut(df['close_change_2periods_perc'], q=5)

# Class names ordered from the lowest to the highest quantile.
scale = ['STONG_DECREASE', 'DECREASE', 'NEUTRAL', 'INCREASE', 'STRONG_INCREASE']

# Pair each interval (sorted ascending) with its class name plus a `_Q<n>` suffix.
ordered_bins = df['close_change_2periods_perc_5Q'].unique().sort_values()
cc2pp5Q_to_hr = {
    interval: f'{scale[rank]}_Q{rank + 1}'
    for rank, interval in enumerate(ordered_bins)
}
cc2pp5Q_to_hr

{Interval(-0.136, -0.00702, closed='right'): 'STONG_DECREASE_Q1',
 Interval(-0.00702, -0.00158, closed='right'): 'DECREASE_Q2',
 Interval(-0.00158, 0.00193, closed='right'): 'NEUTRAL_Q3',
 Interval(0.00193, 0.00773, closed='right'): 'INCREASE_Q4',
 Interval(0.00773, 0.146, closed='right'): 'STRONG_INCREASE_Q5'}

In [3]:
# Translate every quantile interval into its human-readable class label
# (e.g. 'NEUTRAL_Q3'); this text column is the classification target.
quantile_bins = df['close_change_2periods_perc_5Q']
df['close_change_2periods_perc_5Q_HR'] = quantile_bins.map(cc2pp5Q_to_hr)

In [4]:

# Feature matrix (all non-dropped columns) and the human-readable class target.
X, y = df[X_cols], df['close_change_2periods_perc_5Q_HR']

In [5]:
# Sanity check: count missing values in the features and in the target.
missing_in_features = X.isna().sum().sum()
missing_in_target = y.isna().sum()
missing_in_features, missing_in_target

(0, 0)

In [6]:
# Hold out 20% of the rows as the test set; the shared seed fixes the split.
split_parts = train_test_split(X, y, test_size=0.20, random_state=seed)
X_train, X_test, y_train, y_test = split_parts
# A further train/validation split is currently disabled:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=seed)

In [7]:
# Sanity check: print the train/test shapes for features and target, then
# confirm that the two partitions together cover every row of the dataframe.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)
len(df) == len(X_train) + len(X_test)

(73656, 779) (18415, 779)
(73656,) (18415,)


True

In [8]:
# xgboost.XGBClassifier
# xgboost.XGBRFClassifier

In [9]:
# Global XGBoost configuration: RMM disabled, verbosity level 1.
xgboost.set_config(use_rmm=False, verbosity=1)

In [None]:
%time

model = xgboost.XGBRFClassifier(**{
    'colsample_bytree': 0.8,
    'gamma': 0,
    'learning_rate': 0.01,
    'max_depth': 5,
    'min_child_weight': 1.5,
    'n_estimators': 1200,
    'reg_alpha': 0.75,
    'reg_lambda': 0.45,
    'seed': seed,
    'subsample': 0.9
}) 


model.fit(X_train,y_train)

Wall time: 0 ns




In [None]:
# Persist the fitted classifier as a pickle file in the models directory.
model_path = LOCAL_MODELS_PATH / 'xgboost_20211014_XGBRFClassifier.pkl'
with open(model_path, mode='wb') as fp:
    pickle.dump(model, fp)

In [None]:
# marker printed once training and model export have completed
print('finish!')

In [None]:
# Accuracy over the complete dataset. X / y still contain the training rows,
# so this is not an out-of-sample estimate.
from sklearn.metrics import accuracy_score
predicted_labels = model.predict(X)
accuracy_score(y, predicted_labels)

In [None]:
# plots the results on the held-out test set as a confusion matrix to evaluate the model
# NOTE(review): `plot_confusion_matrix` is deprecated since scikit-learn 1.0 and
# removed in 1.2; on newer versions use `ConfusionMatrixDisplay.from_estimator`.
from sklearn.metrics import plot_confusion_matrix
plot_confusion_matrix(model, X_test, y_test)  

In [None]:
# confusion matrix on the full dataset (train and test rows together)
plot_confusion_matrix(model, X, y)  

In [None]:
# confusion matrix on the training set only
plot_confusion_matrix(model, X_train, y_train)  

In [None]:
import pickle

In [None]:
# Load the previously pickled user label encoder; it is used further down to
# turn `user_feat` codes back into screen names.
# NOTE: unpickling executes arbitrary code, so only load files you trust.
encoder_path = LOCAL_MODELS_PATH / 'user_label_encoder.pkl'
with open(encoder_path, 'rb') as fp:
    user_le = pickle.load(fp)

In [None]:
#define plot function
def plot_stats(model, X, y, test_name=None):
    """Print the accuracy of ``model`` on ``(X, y)`` and plot its confusion matrix.

    Args:
        model: Fitted classifier exposing ``score``.
        X: Feature rows to evaluate.
        y: True labels belonging to ``X``.
        test_name: Label prefixed to the printed score line.
    """
    # For scikit-learn style classifiers `score` is the mean accuracy, not R^2,
    # so the printed label says "accuracy".
    score = model.score(X, y)
    print(f"{test_name} - accuracy: ", score)

    plot_confusion_matrix(model, X, y)

In [None]:
# Build a per-influencer accuracy table, best-scoring users first.
# Users are taken from the training split, but each score is computed on all
# of that user's rows in the full dataset (X / y).
user_rows = []
for user_label in X_train.user_feat.unique():
    user_mask = X.user_feat == user_label
    user_rows.append({
        'user_label': user_label,
        'user_screen_name': user_le.inverse_transform([user_label])[0],
        'accuracy': model.score(X[user_mask], y[user_mask]),
        'tweet_count': user_mask.sum(),
    })
user2score = pd.DataFrame(user_rows).sort_values('accuracy', ascending=False)

In [None]:
# display the per-influencer accuracy table
user2score

In [None]:
# plot of prediction vs ground truth for each influencer separately
for u in X_train.user_feat.unique():
    mask = X_train.user_feat == u
    mask_test = X_test.user_feat == u
    user_screen_name = user_le.inverse_transform([u])[0]
    print(f"{user_screen_name} - {mask.sum() + mask_test.sum()} tweets")

    try:
        # Label each plot with the split it is actually computed on
        # (previously the train split was labelled "full data" and the
        # test split "train data").
        plot_stats(model, X_train[mask], y_train[mask], test_name=f"{user_screen_name} - train data")
        plot_stats(model, X_test[mask_test], y_test[mask_test], test_name=f"{user_screen_name} - test data")
    except Exception as e:
        # best effort: report the failure for this user and continue with the next
        print(f'error {e}')
        
    print(20*'=')


In [None]:
# 10-fold cross-validation on the training set using the estimator's default score.
scores = cross_val_score(model, X_train, y_train, cv=10)
# The result is stored in `scores`; printing the undefined name `score`
# raised a NameError.
print("Cross Val Score: ", scores)

# Same evaluation with shuffled folds; seeding the shuffle keeps the fold
# assignment reproducible between runs.
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
kf_cv_scores = cross_val_score(model, X_train, y_train, cv=kfold)
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

In [None]:
# parameters = {
#     'colsample_bytree': [0.3, 0.8], #0.3 to 0.8
#     'gamma': [0], # 0, 1, 5                 
#     'learning_rate': [0.01, 0.1], # 0.1 and 0.01 #
#     'max_depth': [1,5],
#     'min_child_weight': [1.5],
#     'n_estimators': [1200], #  80-200 if the size of data is high (of the order of millions), 800-1200 is if it is medium-low                                                                    
#     'reg_alpha': [0.75],
#     'reg_lambda': [0.45],
#     'subsample': [0.9,2], #  0.8 and 1
#     'seed': [seed]
# }
# gs = GridSearchCV(xgbr, parameters)
# gs.fit(X_train,y_train)
# y_pred = gs.predict(X_train)


In [None]:
# model = xgboost.XGBRegressor(
#                  colsample_bytree=0.5, #0.3 to 0.8
#                  gamma=0, # 0, 1, 5                 
#                  learning_rate=0.07, # 0.1 and 0.01 #
#                  max_depth=3,
#                  min_child_weight=1.5,
#                  n_estimators=1200, #  80-200 if the size of data is high (of the order of millions), 800-1200 is if it is medium-low                                                                    
#                  reg_alpha=0.75,
#                  reg_lambda=0.45,
#                  subsample=0.8, #  0.8 and 1
#                  seed=seed) 

# model.fit(X_train,y_train)

# score = model.score(X_train, y_train)  
# print("Training score: ", score)

# y_pred = model.predict(X_train)
# x_ax = range(len(y_train))
# plt.plot(x_ax, y_train, label="original")
# plt.plot(x_ax, y_pred, label="predicted")
# plt.title("XGBoostRegressor test and predicted data")
# plt.legend()
# plt.show()


# Conclusion
> Overall the model contains too much noise; however, some influencers show promising results. Most of the results are over 30%, so the model performs better than a random classifier (which would reach about 20% on five equally sized classes).
> (e.g. PeterMcCormack with an accuracy of 0.487963).
> Other combinations of classes/prediction time perform better.
>