In [None]:
% matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import MySQLdb as mdb
import sys
import sqlalchemy as sql
import seaborn as sns
import sklearn
from   sklearn import preprocessing 
from   sklearn.decomposition import PCA
from   sklearn.feature_selection import SelectKBest
from   sklearn.feature_selection import chi2

In [None]:
# Connect directly to the MySQL DB.
# Credentials are read from the environment so real values never have to be committed with
# the notebook; the placeholder defaults are only used when a variable is unset.
import os

conn = mdb.connect(
    host=os.environ.get('MCM_DB_HOST', 'xxxxxx'),
    user=os.environ.get('MCM_DB_USER', 'xxxx'),
    password=os.environ.get('MCM_DB_PASSWORD', 'xxxxxxxxxxxx'),
    db='mcm_practicum')

# xrp: one row per price record in xrpbtc, joined to tweet aggregates.
# The subquery rounds each tweet_date UP to the next 30-minute boundary and aggregates per bucket;
# the join matches a price row to the bucket that ends 30 minutes before its close_timestamp.
xrp= pd.read_sql(
'SELECT t1.coin , t1.close_price ,t1.close_timestamp as close_time'
', tim as tweet_time , num_tweets , fav_percent , retweet_percent , avg_senti , min_senti , max_senti '
'FROM mcm_practicum.xrpbtc as t1 '
'INNER JOIN ('
'select tim, num_tweets , case when favs = 0 then 0 else favs/num_tweets end as fav_percent'
', case when retweet = 0 then 0 else retweet/num_tweets end as retweet_percent'
', retweet'
',avg_senti'
',max_senti'
',min_senti '
'from ( '
'SELECT FROM_UNIXTIME(CEIL(UNIX_TIMESTAMP(tweet_date)/1800)*1800) as tim  /* rounding to nearest thirty minutes for aggregations */ '
', count(distinct concat(id,tweet_date)) as num_tweets'
', sum(favorites) as favs'
', sum(retweets)  as retweet'
', avg(sentiment) as avg_senti'
', MIN(sentiment) min_senti'
', MAX(sentiment) max_senti '
'FROM mcm_practicum.xrp_tweets '
'GROUP BY tim)st1 '
')t2 '
'on t2.tim = date_sub(t1.close_timestamp, interval 30 minute) /* stagger data so sentiment occurred at least thirty minutes before price */'
';', conn)

In [None]:
# Order the rows chronologically by close timestamp so shift()/rolling() later see time order.
xrp = xrp.sort_values(by='close_time')

In [None]:
# Replace the index left over from sorting with a fresh 0..n-1 index (old index discarded).
xrp = xrp.reset_index(drop=True)

In [None]:
# (rows, columns) of the joined price/tweet frame.
xrp.shape

In [None]:
# Thumbnail (2x2 inch) line chart of close price over close time, dark-grid theme.
sns.set_style('darkgrid')
fig, ax = plt.subplots(figsize=(2, 2))
ax.plot(xrp.close_time, xrp.close_price, color='#e74c3c', label='Ripple', linewidth=0.5)
ax.set_title('Price in Bitcoin', fontsize=9)
ax.set_xlabel('Date', fontsize=9)
ax.legend(loc='best', fontsize=9)
# Shrink the tick labels on both axes.
plt.setp(ax.get_xticklabels(), fontsize=8)
plt.setp(ax.get_yticklabels(), fontsize=8)
# Rotate/align the date labels so they do not overlap.
fig.autofmt_xdate()

In [None]:
# Report-sized (5x4 inch) version of the close-price chart on a white theme.
sns.set_style('white')
fig, ax = plt.subplots(figsize=(5, 4))
ax.plot(xrp.close_time, xrp.close_price, color='#e74c3c', linewidth=1.5)
ax.set_title('Ripple Price', fontsize=14)
ax.set_xlabel('Date', fontsize=13)
ax.set_ylabel('Price in Bitcoin', fontsize=13)
plt.setp(ax.get_xticklabels(), fontsize=11)
plt.setp(ax.get_yticklabels(), fontsize=11)
# Rotate/align the date labels so they do not overlap.
fig.autofmt_xdate()

In [None]:
# Print the earliest, then the latest, close timestamp to see the period the data covers.
for stamp in (xrp.close_time.min(), xrp.close_time.max()):
    print(stamp)

In [None]:
# Column labels available after the load.
xrp.columns

In [None]:
# Log return per row: ln(price_t / price_{t-1}).
# The first row has no predecessor, so its value is NaN.
prev_close = xrp.close_price.shift(1)
xrp["close_log_return"] = np.log(xrp.close_price.div(prev_close))


In [None]:
# Simple (raw) return per row: price_t / price_{t-1} - 1; NaN on the first row.
xrp["close_raw_return"] = xrp.close_price.div(xrp.close_price.shift(1)).sub(1)

In [None]:
# Mean raw return across all rows (pandas skips the NaN first row by default).
print(xrp.close_raw_return.mean())

In [None]:
# Raw return through time.
# The line carries a label so plt.legend() has an artist to show (an unlabelled plot makes
# the legend call emit a "no handles" warning and draw nothing), and the figure is titled
# and labelled so it stands on its own.
plt.figure(figsize=(10,8))
plt.plot(xrp.close_time, xrp.close_raw_return, color='#e74c3c', label='Ripple raw return')
plt.title('Ripple Raw Return')
plt.xlabel('Date')
plt.ylabel('Raw return')
plt.legend(loc='best')
plt.gcf().autofmt_xdate()

In [None]:
# Binary target: 1 when the raw return is strictly positive, otherwise 0.
# A NaN return compares False and therefore also maps to 0.
xrp['target'] = np.where(xrp['close_raw_return'] > 0, 1, 0)

In [None]:
# Eyeball the return next to the derived target to confirm the labelling rule.
print(xrp[['close_raw_return', 'target']])

In [None]:
# Number of positive-target rows, then the number of non-null target rows.
print(xrp.target.sum(), xrp.target.count())

In [None]:
# refine dataframes: keep the identifier, target, timestamps, raw return and tweet features
xrp = xrp[['coin','target','close_time','close_raw_return','tweet_time', 'num_tweets', 'fav_percent', 'retweet_percent', 'avg_senti', 'max_senti' , 'min_senti']]

# drop NA (this is the first row, which can't calculate close_return as there is no previous
# close_price to calculate from); note dropna() also removes any other row containing a NaN
xrp = xrp.dropna()
# reset the index; drop must be the boolean True -- the string 'True' only worked because any
# non-empty string is truthy (so 'False' would have dropped the index as well)
xrp = xrp.reset_index(drop=True)

# cast to float
xrp = xrp.astype({'max_senti': float, 'min_senti': float,
                  'fav_percent': float, 'retweet_percent': float})

In [None]:
# Confirm the column dtypes after the float casts.
xrp.dtypes

In [None]:
# Scatter of average sentiment against raw return.
# DataFrame.plot.scatter opens its own figure unless it is handed an Axes, so the Axes is
# created first and passed in; otherwise the 10x8 figure stays empty and the scatter is drawn
# at the default size. The legend call (no labelled artists) and the date formatting
# (both axes are numeric) did nothing useful and are dropped.
fig, ax = plt.subplots(figsize=(10, 8))
xrp.plot.scatter(x='close_raw_return', y='avg_senti', ax=ax)
ax.set_title('Average Sentiment vs Raw Return')

In [None]:
# Column labels after the refine step.
xrp.columns

In [None]:
# Pin the column set and order used from here on.
analysis_cols = ['coin', 'target', 'close_time', 'close_raw_return', 'tweet_time',
                 'num_tweets', 'fav_percent', 'retweet_percent',
                 'avg_senti', 'max_senti', 'min_senti']
xrp = xrp[analysis_cols]

In [None]:
# Sanity check: raw returns next to their 4-row rolling mean shifted down one row,
# i.e. the mean of the four rows preceding each row.
raw_ret = xrp.close_raw_return
lagged_mean_4 = raw_ret.rolling(window=4, min_periods=4).mean().shift(1)
print(raw_ret.head(10), lagged_mean_4.head(8))

In [None]:
# create additional variables
# Every feature is a rolling mean pushed down one row with shift(1), so the value stored on
# row t is computed only from rows strictly before t.
# The hour labels assume consecutive rows are 30 minutes apart (4 rows = 2h) -- TODO confirm
# there are no gaps in the joined data.

# based on return
xrp['avg_return_L2h'] = xrp.close_raw_return.rolling(min_periods=4, window=4).mean().shift(1) # avg returns for 2 hours before t where t is current time period
xrp['avg_return_L6h'] = xrp.close_raw_return.rolling(min_periods=12, window=12).mean().shift(1) # avg returns for 6 hours before t where t is current time period
xrp['avg_return_L12h'] = xrp.close_raw_return.rolling(min_periods=24, window=24).mean().shift(1) # avg returns for 12 hours before t where t is current time period

# based on sentiment
# shift(1), not shift(-1): a negative shift moves the NEXT row's window (which contains the
# sentiment of rows t and t+1) onto row t, leaking future information into the feature and
# contradicting the "before t" definition used by every other lag feature here.
xrp['avg_senti_L2h'] = xrp.avg_senti.rolling(min_periods=4, window=4).mean().shift(1) # avg sentiment for 2 hours before t where t is current time period
xrp['avg_senti_L6h'] = xrp.avg_senti.rolling(min_periods=12, window=12).mean().shift(1) # avg sentiment for 6 hours before t where t is current time period
xrp['avg_senti_L12h'] = xrp.avg_senti.rolling(min_periods=24, window=24).mean().shift(1) # avg sentiment for 12 hours before t where t is current time period

In [None]:
# First rows of the 2h lag features beside their inputs; the leading lag values are NaN
# until the rolling window has enough prior rows.
xrp[['close_time' , 'target', 'close_raw_return' ,'avg_return_L2h', 'avg_senti' , 'avg_senti_L2h']].head(7)

In [None]:
# Column labels including the newly added lag features.
xrp.columns

In [None]:
# Share of rows whose target is 1 (positive return) in the full data set.
prevalence_all = xrp.target.sum() / xrp.target.count()
print("Prevalance in Data Set:", prevalence_all, "")

In [None]:
# Display the modelling view of the frame: identifiers, target and every candidate feature
# (close_raw_return itself is left out).
xrp[['coin', 'target', 'close_time', 'tweet_time',
       'num_tweets', 'fav_percent', 'retweet_percent', 'avg_senti',
       'max_senti', 'min_senti', 'avg_return_L2h', 'avg_return_L6h',
       'avg_return_L12h', 'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h']]

# Training and Testing

In [None]:
# train/ test split: a seeded random 70% of rows for training, the remaining rows for testing.
# NOTE(review): rows are sampled at random rather than split by time, so training rows can
# post-date test rows -- confirm this is intended for time-ordered data.
model_cols = ['target', 'num_tweets', 'fav_percent', 'retweet_percent',
              'avg_senti', 'max_senti', 'min_senti',
              'avg_return_L2h', 'avg_return_L6h', 'avg_return_L12h',
              'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h']

xrp_train = xrp.sample(frac=0.7, random_state=200)
xrp_test = xrp.drop(xrp_train.index)

# Keep only the target and the feature columns in both partitions.
xrp_train = xrp_train[model_cols]
xrp_test = xrp_test[model_cols]

In [None]:
# Share of positive targets in each partition, to check the split kept the class balance.
for set_name, subset in (("Training", xrp_train), ("Testing", xrp_test)):
    print("Prevalance in %s Set:" % set_name, subset.target.sum() / subset.target.count(), "")

## Check for NaN

In [None]:
# Total count of NaN cells (all columns) in each partition.
nan_train = xrp_train.isnull().sum().sum()
nan_test = xrp_test.isnull().sum().sum()
print("Number of NaN in Training:", nan_train, ", Number of NaN in Testing:", nan_test)

In [None]:
# Plain-text dump of the training partition (still contains the NaN rows at this point).
print(xrp_train)

In [None]:
# Drop every row that has a NaN in any column. These are rows whose rolling-window features
# could not be computed because the window did not contain enough rows
# (e.g. avg_senti_L2h needs four rows).
xrp_train, xrp_test = xrp_train.dropna(), xrp_test.dropna()
# Both counts should now print as 0.
print("Number of NaN in Training:", xrp_train.isnull().sum().sum(),
      ", Number of NaN in Testing:", xrp_test.isnull().sum().sum())

In [None]:
# Give both partitions a fresh 0..n-1 index so that frames built later from plain arrays
# (scaled values, PCA scores) can be merged back on the index.
xrp_train, xrp_test = (part.reset_index(drop=True) for part in (xrp_train, xrp_test))

### More Feature Creation

In [None]:
# 90th percentile of num_tweets in the training partition.
xrp_train.num_tweets.quantile(.9)

In [None]:
#top decile
# Flag rows whose num_tweets is strictly above the TRAINING 90th percentile. The same training
# cut-off is applied to the test partition, as test data is for validation only.
# The cut-off is computed once (the row-by-row loop recomputed the quantile on every
# iteration) and the comparison is vectorised.
decile_cutoff = xrp_train.num_tweets.quantile(.90)

# Kept as plain 0/1 lists under their original names in case later cells read them.
quant90tr = (xrp_train.num_tweets > decile_cutoff).astype(int).tolist()
quant90tst = (xrp_test.num_tweets > decile_cutoff).astype(int).tolist()

# Append the flag as the last column, under its final name.
xrp_train = xrp_train.assign(top_decile_num_tweets=quant90tr)
xrp_test = xrp_test.assign(top_decile_num_tweets=quant90tst)

In [None]:
#top quartile
# Flag rows whose num_tweets is strictly above the TRAINING 75th percentile. The same training
# cut-off is applied to the test partition, as test data is for validation only.
# The cut-off is computed once (the row-by-row loop recomputed the quantile on every
# iteration) and the comparison is vectorised.
quartile_cutoff = xrp_train.num_tweets.quantile(.75)

# Kept as plain 0/1 lists under their original names in case later cells read them.
quant75tr = (xrp_train.num_tweets > quartile_cutoff).astype(int).tolist()
quant75tst = (xrp_test.num_tweets > quartile_cutoff).astype(int).tolist()

# Append the flag as the last column, under its final name.
xrp_train = xrp_train.assign(top_quartile_num_tweets=quant75tr)
xrp_test = xrp_test.assign(top_quartile_num_tweets=quant75tst)

In [None]:
# Assign the final column labels positionally to both partitions; the last two positions
# are the top-decile and top-quartile tweet-volume flags.
final_cols = ['target', 'num_tweets', 'fav_percent', 'retweet_percent', 'avg_senti',
              'max_senti', 'min_senti',
              'avg_return_L2h', 'avg_return_L6h', 'avg_return_L12h',
              'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h',
              'top_decile_num_tweets', 'top_quartile_num_tweets']
xrp_train.columns = list(final_cols)
xrp_test.columns = list(final_cols)
print(xrp_train, xrp_test)

In [None]:
# Number of rows flagged as top-decile tweet volume in each partition.
print('top Decile tweets in train:', xrp_train.top_decile_num_tweets.sum(),
      'top Decile tweets in test:', xrp_test.top_decile_num_tweets.sum())

### Scale Variables

In [None]:
# Fit a min-max scaler on the training features (every column except the target) and keep
# the result as a DataFrame with the same labels and index.
# Index.difference returns the labels sorted, so the feature columns end up in
# alphabetical order.
scaler = preprocessing.MinMaxScaler()

train_feature_names = xrp_train.columns.difference(['target'])
xrp_train1 = xrp_train[train_feature_names]

xrp_scaled_tr = pd.DataFrame(scaler.fit_transform(xrp_train1),
                             columns=train_feature_names,
                             index=xrp_train1.index)
print(xrp_scaled_tr)

In [None]:
# Pairwise scatter/histogram grid of the scaled continuous training features, to show how
# the variables relate to one another (the two 0/1 tweet-volume flags are left out).
pair_cols = ['num_tweets', 'fav_percent', 'retweet_percent',
             'avg_senti', 'max_senti', 'min_senti',
             'avg_return_L2h', 'avg_return_L6h', 'avg_return_L12h',
             'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h']
sns.pairplot(xrp_scaled_tr[pair_cols])
plt.show()

In [None]:
# Scale the test features with the scaler already fitted on the training data (transform
# only, no refit), keeping labels and index.
test_feature_names = xrp_test.columns.difference(['target'])
xrp_test1 = xrp_test[test_feature_names]

xrp_scaled_tst = pd.DataFrame(scaler.transform(xrp_test1),
                              columns=test_feature_names,
                              index=xrp_test1.index)

# Attach the (unscaled) target column to the scaled TRAINING frame, matching rows on the index.
xrp_scaled_tr = xrp_train[['target']].merge(xrp_scaled_tr, left_index=True, right_index=True)
print(xrp_scaled_tst)

### Univariate Analysis

In [None]:
# Count negative cells among the scaled training features; the chi-squared scoring used
# next requires non-negative inputs, so this should display 0.
scaled_feature_cols = ['num_tweets', 'fav_percent', 'retweet_percent',
                       'avg_senti', 'max_senti', 'min_senti',
                       'avg_return_L2h', 'avg_return_L6h', 'avg_return_L12h',
                       'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h',
                       'top_decile_num_tweets', 'top_quartile_num_tweets']
(xrp_scaled_tr[scaled_feature_cols].values < 0).sum()

In [None]:
# Univariate feature scoring: chi-squared statistic of each scaled training feature against
# the target, keeping the 4 highest-scoring features.
univar_feature_names = ['num_tweets', 'fav_percent', 'retweet_percent', 'avg_senti',
       'max_senti', 'min_senti', 'avg_return_L2h', 'avg_return_L6h',
       'avg_return_L12h', 'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h',
       'top_decile_num_tweets', 'top_quartile_num_tweets']
univar_tr_target = xrp_train[['target']].values
univar_tr_data = xrp_scaled_tr[univar_feature_names].values
univar = SelectKBest(score_func=chi2, k=4)
fit_univar = univar.fit(univar_tr_data, univar_tr_target)
# summarize scores (one per feature, in the order of univar_feature_names)
np.set_printoptions(precision=3)
print(fit_univar.scores_)
univar_features = fit_univar.transform(univar_tr_data)
# summarize selected features: print the names of the k features actually kept.
# (Previously this printed every column label of xrp_scaled_tr, target included, which said
# nothing about the selection.)
print([name for name, kept in zip(univar_feature_names, fit_univar.get_support()) if kept])

### Variable Distributions

In [None]:
# One small density-normalised histogram per scaled training feature.
for i in ('avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h','avg_senti', 'avg_senti_L12h', 'avg_senti_L2h', 'avg_senti_L6h',
'fav_percent', 'max_senti', 'min_senti', 'num_tweets','retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets'):
    plt.figure(figsize=(2,2))
    plt.title('XRP %s'%i, fontsize = 9)
    plt.xticks(fontsize = 8)
    plt.yticks(fontsize = 8)
    # density= replaces the removed normed= keyword (same normalisation: bar areas sum to 1)
    plt.hist(xrp_scaled_tr[i], density=True)

### Correlation Matrix

In [None]:
# Spearman rank correlation between the scaled training features, drawn as a heatmap.
# Spearman is used because some variables are not normally distributed.
corr_cols = ['avg_return_L2h', 'avg_return_L6h', 'avg_return_L12h',
             'avg_senti', 'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h',
             'fav_percent', 'max_senti', 'min_senti', 'num_tweets',
             'retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets']
corr = xrp_scaled_tr[corr_cols].corr(method='spearman')

sns.set(font_scale=1.3)
ax = plt.axes()
feature_labels = corr.columns.values
sns.heatmap(corr, xticklabels=feature_labels, yticklabels=feature_labels,
            cmap='Reds', ax=ax)
ax.set_title('Ripple Correlation Matrix', fontsize=18)
plt.show()

In [None]:
# Thumbnail (2x2 inch, blue palette) version of the Spearman correlation heatmap.
# Spearman is used because some variables are not normally distributed.
corr_cols = ['avg_return_L2h', 'avg_return_L6h', 'avg_return_L12h',
             'avg_senti', 'avg_senti_L2h', 'avg_senti_L6h', 'avg_senti_L12h',
             'fav_percent', 'max_senti', 'min_senti', 'num_tweets',
             'retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets']
corr = xrp_scaled_tr[corr_cols].corr(method='spearman')

fig, ax = plt.subplots(figsize=(2, 2))
feature_labels = corr.columns.values
sns.heatmap(corr, xticklabels=feature_labels, yticklabels=feature_labels,
            cmap="Blues", ax=ax)

## PCA

#### Include Sentiment and Tweet Features

In [None]:
# Fit a full PCA on all 14 scaled training features and tabulate the cumulative explained
# variance per number of components (PC = 1, 2, ...).
pca_feature_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h',
                    'avg_senti', 'avg_senti_L12h', 'avg_senti_L2h', 'avg_senti_L6h',
                    'fav_percent', 'max_senti', 'min_senti', 'num_tweets',
                    'retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets']
pca = PCA().fit(xrp_scaled_tr[pca_feature_cols])
PCA_chk = pd.DataFrame({'cum_var': np.cumsum(pca.explained_variance_ratio_)}, columns=['cum_var'])
PCA_chk['PC'] = PCA_chk.index + 1

# Scree-style plot: cumulative explained variance against the number of components.
sns.set_style('white')
plt.figure(figsize=(5,4))
plt.title('Ripple Scree Plot', fontsize = 14)
# One tick per component count.
component_ticks = np.arange(PCA_chk['PC'].min(), PCA_chk['PC'].max() + 1, 1.0)
plt.xticks(component_ticks, fontsize = 13)
plt.yticks(fontsize = 13)
plt.plot(PCA_chk['PC'], PCA_chk['cum_var'], color = '#e74c3c', linewidth = 2.5)
plt.ylim(ymin=0)
plt.xlabel('No. of Components', fontsize = 13)
plt.ylabel('Cum. Explained Variance', fontsize = 13)

In [None]:
# The elbow appears to be at 5 principal components.
# Save the cumulative-variance table to CSV in the current working directory.
PCA_chk.to_csv('XRP PCA_rule_variant.csv')

#### Fit PCA to Training & Apply to Training & Test Set

In [None]:
# Refit PCA keeping 5 components (the author notes these account for 91% of variance),
# on the training features only.
pca_feature_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h',
                    'avg_senti', 'avg_senti_L12h', 'avg_senti_L2h', 'avg_senti_L6h',
                    'fav_percent', 'max_senti', 'min_senti', 'num_tweets',
                    'retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets']
pca = PCA(n_components=5)
pca.fit(xrp_scaled_tr[pca_feature_cols])

In [None]:
# Project the scaled training and test features onto the fitted components.
# The resulting frames have integer column labels (0, 1, ...) and a fresh 0..n-1 index.
pca_feature_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h',
                    'avg_senti', 'avg_senti_L12h', 'avg_senti_L2h', 'avg_senti_L6h',
                    'fav_percent', 'max_senti', 'min_senti', 'num_tweets',
                    'retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets']
train_scores = pca.transform(xrp_scaled_tr[pca_feature_cols])
test_scores = pca.transform(xrp_scaled_tst[pca_feature_cols])
xrp_train1 = pd.DataFrame(train_scores)
xrp_test1 = pd.DataFrame(test_scores)

In [None]:
# Relabel the integer component columns 0, 1, ... as 'PC1', 'PC2', ... (position + 1).
xrp_train1 = xrp_train1.rename(columns=lambda pos: 'PC%s' % (pos + 1))

In [None]:
# Relabel the integer component columns 0, 1, ... as 'PC1', 'PC2', ... (position + 1).
xrp_test1 = xrp_test1.rename(columns=lambda pos: 'PC%s' % (pos + 1))

In [None]:
# Pairwise plots of the training principal-component scores.
sns.pairplot(xrp_train1)
plt.show()

In [None]:
# Put the target column next to the principal-component scores, matching rows on the index.
xrp_trainPCA = xrp_train[['target']].merge(xrp_train1, left_index=True, right_index=True)
xrp_testPCA = xrp_test[['target']].merge(xrp_test1, left_index=True, right_index=True)

#### Exclude Sentiment and Tweet Features

In [None]:
# Same cumulative-variance check using only the three lagged-return features
# (no sentiment or tweet-volume inputs).
return_feature_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']
pca = PCA().fit(xrp_scaled_tr[return_feature_cols])
PCA_chk = pd.DataFrame({'cum_var': np.cumsum(pca.explained_variance_ratio_)}, columns=['cum_var'])
PCA_chk['PC'] = PCA_chk.index + 1

# Cumulative explained variance against the number of components.
plt.plot(PCA_chk['PC'], PCA_chk['cum_var'], c='#4286f4')
plt.ylim(ymin=0)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

In [None]:
# The elbow appears to be at 2 principal components.
# Save the cumulative-variance table to CSV in the current working directory.
PCA_chk.to_csv('XRP PCA_rule_variant_no_tweet_data.csv')

#### Fit PCA to Training & Apply to Training & Test Set

In [None]:
# Refit PCA keeping 2 components (the author notes these account for 90% of variance),
# on the three lagged-return training features.
return_feature_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']
pca = PCA(n_components=2)
pca.fit(xrp_scaled_tr[return_feature_cols])

In [None]:
# Project the lagged-return features of both partitions onto the fitted components.
return_feature_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']
xrp_train_no_tweet_1 = pd.DataFrame(pca.transform(xrp_scaled_tr[return_feature_cols]))
xrp_test_no_tweet_1 = pd.DataFrame(pca.transform(xrp_scaled_tst[return_feature_cols]))

In [None]:
# Relabel the integer component columns 0, 1, ... as 'PC1', 'PC2', ... (position + 1).
xrp_train_no_tweet_1 = xrp_train_no_tweet_1.rename(columns=lambda pos: 'PC%s' % (pos + 1))

In [None]:
# Relabel the integer component columns 0, 1, ... as 'PC1', 'PC2', ... (position + 1).
xrp_test_no_tweet_1 = xrp_test_no_tweet_1.rename(columns=lambda pos: 'PC%s' % (pos + 1))

In [None]:
# Pairwise plots of the training principal-component scores built from returns only.
sns.pairplot(xrp_train_no_tweet_1)
plt.show()

In [None]:
# Put the target column next to the returns-only component scores, matching rows on the index.
xrp_train_no_tweet_PCA = xrp_train[['target']].merge(xrp_train_no_tweet_1, left_index=True, right_index=True)
xrp_test_no_tweet_PCA = xrp_test[['target']].merge(xrp_test_no_tweet_1, left_index=True, right_index=True)

# Machine Learning

In [None]:
# Modelling imports.
from sklearn import metrics
# cross_val_score lives in sklearn.model_selection; the old sklearn.cross_validation module
# was removed from scikit-learn, so importing from it fails on current releases.
from sklearn.model_selection import cross_val_score
from sklearn import linear_model as lm 
from sklearn.tree import DecisionTreeClassifier as dt
from sklearn.ensemble import RandomForestClassifier as rf
from sklearn import svm 
from sklearn import naive_bayes as nb
from patsy import dmatrices
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFECV
from sklearn import calibration

In [None]:
# Plain-text dump of both training PCA frames (with and without tweet-derived inputs).
print(xrp_trainPCA)
print(xrp_train_no_tweet_PCA)

In [None]:
# Total NaN cells in each test PCA frame (with tweets first, then without).
for pca_frame in (xrp_testPCA, xrp_test_no_tweet_PCA):
    print(pca_frame.isnull().sum().sum())

In [None]:
# Drop any row that still has a NaN in either partition. Such rows come from rolling-window
# features (e.g. avg_senti_L12h) whose window did not contain enough rows.
xrp_train, xrp_test = xrp_train.dropna(), xrp_test.dropna()
print("Number of NaN in Training:", xrp_train.isnull().sum().sum(),
      ", Number of NaN in Testing:", xrp_test.isnull().sum().sum())

In [None]:
# Column labels of the two training PCA frames.
print(xrp_trainPCA.columns)
print(xrp_train_no_tweet_PCA.columns)

In [None]:
# Empty frame whose column labels list the model inputs; list(data_cols) yields those labels.
data_cols = pd.DataFrame(columns = ['PC1', 'PC2', 'PC3', 'PC4' , 'PC5'])  #columns for modelling
# Convert train/test to NumPy arrays for modelling.
# DataFrame.as_matrix() has been removed from pandas; select the columns and take .values.
# This also replaces the nested list ([list(data_cols)]) that was passed as the column
# selection with a flat list of labels.
xrp_data_train = xrp_trainPCA[list(data_cols)].values
# 1-D label vector
xrp_target_train = xrp_trainPCA['target'].values

xrp_data_test = xrp_testPCA[list(data_cols)].values
xrp_target_test = xrp_testPCA['target'].values

In [None]:
# Empty frame whose column labels list the returns-only model inputs.
data_cols_no_tweet = pd.DataFrame(columns = ['PC1', 'PC2'])  #columns for modelling
# Convert train/test to NumPy arrays for modelling.
# DataFrame.as_matrix() has been removed from pandas; select the columns and take .values
# (with a flat label list instead of the nested list that was passed before).
xrp_data_no_tweet_train = xrp_train_no_tweet_PCA[list(data_cols_no_tweet)].values

xrp_data_no_tweet_test = xrp_test_no_tweet_PCA[list(data_cols_no_tweet)].values

In [None]:
# Share of positive labels in the training target vector (sum of labels / number of labels).
pd.DataFrame(xrp_target_train).sum() / pd.DataFrame(xrp_target_train).count()

In [None]:
# List the column labels of the scaled training frame as a plain Python list.
#xrp_scaled_tr.columns
xrp_scaled_tr.columns.tolist()

In [None]:
#tweets
# Reduce the scaled frames to the 14 feature columns. For the training frame this drops the
# 'target' column that was merged in earlier.
tweet_feature_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h',
       'avg_senti', 'avg_senti_L12h', 'avg_senti_L2h', 'avg_senti_L6h',
       'fav_percent', 'max_senti', 'min_senti', 'num_tweets',
       'retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets']

xrp_scaled_tr = xrp_scaled_tr[tweet_feature_cols]

# The training frame used to be re-assigned a second time here with the identical selection;
# the second statement was evidently meant for the test frame, so train and test are now
# both pinned to the same columns in the same order.
xrp_scaled_tst = xrp_scaled_tst[tweet_feature_cols]

# Keep only the component columns (drops 'target') in the PCA frames.
xrp_trainPCA = xrp_trainPCA[['PC1' , 'PC2', 'PC3' ,'PC4', 'PC5']]
xrp_testPCA = xrp_testPCA[['PC1' , 'PC2', 'PC3' ,'PC4', 'PC5']]


#no tweets
# Returns-only views: just the three lagged-return features.
xrp_scaled_no_tweet_tr = xrp_scaled_tr[['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']]

xrp_scaled_no_tweet_tst = xrp_scaled_tst[['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']]

xrp_train_no_tweet_PCA = xrp_train_no_tweet_PCA[['PC1' , 'PC2']]
xrp_test_no_tweet_PCA = xrp_test_no_tweet_PCA[['PC1' , 'PC2']]

## Parameter & Feature Selection

### Logistic Regression: Parameter and Feature Investigation

#### With Tweet Data

In [None]:
# Grid search for logistic regression over two training inputs (scaled raw features, then PCA
# scores), five solvers and five values of C. For every combination, RFECV (dropping one
# feature per step, scored by ROC AUC) picks the features, and a report is printed.
# All reported metrics are computed on the TRAINING data that the model was fitted on.
for X in (xrp_scaled_tr ,xrp_trainPCA):
     for solv in ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'):
            for c in (.001, 0.1, 1, 10, 100):
                #Recursive Feature Elimination
                model_RFE_lr = lm.LogisticRegression(C = c,solver = solv,class_weight = 'balanced')
                # create the RFECV model
                rfe_lr = RFECV(model_RFE_lr, step=1, scoring='roc_auc')   #'roc_auc' 
                rfe_lr = rfe_lr.fit(X, xrp_target_train)
                # support_: boolean keep-mask per feature; ranking_: rank per feature
                result_lr = rfe_lr.support_
                rank_lr = rfe_lr.ranking_
                RFE_cols_lr = pd.DataFrame(result_lr, columns = ['Keep'])
                RFE_rank_lr = pd.DataFrame(rank_lr, columns = ['Rank'])
                data_cols_df = pd.DataFrame(X.columns.tolist(), columns =['Variable'])
                # Join the three single-column frames on their row position ('indexs') to get
                # one table of Variable / Keep / Rank, then drop the helper key.
                RFE_cols_lr['indexs'] = RFE_cols_lr.index
                RFE_rank_lr['indexs'] = RFE_rank_lr.index
                data_cols_df['indexs'] = data_cols_df.index
                keep_vars_lr = pd.merge(data_cols_df, RFE_cols_lr , on=['indexs'])
                keep_vars_lr = pd.merge(keep_vars_lr, RFE_rank_lr , on=['indexs'])
                keep_vars_lr = keep_vars_lr.drop('indexs', axis=1)
                # Flattened confusion matrix; not used further in this cell.
                matr=confusion_matrix(xrp_target_train, rfe_lr.predict(X)).ravel()
                # Prints: parameters, training ROC AUC, feature table, classification report.
                print("C=",c,"Algorithm=",solv, "ROC:" , metrics.roc_auc_score(xrp_target_train, rfe_lr.predict_proba(X)[:, 1]), keep_vars_lr[:25],metrics.classification_report(xrp_target_train, rfe_lr.predict(X)))

#### Without Tweet Data

In [None]:
# Same logistic-regression grid search as the with-tweet variant, run on the returns-only
# inputs (scaled lagged returns, then their PCA scores).
# All reported metrics are computed on the TRAINING data that the model was fitted on.
for X in (xrp_scaled_no_tweet_tr ,xrp_train_no_tweet_PCA):
     for solv in ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'):
            for c in (.001, 0.1, 1, 10, 100):
                #Recursive Feature Elimination
                model_RFE_lr = lm.LogisticRegression(C = c,solver = solv,class_weight = 'balanced')
                # create the RFECV model
                rfe_lr = RFECV(model_RFE_lr, step=1, scoring='roc_auc')   #'roc_auc' 
                rfe_lr = rfe_lr.fit(X, xrp_target_train)
                # support_: boolean keep-mask per feature; ranking_: rank per feature
                result_lr = rfe_lr.support_
                rank_lr = rfe_lr.ranking_
                RFE_cols_lr = pd.DataFrame(result_lr, columns = ['Keep'])
                RFE_rank_lr = pd.DataFrame(rank_lr, columns = ['Rank'])
                data_cols_df = pd.DataFrame(X.columns.tolist(), columns =['Variable'])
                # Join the three single-column frames on their row position ('indexs') to get
                # one table of Variable / Keep / Rank, then drop the helper key.
                RFE_cols_lr['indexs'] = RFE_cols_lr.index
                RFE_rank_lr['indexs'] = RFE_rank_lr.index
                data_cols_df['indexs'] = data_cols_df.index
                keep_vars_lr = pd.merge(data_cols_df, RFE_cols_lr , on=['indexs'])
                keep_vars_lr = pd.merge(keep_vars_lr, RFE_rank_lr , on=['indexs'])
                keep_vars_lr = keep_vars_lr.drop('indexs', axis=1)
                # Flattened confusion matrix; not used further in this cell.
                matr=confusion_matrix(xrp_target_train, rfe_lr.predict(X)).ravel()
                # Prints: parameters, training ROC AUC, feature table, classification report.
                print("C=",c,"Algorithm=",solv, "ROC:" , metrics.roc_auc_score(xrp_target_train, rfe_lr.predict_proba(X)[:, 1]), keep_vars_lr[:25],metrics.classification_report(xrp_target_train, rfe_lr.predict(X)))

### Logistic Regression Training & Test Set

#### With Tweet Data

In [None]:
# Logistic-regression inputs, with tweet data.
# Scaled raw features: a hand-picked subset -- presumably the features kept by the RFECV
# search for the chosen configuration; verify against that output.
lr_raw_features = ['avg_return_L12h', 'avg_return_L6h', 'avg_senti_L12h', 'min_senti',
                   'num_tweets', 'top_decile_num_tweets', 'top_quartile_num_tweets']
xrp_data_lr_tr2 = xrp_scaled_tr[lr_raw_features]
xrp_data_lr_tst2 = xrp_scaled_tst[lr_raw_features]

# PCA scores: all five components.
lr_pca_features = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']
xrp_data_lr_tr3 = xrp_trainPCA[lr_pca_features]
xrp_data_lr_tst3 = xrp_testPCA[lr_pca_features]

#### Without Tweet Data

In [None]:
# Logistic-regression inputs, without tweet data.
# Scaled raw features: only the 12-hour lagged return.
lr_no_tweet_raw = ['avg_return_L12h']
xrp_data_lr_no_tweet_tr2 = xrp_scaled_no_tweet_tr[lr_no_tweet_raw]
xrp_data_lr_no_tweet_tst2 = xrp_scaled_no_tweet_tst[lr_no_tweet_raw]

# PCA scores: both components.
lr_no_tweet_pca = ['PC1', 'PC2']
xrp_data_lr_no_tweet_tr3 = xrp_train_no_tweet_PCA[lr_no_tweet_pca]
xrp_data_lr_no_tweet_tst3 = xrp_test_no_tweet_PCA[lr_no_tweet_pca]

### Decision Tree: Parameter and Feature Investigation

#### With Tweet Data

In [None]:
# Grid search for a decision tree over two training inputs (scaled raw features, then PCA
# scores), both split criteria, five depths, five min_samples_split fractions and three
# min_samples_leaf fractions. For every combination, RFECV (one feature per step, scored by
# ROC AUC) picks the features, and a report is printed.
# All reported metrics are computed on the TRAINING data that the model was fitted on.
for X in (xrp_scaled_tr, xrp_trainPCA ):
    for c in ('gini' , 'entropy'):
            for depth in (1,5,10,15,20):
                for split in (.1,.2,.3,.4,.5):
                    for leaf in (.1,.2,.3):
                        #Recursive Feature Elimination
                        model_RFE_dt = dt(criterion = c,class_weight = 'balanced'
                                           , max_depth=depth, min_samples_split=split
                                           , min_samples_leaf=leaf)
                        # create the RFECV model
                        rfe_dt = RFECV(model_RFE_dt, step=1, scoring='roc_auc')   #'roc_auc' 
                        rfe_dt = rfe_dt.fit(X, xrp_target_train)
                        # support_: boolean keep-mask per feature; ranking_: rank per feature
                        result_dt = rfe_dt.support_
                        rank_dt = rfe_dt.ranking_
                        RFE_cols_dt = pd.DataFrame(result_dt, columns = ['Keep'])
                        RFE_rank_dt = pd.DataFrame(rank_dt, columns = ['Rank'])
                        data_cols_df = pd.DataFrame(X.columns.tolist(), columns =['Variable'])
                        # Join on row position ('indexs') into one Variable / Keep / Rank table.
                        RFE_cols_dt['indexs'] = RFE_cols_dt.index
                        RFE_rank_dt['indexs'] = RFE_rank_dt.index
                        data_cols_df['indexs'] = data_cols_df.index
                        keep_vars_dt = pd.merge(data_cols_df, RFE_cols_dt , on=['indexs'])
                        keep_vars_dt = pd.merge(keep_vars_dt, RFE_rank_dt , on=['indexs'])
                        keep_vars_dt = keep_vars_dt.drop('indexs', axis=1)
                        # Flattened confusion matrix; not used further in this cell.
                        matr=confusion_matrix(xrp_target_train, rfe_dt.predict(X)).ravel()
                        # Prints: parameters, training ROC AUC, feature table, classification report.
                        print("Criterion=",c, "Max Depth=",depth,"Min Samples Split=",split,"Min Samples Leaf=",leaf, "ROC:" , metrics.roc_auc_score(xrp_target_train, rfe_dt.predict_proba(X)[:, 1]), keep_vars_dt[:25],metrics.classification_report(xrp_target_train, rfe_dt.predict(X)))

#### Without Tweet Data

In [None]:
# Decision-tree grid search on the returns-only inputs (scaled lagged returns, then their PCA
# scores). Unlike the with-tweet search, min_samples_leaf is fixed at 0.1 rather than varied.
# All reported metrics are computed on the TRAINING data that the model was fitted on.
for X in (xrp_scaled_no_tweet_tr ,xrp_train_no_tweet_PCA):
    for c in ('gini' , 'entropy'):
            for depth in (1,5,10,15,20):
                for split in (.1,.2,.3,.4,.5):
                    model_RFE_dt = dt(criterion = c,class_weight = 'balanced'
                                       , max_depth=depth, min_samples_split=split
                                       , min_samples_leaf=0.1)
                    # create the RFECV model
                    rfe_dt = RFECV(model_RFE_dt, step=1, scoring='roc_auc')   #'roc_auc' 
                    rfe_dt = rfe_dt.fit(X, xrp_target_train)
                    # support_: boolean keep-mask per feature; ranking_: rank per feature
                    result_dt = rfe_dt.support_
                    rank_dt = rfe_dt.ranking_
                    RFE_cols_dt = pd.DataFrame(result_dt, columns = ['Keep'])
                    RFE_rank_dt = pd.DataFrame(rank_dt, columns = ['Rank'])
                    data_cols_df = pd.DataFrame(X.columns.tolist(), columns =['Variable'])
                    # Join on row position ('indexs') into one Variable / Keep / Rank table.
                    RFE_cols_dt['indexs'] = RFE_cols_dt.index
                    RFE_rank_dt['indexs'] = RFE_rank_dt.index
                    data_cols_df['indexs'] = data_cols_df.index
                    keep_vars_dt = pd.merge(data_cols_df, RFE_cols_dt , on=['indexs'])
                    keep_vars_dt = pd.merge(keep_vars_dt, RFE_rank_dt , on=['indexs'])
                    keep_vars_dt = keep_vars_dt.drop('indexs', axis=1)
                    # Flattened confusion matrix; not used further in this cell.
                    matr=confusion_matrix(xrp_target_train, rfe_dt.predict(X)).ravel()
                    # Prints: parameters, training ROC AUC, feature table, classification report.
                    print("Criterion=",c, "Max Depth=",depth,"Min Samples Split=",split, "ROC:" , metrics.roc_auc_score(xrp_target_train, rfe_dt.predict_proba(X)[:, 1]), keep_vars_dt[:25],metrics.classification_report(xrp_target_train, rfe_dt.predict(X)))

### Decision Tree Training & Test Set

#### With Tweet Data

In [None]:
# Decision-tree inputs, with tweet data.
# Scaled raw features: every feature except avg_senti_L12h -- presumably the set kept by the
# RFECV search for the chosen configuration; verify against that output.
dt_raw_features = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h', 'avg_senti',
                   'avg_senti_L2h', 'avg_senti_L6h', 'fav_percent', 'max_senti', 'min_senti',
                   'num_tweets', 'retweet_percent',
                   'top_decile_num_tweets', 'top_quartile_num_tweets']
xrp_data_dt_tr2 = xrp_scaled_tr[dt_raw_features]
xrp_data_dt_tst2 = xrp_scaled_tst[dt_raw_features]

# PCA scores: components 2 and 4 only.
dt_pca_features = ['PC2', 'PC4']
xrp_data_dt_tr3 = xrp_trainPCA[dt_pca_features]
xrp_data_dt_tst3 = xrp_testPCA[dt_pca_features]

#### Without Tweet Data

In [None]:
# Decision-tree inputs, without tweet data.
# Scaled raw features: all three lagged returns.
dt_no_tweet_raw = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']
xrp_data_dt_no_tweet_tr2 = xrp_scaled_no_tweet_tr[dt_no_tweet_raw]
xrp_data_dt_no_tweet_tst2 = xrp_scaled_no_tweet_tst[dt_no_tweet_raw]

# PCA scores: first component only.
dt_no_tweet_pca = ['PC1']
xrp_data_dt_no_tweet_tr3 = xrp_train_no_tweet_PCA[dt_no_tweet_pca]
xrp_data_dt_no_tweet_tst3 = xrp_test_no_tweet_PCA[dt_no_tweet_pca]

### SVM: Parameter and Feature Investigation

#### With Tweet Data

In [None]:
# Grid search for an SVM classifier over three training inputs (all scaled features, the PCA
# scores, and a three-feature subset), four kernels, five values of C and four values of
# gamma. No feature elimination is done here: each model is fitted on all columns of X.
# The printed ROC AUC is computed on the TRAINING data that the model was fitted on.
for X in (xrp_scaled_tr,xrp_trainPCA, xrp_scaled_tr[['avg_senti_L12h','min_senti','top_decile_num_tweets']]):
    for kern in ('rbf','linear', 'poly', 'sigmoid'):
        for c in (.001, 0.1, 1, 10, 100):
            for g in (0.001, 0.01, 0.1, 1):
                #All variables
                rfe_svm = svm.SVC(probability = True,kernel = kern,C= c,class_weight = 'balanced', gamma = g)
                rfe_svm = rfe_svm.fit(X, xrp_target_train)                
                # Flattened confusion matrix; not used further in this cell.
                matr=confusion_matrix(xrp_target_train, rfe_svm.predict(X)).ravel()
                # "Data=" shows the first row's 2nd-3rd columns, which identifies the input frame.
                print("Data=",X.iloc[0,1:3],"C=",c, "Kernel=",kern,"Gamma=",g,"ROC=",metrics.roc_auc_score(xrp_target_train, rfe_svm.predict_proba(X)[:, 1]))

#### Without Tweet Data

In [None]:
# SVM grid search on the returns-only inputs (scaled lagged returns, then their PCA scores),
# over four kernels, five values of C and four values of gamma.
# The printed ROC AUC is computed on the TRAINING data that the model was fitted on.
for X in (xrp_scaled_no_tweet_tr ,xrp_train_no_tweet_PCA):
    for kern in ('rbf','linear', 'poly', 'sigmoid'):
        for c in (.001, 0.1, 1, 10, 100):
            for g in (0.001, 0.01, 0.1, 1):
                #All variables
                rfe_svm = svm.SVC(probability = True,kernel = kern,C= c,class_weight = 'balanced', gamma = g)
                rfe_svm = rfe_svm.fit(X, xrp_target_train)                
                # Flattened confusion matrix; not used further in this cell.
                matr=confusion_matrix(xrp_target_train, rfe_svm.predict(X)).ravel()
                # "Data=" shows the first row's columns from position 1 on, identifying the input frame.
                print("Data=",X.iloc[0,1:3],"C=",c, "Kernel=",kern,"Gamma=",g,"ROC=",metrics.roc_auc_score(xrp_target_train, rfe_svm.predict_proba(X)[:, 1]))

### Best features for SVM According to RFECV

#### With Tweet Data

In [None]:
# SVM inputs (with tweets), scaled raw data: every column of the scaled frames is kept.
xrp_data_svm_tr2 = xrp_scaled_tr
xrp_data_svm_tst2 = xrp_scaled_tst

# SVM inputs (with tweets), PCA data: the first five components as plain ndarrays.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array
# and works on older pandas releases as well.
xrp_data_svm_tr3 = xrp_trainPCA[['PC1','PC2','PC3','PC4','PC5']].values
xrp_data_svm_tst3 = xrp_testPCA[['PC1','PC2','PC3','PC4','PC5']].values

#### Without Tweet Data

In [None]:
# SVM inputs (no tweets), scaled raw data: the three avg_return_* columns.
svm_no_tweet_raw_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']
xrp_data_svm_no_tweet_tr2 = xrp_scaled_no_tweet_tr[svm_no_tweet_raw_cols]
xrp_data_svm_no_tweet_tst2 = xrp_scaled_no_tweet_tst[svm_no_tweet_raw_cols]

# SVM inputs (no tweets), PCA data: PC1 and PC2.
svm_no_tweet_pca_cols = ['PC1','PC2']
xrp_data_svm_no_tweet_tr3 = xrp_train_no_tweet_PCA[svm_no_tweet_pca_cols]
xrp_data_svm_no_tweet_tst3 = xrp_test_no_tweet_PCA[svm_no_tweet_pca_cols]

### Naive Bayes: Parameter and Feature Investigation

#### With Tweet Data

In [None]:
# Fit a Gaussian Naive Bayes model on each with-tweet feature set and report
# training-set ROC AUC plus the classification report. No RFECV is run here
# (the original note reads: "RFECV doesn't work").
nb_feature_sets = (xrp_scaled_tr, xrp_trainPCA)
for X in nb_feature_sets:
    model_RFE_nb = nb.GaussianNB()
    rfe_nb = model_RFE_nb.fit(X, xrp_target_train)
    nb_train_auc = metrics.roc_auc_score(xrp_target_train, rfe_nb.predict_proba(X)[:, 1])
    nb_train_report = metrics.classification_report(xrp_target_train, rfe_nb.predict(X))
    # X.iloc[0,1:3] (values from the first row) is printed as a tag for the feature set in use.
    print("Data=",X.iloc[0,1:3],"ROC:" , nb_train_auc, nb_train_report)

#### Without Tweet Data

In [None]:
# Fit a Gaussian Naive Bayes model on each no-tweet feature set and report
# training-set ROC AUC plus the classification report. No RFECV is run here
# (the original note reads: "RFECV doesn't work").
nb_no_tweet_feature_sets = (xrp_scaled_no_tweet_tr, xrp_train_no_tweet_PCA)
for X in nb_no_tweet_feature_sets:
    model_RFE_nb = nb.GaussianNB()
    rfe_nb = model_RFE_nb.fit(X, xrp_target_train)
    nb_train_auc = metrics.roc_auc_score(xrp_target_train, rfe_nb.predict_proba(X)[:, 1])
    nb_train_report = metrics.classification_report(xrp_target_train, rfe_nb.predict(X))
    # X.iloc[0,1:3] (values from the first row) is printed as a tag for the feature set in use.
    print("Data=",X.iloc[0,1:3],"ROC:" , nb_train_auc, nb_train_report)

### Best features for Naive Bayes (RFECV could not be applied)

#### With Tweet Data

In [None]:
# Naive Bayes inputs (with tweets): the first five principal components.
nb_pca_cols = ['PC1','PC2','PC3','PC4','PC5']
xrp_data_nb_tr3 = xrp_trainPCA[nb_pca_cols]
xrp_data_nb_tst3 = xrp_testPCA[nb_pca_cols]

#### Without Tweet Data

In [None]:
# Naive Bayes inputs (no tweets): PC1 and PC2.
nb_no_tweet_pca_cols = ['PC1','PC2']
xrp_data_nb_no_tweet_tr3 = xrp_train_no_tweet_PCA[nb_no_tweet_pca_cols]
xrp_data_nb_no_tweet_tst3 = xrp_test_no_tweet_PCA[nb_no_tweet_pca_cols]

### Random Forest: Parameter and Feature Investigation

#### With Tweet Data

In [None]:
# Random-forest sweep (with tweets): for each feature set, bootstrap flag,
# min_samples_split and tree count, run RFECV (scored by ROC AUC) and print
# the per-variable keep/rank table together with training-set metrics.
rf_feature_sets = (xrp_scaled_tr, xrp_trainPCA)
for X in rf_feature_sets:
    for b in (True, False):
        for split in (0.1, 0.2 ,0.3):
            for n_est in (100, 300, 500):
                model_RFE_rf = rf(n_estimators = n_est, bootstrap = b, max_depth = 5,
                                  min_samples_leaf = 0.1, min_samples_split = split,
                                  class_weight = 'balanced', random_state = 42)
                # Recursive feature elimination with cross-validation, one feature per step.
                rfe_rf = RFECV(model_RFE_rf, step=1, scoring='roc_auc')
                rfe_rf = rfe_rf.fit(X, xrp_target_train)
                result_rf = rfe_rf.support_
                rank_rf = rfe_rf.ranking_

                # Three one-column frames, each tagged with its positional index
                # ('indexs') so they can be joined back together row by row.
                data_cols_df = pd.DataFrame({'Variable': X.columns.tolist()})
                RFE_cols_rf = pd.DataFrame({'Keep': result_rf})
                RFE_rank_rf = pd.DataFrame({'Rank': rank_rf})
                data_cols_df['indexs'] = data_cols_df.index
                RFE_cols_rf['indexs'] = RFE_cols_rf.index
                RFE_rank_rf['indexs'] = RFE_rank_rf.index
                keep_vars_rf = data_cols_df.merge(RFE_cols_rf, on=['indexs'])
                keep_vars_rf = keep_vars_rf.merge(RFE_rank_rf, on=['indexs'])
                keep_vars_rf = keep_vars_rf.drop('indexs', axis=1)

                # Training-set diagnostics for the fitted selector.
                matr = confusion_matrix(xrp_target_train, rfe_rf.predict(X)).ravel()
                rf_train_auc = metrics.roc_auc_score(xrp_target_train, rfe_rf.predict_proba(X)[:, 1])
                rf_train_report = metrics.classification_report(xrp_target_train, rfe_rf.predict(X))
                print("Bootstrap=",b,"Max Depth=",5,"Min Samples Split=",split
                      ,"No. of Estimators",n_est,"ROC:" , rf_train_auc
                      ,keep_vars_rf.head(25),rf_train_report)

#### Without Tweet Data

In [None]:
# Random-forest sweep (no tweets): for each feature set, bootstrap flag,
# min_samples_split and tree count, run RFECV (scored by ROC AUC) and print
# the per-variable keep/rank table together with training-set metrics.
rf_no_tweet_feature_sets = (xrp_scaled_no_tweet_tr, xrp_train_no_tweet_PCA)
for X in rf_no_tweet_feature_sets:
    for b in (True, False):
        for split in (0.1, 0.2 ,0.3):
            for n_est in (100, 300, 500):
                model_RFE_rf = rf(n_estimators = n_est, bootstrap = b, max_depth = 5,
                                  min_samples_leaf = 0.1, min_samples_split = split,
                                  class_weight = 'balanced', random_state = 42)
                # Recursive feature elimination with cross-validation, one feature per step.
                rfe_rf = RFECV(model_RFE_rf, step=1, scoring='roc_auc')
                rfe_rf = rfe_rf.fit(X, xrp_target_train)
                result_rf = rfe_rf.support_
                rank_rf = rfe_rf.ranking_

                # Three one-column frames, each tagged with its positional index
                # ('indexs') so they can be joined back together row by row.
                data_cols_df = pd.DataFrame({'Variable': X.columns.tolist()})
                RFE_cols_rf = pd.DataFrame({'Keep': result_rf})
                RFE_rank_rf = pd.DataFrame({'Rank': rank_rf})
                data_cols_df['indexs'] = data_cols_df.index
                RFE_cols_rf['indexs'] = RFE_cols_rf.index
                RFE_rank_rf['indexs'] = RFE_rank_rf.index
                keep_vars_rf = data_cols_df.merge(RFE_cols_rf, on=['indexs'])
                keep_vars_rf = keep_vars_rf.merge(RFE_rank_rf, on=['indexs'])
                keep_vars_rf = keep_vars_rf.drop('indexs', axis=1)

                # Training-set diagnostics for the fitted selector.
                matr = confusion_matrix(xrp_target_train, rfe_rf.predict(X)).ravel()
                rf_train_auc = metrics.roc_auc_score(xrp_target_train, rfe_rf.predict_proba(X)[:, 1])
                rf_train_report = metrics.classification_report(xrp_target_train, rfe_rf.predict(X))
                print("Bootstrap=",b,"Max Depth=",5,"Min Samples Split=",split
                      ,"No. of Estimators",n_est,"ROC:" , rf_train_auc
                      ,keep_vars_rf.head(25),rf_train_report)

### Best features for Random Forest According to RFECV

#### With Tweet Data

In [None]:
# Random-forest inputs (with tweets), scaled raw data: the same 14 columns are
# selected from the train and test frames, so the list is written once.
rf_raw_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h','avg_senti', 'avg_senti_L12h'
               , 'avg_senti_L2h', 'avg_senti_L6h','fav_percent', 'max_senti', 'min_senti', 'num_tweets',
               'retweet_percent', 'top_decile_num_tweets', 'top_quartile_num_tweets']
xrp_data_rf_tr2 = xrp_scaled_tr[rf_raw_cols]
xrp_data_rf_tst2 = xrp_scaled_tst[rf_raw_cols]

# Random-forest inputs (with tweets), PCA data: first five components as ndarrays.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array
# and works on older pandas releases as well.
xrp_data_rf_tr3 = xrp_trainPCA[['PC1','PC2','PC3','PC4','PC5']].values
xrp_data_rf_tst3 = xrp_testPCA[['PC1','PC2','PC3','PC4', 'PC5']].values

#### Without Tweet Data

In [None]:
# Random-forest inputs (no tweets), scaled raw data: the three avg_return_* columns.
rf_no_tweet_raw_cols = ['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h']
xrp_data_rf_no_tweet_tr2 = xrp_scaled_no_tweet_tr[rf_no_tweet_raw_cols]
xrp_data_rf_no_tweet_tst2 = xrp_scaled_no_tweet_tst[rf_no_tweet_raw_cols]

# Random-forest inputs (no tweets), PCA data: PC1 and PC2.
rf_no_tweet_pca_cols = ['PC1','PC2']
xrp_data_rf_no_tweet_tr3 = xrp_train_no_tweet_PCA[rf_no_tweet_pca_cols]
xrp_data_rf_no_tweet_tst3 = xrp_test_no_tweet_PCA[rf_no_tweet_pca_cols]

# Model Build

#### Logistic Regression Model Evaluation on Test Set 

#### With Tweet Data

In [None]:
# Compare three candidate logistic-regression configurations (with tweets) by
# ROC AUC on the training and test sets.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and works on older pandas releases as well.

#model1: PC1-PC5, C=0.001
xrp_data_lr_tr3 = xrp_trainPCA[['PC1','PC2', 'PC3','PC4','PC5']].values
xrp_data_lr_tst3 = xrp_testPCA[['PC1','PC2', 'PC3','PC4','PC5']].values
model_lr = lm.LogisticRegression(C= 0.001, solver='liblinear',class_weight = 'balanced')
model_lr.fit(xrp_data_lr_tr3, xrp_target_train)
pred_model_lr = model_lr.predict(xrp_data_lr_tst3)
pred_probs_model_lr = model_lr.predict_proba(xrp_data_lr_tst3)
print ('ROC TR AUC:     LR1 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_lr.predict_proba(xrp_data_lr_tr3)[:, 1]))
print ('ROC TST AUC:    LR1 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_lr[:, 1]))


#model2: PC2-PC5 (PC1 dropped), C=10
xrp_data_lr_tr3 = xrp_trainPCA[['PC2', 'PC3','PC4','PC5']].values
xrp_data_lr_tst3 = xrp_testPCA[['PC2', 'PC3','PC4','PC5']].values
model_lr = lm.LogisticRegression(C= 10, solver='liblinear',class_weight = 'balanced')
model_lr.fit(xrp_data_lr_tr3, xrp_target_train)
pred_model_lr = model_lr.predict(xrp_data_lr_tst3)
pred_probs_model_lr = model_lr.predict_proba(xrp_data_lr_tst3)
print ('ROC TR AUC:     LR2 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_lr.predict_proba(xrp_data_lr_tr3)[:, 1]))
print ('ROC TST AUC:    LR2 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_lr[:, 1]))


#model3: C=0.1, fit and scored on xrp_data_lr_tr2 / xrp_data_lr_tst2, which are
# expected to have been built by an earlier cell.
# A stray reassignment of xrp_data_lr_tst3 to PC2/PC4/PC5 used to sit here. It was
# never read by this model and had no matching train matrix, so it left
# xrp_data_lr_tr3 (4 columns) and xrp_data_lr_tst3 (3 columns) out of sync.
# NOTE(review): if model3 was meant to use PC2/PC4/PC5, both tr3 and tst3 need to
# be built and used in the four calls below - confirm intent.
model_lr = lm.LogisticRegression(C= 0.1, solver='liblinear',class_weight = 'balanced')
model_lr.fit(xrp_data_lr_tr2, xrp_target_train)
pred_model_lr = model_lr.predict(xrp_data_lr_tst2)
pred_probs_model_lr = model_lr.predict_proba(xrp_data_lr_tst2)
print ('ROC TR AUC:     LR3 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_lr.predict_proba(xrp_data_lr_tr2)[:, 1]))
print ('ROC TST AUC:    LR3 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_lr[:, 1]))

#### Without Tweet Data

In [None]:
# Compare two candidate logistic-regression configurations (no tweets) by
# ROC AUC on the training and test sets.

# Candidate 1: the single scaled column avg_return_L12h, C=0.001.
lr_no_tweet_raw_cols = ['avg_return_L12h']
xrp_data_lr_no_tweet_tr2 = xrp_scaled_no_tweet_tr[lr_no_tweet_raw_cols]
xrp_data_lr_no_tweet_tst2 = xrp_scaled_no_tweet_tst[lr_no_tweet_raw_cols]
model_lr = lm.LogisticRegression(solver='liblinear', class_weight = 'balanced', C = 0.001)
model_lr.fit(xrp_data_lr_no_tweet_tr2, xrp_target_train)
pred_model_lr = model_lr.predict(xrp_data_lr_no_tweet_tst2)
pred_probs_model_lr = model_lr.predict_proba(xrp_data_lr_no_tweet_tst2)
lr1_auc_tr = metrics.roc_auc_score(xrp_target_train, model_lr.predict_proba(xrp_data_lr_no_tweet_tr2)[:, 1])
print ('ROC TR AUC:     LR1 - %0.3f' % lr1_auc_tr)
lr1_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_lr[:, 1])
print ('ROC TST AUC:    LR1 - %0.3f' % lr1_auc_tst)


# Candidate 2: PC1 and PC2 of the no-tweet PCA, C=0.1.
lr_no_tweet_pca_cols = ['PC1','PC2']
xrp_data_lr_no_tweet_tr3 = xrp_train_no_tweet_PCA[lr_no_tweet_pca_cols]
xrp_data_lr_no_tweet_tst3 = xrp_test_no_tweet_PCA[lr_no_tweet_pca_cols]
model_lr = lm.LogisticRegression(solver='liblinear', class_weight = 'balanced', C = 0.1)
model_lr.fit(xrp_data_lr_no_tweet_tr3, xrp_target_train)
pred_model_lr = model_lr.predict(xrp_data_lr_no_tweet_tst3)
pred_probs_model_lr = model_lr.predict_proba(xrp_data_lr_no_tweet_tst3)
lr2_auc_tr = metrics.roc_auc_score(xrp_target_train, model_lr.predict_proba(xrp_data_lr_no_tweet_tr3)[:, 1])
print ('ROC TR AUC:     LR2 - %0.3f' % lr2_auc_tr)
lr2_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_lr[:, 1])
print ('ROC TST AUC:    LR2 - %0.3f' % lr2_auc_tst)

### Best features for Logistic Regression According to Evaluation

#### With Tweet Data

In [None]:
# Final logistic-regression inputs (with tweets): PC2-PC5 as plain ndarrays.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same array
# and works on older pandas releases as well.
xrp_data_lr_tr3 = xrp_trainPCA[['PC2', 'PC3','PC4','PC5']].values
xrp_data_lr_tst3 = xrp_testPCA[['PC2', 'PC3','PC4','PC5']].values

#### Without Tweet Data

In [None]:
# Final logistic-regression inputs (no tweets): PC1 and PC2.
lr_no_tweet_final_cols = ['PC1','PC2']
xrp_data_lr_no_tweet_tr3 = xrp_train_no_tweet_PCA[lr_no_tweet_final_cols]
xrp_data_lr_no_tweet_tst3 = xrp_test_no_tweet_PCA[lr_no_tweet_final_cols]

### Build Logistic Regression

#### Logistic Regression Model Parameters

#### With Tweet Data

In [None]:
# Final with-tweet logistic regression: liblinear solver, C=10, balanced class weights.
lr_params = dict(C = 10, solver = 'liblinear', class_weight = 'balanced')
model_lr = lm.LogisticRegression(**lr_params)
model_lr.fit(xrp_data_lr_tr3, xrp_target_train)

#### Without Tweet Data

In [None]:
# Final no-tweet logistic regression: liblinear solver, C=0.1, balanced class weights.
lr_no_tweet_params = dict(C = 0.1, solver = 'liblinear', class_weight = 'balanced')
model_lr_no_tweet = lm.LogisticRegression(**lr_no_tweet_params)
model_lr_no_tweet.fit(xrp_data_lr_no_tweet_tr3, xrp_target_train)

#### Apply Model to Test Set

#### With Tweet Data

In [None]:
# Score the test set with the with-tweet logistic regression:
# hard class labels first, then the per-class probability matrix.
pred_model_lr, pred_probs_model_lr = (
    model_lr.predict(xrp_data_lr_tst3),
    model_lr.predict_proba(xrp_data_lr_tst3),
)

In [None]:
# Accuracy and ROC AUC of the with-tweet logistic regression on train and test data.
# The test ROC AUC reuses pred_probs_model_lr, which must already be populated.
lr_acc_tr = model_lr.score(xrp_data_lr_tr3, xrp_target_train)
print('Training Accuracy:  LR - %0.3f' % lr_acc_tr)
lr_acc_tst = model_lr.score(xrp_data_lr_tst3, xrp_target_test)
print('Test Accuracy:      LR - %0.3f' % lr_acc_tst)
lr_auc_tr = metrics.roc_auc_score(xrp_target_train, model_lr.predict_proba(xrp_data_lr_tr3)[:, 1])
print ('Training ROC AUC:  LR - %0.3f' % lr_auc_tr)
lr_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_lr[:, 1])
print ('Testing ROC AUC:   LR - %0.3f' % lr_auc_tst)

#### Without Tweet Data

In [None]:
# Score the test set with the no-tweet logistic regression:
# hard class labels first, then the per-class probability matrix.
pred_model_lr_no_tweet, pred_probs_model_lr_no_tweet = (
    model_lr_no_tweet.predict(xrp_data_lr_no_tweet_tst3),
    model_lr_no_tweet.predict_proba(xrp_data_lr_no_tweet_tst3),
)

In [None]:
# Accuracy and ROC AUC of the no-tweet logistic regression on train and test data.
# The test ROC AUC reuses pred_probs_model_lr_no_tweet, which must already be populated.
lr_nt_acc_tr = model_lr_no_tweet.score(xrp_data_lr_no_tweet_tr3, xrp_target_train)
print('Training Accuracy:  LR - %0.3f' % lr_nt_acc_tr)
lr_nt_acc_tst = model_lr_no_tweet.score(xrp_data_lr_no_tweet_tst3, xrp_target_test)
print('Test Accuracy:      LR - %0.3f' % lr_nt_acc_tst)
lr_nt_auc_tr = metrics.roc_auc_score(xrp_target_train, model_lr_no_tweet.predict_proba(xrp_data_lr_no_tweet_tr3)[:, 1])
print ('Training ROC AUC:  LR - %0.3f' % lr_nt_auc_tr)
lr_nt_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_lr_no_tweet[:, 1])
print ('Testing ROC AUC:   LR - %0.3f' % lr_nt_auc_tst)

### Build Decision Tree

#### Decision Tree Model Evaluation on Test Set 

#### With Tweet Data

In [None]:
# Compare two candidate decision trees (with tweets) by ROC AUC on the training
# and test sets. Both use the same hyper-parameters; they differ only in the
# feature matrices, which are expected to have been built by earlier cells.

#model1: xrp_data_dt_tr2 / xrp_data_dt_tst2
model_dt = dt(criterion = 'gini',class_weight = 'balanced', max_depth=5, min_samples_split=0.1, min_samples_leaf=0.1)
model_dt.fit(xrp_data_dt_tr2, xrp_target_train)
pred_model_dt = model_dt.predict(xrp_data_dt_tst2)
pred_probs_model_dt = model_dt.predict_proba(xrp_data_dt_tst2)
print ('ROC TR AUC:     DT1 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_dt.predict_proba(xrp_data_dt_tr2)[:, 1]))
print ('ROC TST AUC:    DT1 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_dt[:, 1]))

#model2: xrp_data_dt_tr3 / xrp_data_dt_tst3
# The printed tag is DT2 so it matches the model number (it previously read DT4).
model_dt = dt(criterion = 'gini',class_weight = 'balanced', max_depth=5, min_samples_split=0.1, min_samples_leaf=0.1)
model_dt.fit(xrp_data_dt_tr3, xrp_target_train)
pred_model_dt = model_dt.predict(xrp_data_dt_tst3)
pred_probs_model_dt = model_dt.predict_proba(xrp_data_dt_tst3)
print ('ROC TR AUC:     DT2 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_dt.predict_proba(xrp_data_dt_tr3)[:, 1]))
print ('ROC TST AUC:    DT2 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_dt[:, 1]))


#### Without Tweet Data

In [None]:
# Compare three candidate decision trees (no tweets) by ROC AUC on the training
# and test sets.

#model1: every column of the scaled no-tweet frames, max_depth=5
xrp_data_dt_no_tweet_tr2 = xrp_scaled_no_tweet_tr
xrp_data_dt_no_tweet_tst2 = xrp_scaled_no_tweet_tst
model_dt = dt(criterion = 'gini',class_weight = 'balanced', max_depth=5, min_samples_split=0.1, min_samples_leaf=0.1)
model_dt.fit(xrp_data_dt_no_tweet_tr2, xrp_target_train)
pred_model_dt = model_dt.predict(xrp_data_dt_no_tweet_tst2)
pred_probs_model_dt = model_dt.predict_proba(xrp_data_dt_no_tweet_tst2)
print ('ROC TR AUC:     DT1 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_dt.predict_proba(xrp_data_dt_no_tweet_tr2)[:, 1]))
print ('ROC TST AUC:    DT1 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_dt[:, 1]))

#model2: avg_return_L12h and avg_return_L6h only, max_depth=10
xrp_data_dt_no_tweet_tr2 = xrp_scaled_no_tweet_tr[['avg_return_L12h','avg_return_L6h']]
xrp_data_dt_no_tweet_tst2 = xrp_scaled_no_tweet_tst[['avg_return_L12h','avg_return_L6h']]
model_dt = dt(criterion = 'gini',class_weight = 'balanced', max_depth=10, min_samples_split=0.1, min_samples_leaf=0.1)
model_dt.fit(xrp_data_dt_no_tweet_tr2, xrp_target_train)
pred_model_dt = model_dt.predict(xrp_data_dt_no_tweet_tst2)
pred_probs_model_dt = model_dt.predict_proba(xrp_data_dt_no_tweet_tst2)
print ('ROC TR AUC:     DT2 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_dt.predict_proba(xrp_data_dt_no_tweet_tr2)[:, 1]))
print ('ROC TST AUC:    DT2 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_dt[:, 1]))


#model3: PC1 only, min_samples_split=0.2
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and works on older pandas releases as well.
xrp_data_dt_no_tweet_tr3 = xrp_train_no_tweet_PCA[['PC1']].values
xrp_data_dt_no_tweet_tst3 = xrp_test_no_tweet_PCA[['PC1']].values
model_dt = dt(criterion = 'gini',class_weight = 'balanced', max_depth=5, min_samples_split=0.2, min_samples_leaf=0.1)
model_dt.fit(xrp_data_dt_no_tweet_tr3, xrp_target_train)
pred_model_dt = model_dt.predict(xrp_data_dt_no_tweet_tst3)
pred_probs_model_dt = model_dt.predict_proba(xrp_data_dt_no_tweet_tst3)
print ('ROC TR AUC:     DT3 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_dt.predict_proba(xrp_data_dt_no_tweet_tr3)[:, 1]))
print ('ROC TST AUC:    DT3 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_dt[:, 1]))

#### Best features for Decision Tree According to Evaluation

#### With Tweet Data

In [None]:
# Final decision-tree inputs (with tweets): avg_return_L6h and avg_return_L12h.
dt_final_cols = ['avg_return_L6h','avg_return_L12h']
xrp_data_dt_tr2 = xrp_scaled_tr[dt_final_cols]
xrp_data_dt_tst2 = xrp_scaled_tst[dt_final_cols]

#### Without Tweet Data

In [None]:
# Final decision-tree inputs (no tweets): avg_return_L12h and avg_return_L6h.
dt_no_tweet_final_cols = ['avg_return_L12h','avg_return_L6h']
xrp_data_dt_no_tweet_tr2 = xrp_scaled_no_tweet_tr[dt_no_tweet_final_cols]
xrp_data_dt_no_tweet_tst2 = xrp_scaled_no_tweet_tst[dt_no_tweet_final_cols]

#### Decision Tree Model Parameters

#### With Tweet Data

In [None]:
# Final with-tweet decision tree: gini criterion, depth capped at 10, balanced
# class weights, split/leaf minimums given as the fraction 0.1.
dt_params = dict(criterion = 'gini', class_weight = 'balanced', max_depth = 10,
                 min_samples_split = 0.1, min_samples_leaf = 0.1)
model_dt = dt(**dt_params)
model_dt.fit(xrp_data_dt_tr2, xrp_target_train)

#### Without Tweet Data

In [None]:
# Final no-tweet decision tree: gini criterion, depth capped at 10, balanced
# class weights, split/leaf minimums given as the fraction 0.1.
dt_no_tweet_params = dict(criterion = 'gini', class_weight = 'balanced', max_depth = 10,
                          min_samples_split = 0.1, min_samples_leaf = 0.1)
model_dt_no_tweet = dt(**dt_no_tweet_params)
model_dt_no_tweet.fit(xrp_data_dt_no_tweet_tr2, xrp_target_train)

#### Apply Model to Test Set

#### With Tweet Data

In [None]:
# Score the test set with the with-tweet decision tree:
# hard class labels first, then the per-class probability matrix.
pred_model_dt, pred_probs_model_dt = (
    model_dt.predict(xrp_data_dt_tst2),
    model_dt.predict_proba(xrp_data_dt_tst2),
)

In [None]:
# Accuracy and ROC AUC of the with-tweet decision tree on train and test data.
# The test ROC AUC reuses pred_probs_model_dt, which must already be populated.
dt_acc_tr = model_dt.score(xrp_data_dt_tr2, xrp_target_train)
print('Training Accuracy:  DT - %0.3f' % dt_acc_tr)
dt_acc_tst = model_dt.score(xrp_data_dt_tst2, xrp_target_test)
print('Test Accuracy:      DT - %0.3f' % dt_acc_tst)
dt_auc_tr = metrics.roc_auc_score(xrp_target_train, model_dt.predict_proba(xrp_data_dt_tr2)[:, 1])
print ('Training ROC AUC:  DT - %0.3f' % dt_auc_tr)
dt_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_dt[:, 1])
print ('Testing ROC AUC:   DT - %0.3f' % dt_auc_tst)

#### Without Tweet Data

In [None]:
# Score the test set with the no-tweet decision tree:
# hard class labels first, then the per-class probability matrix.
pred_model_dt_no_tweet, pred_probs_model_dt_no_tweet = (
    model_dt_no_tweet.predict(xrp_data_dt_no_tweet_tst2),
    model_dt_no_tweet.predict_proba(xrp_data_dt_no_tweet_tst2),
)

In [None]:
# Accuracy and ROC AUC of the no-tweet decision tree on train and test data.
# The test ROC AUC reuses pred_probs_model_dt_no_tweet, which must already be populated.
dt_nt_acc_tr = model_dt_no_tweet.score(xrp_data_dt_no_tweet_tr2, xrp_target_train)
print('Training Accuracy:  DT - %0.3f' % dt_nt_acc_tr)
dt_nt_acc_tst = model_dt_no_tweet.score(xrp_data_dt_no_tweet_tst2, xrp_target_test)
print('Test Accuracy:      DT - %0.3f' % dt_nt_acc_tst)
dt_nt_auc_tr = metrics.roc_auc_score(xrp_target_train, model_dt_no_tweet.predict_proba(xrp_data_dt_no_tweet_tr2)[:, 1])
print ('Training ROC AUC:  DT - %0.3f' % dt_nt_auc_tr)
dt_nt_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_dt_no_tweet[:, 1])
print ('Testing ROC AUC:   DT - %0.3f' % dt_nt_auc_tst)

### Build SVM 

#### SVM Model Evaluation on Test Set 

#### With Tweet Data

In [None]:
# Compare two candidate SVMs (with tweets) by ROC AUC on the training and test sets.

#model1: every column of the scaled frames, RBF kernel, C=1, gamma=1
xrp_data_svm_tr2 = xrp_scaled_tr
xrp_data_svm_tst2 = xrp_scaled_tst
model_svm = svm.SVC(probability = True, kernel = 'rbf', C= 1, class_weight = 'balanced', gamma = 1)
model_svm.fit(xrp_data_svm_tr2, xrp_target_train)
pred_model_svm = model_svm.predict(xrp_data_svm_tst2)
pred_probs_model_svm = model_svm.predict_proba(xrp_data_svm_tst2)
print ('ROC TR AUC:     SVM1 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_svm.predict_proba(xrp_data_svm_tr2)[:, 1]))
print ('ROC TST AUC:    SVM1 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_svm[:, 1]))


#model2: PC1-PC5, RBF kernel, C=10, gamma=1
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and works on older pandas releases as well.
xrp_data_svm_tr3 = xrp_trainPCA[['PC1','PC2','PC3','PC4','PC5']].values
xrp_data_svm_tst3 = xrp_testPCA[['PC1','PC2','PC3','PC4','PC5']].values
model_svm = svm.SVC(probability = True,kernel = 'rbf',C= 10,class_weight = 'balanced', gamma = 1)
model_svm.fit(xrp_data_svm_tr3, xrp_target_train)
pred_model_svm = model_svm.predict(xrp_data_svm_tst3)
pred_probs_model_svm = model_svm.predict_proba(xrp_data_svm_tst3)
print ('ROC TR AUC:     SVM2 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_svm.predict_proba(xrp_data_svm_tr3)[:, 1]))
print ('ROC TST AUC:    SVM2 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_svm[:, 1]))


#### Without Tweet Data

In [None]:
# Compare two candidate SVMs (no tweets) by ROC AUC on the training and test sets.

#model1: every column of the scaled no-tweet frames, RBF kernel, C=10, gamma=1
# The inputs are stored under SVM-specific names. The original cell wrote them to
# xrp_data_lr_no_tweet_tr2/tst2, silently overwriting the logistic-regression
# variables of the same name.
xrp_data_svm_no_tweet_tr2 = xrp_scaled_no_tweet_tr
xrp_data_svm_no_tweet_tst2 = xrp_scaled_no_tweet_tst
model_svm = svm.SVC(probability = True, kernel = 'rbf', C= 10, class_weight = 'balanced', gamma = 1)
model_svm.fit(xrp_data_svm_no_tweet_tr2, xrp_target_train)
pred_model_svm = model_svm.predict(xrp_data_svm_no_tweet_tst2)
pred_probs_model_svm = model_svm.predict_proba(xrp_data_svm_no_tweet_tst2)
print ('ROC TR AUC:     SVM1 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_svm.predict_proba(xrp_data_svm_no_tweet_tr2)[:, 1]))
print ('ROC TST AUC:    SVM1 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_svm[:, 1]))


#model2: PC1 and PC2, linear kernel, C=100
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and works on older pandas releases as well.
xrp_data_svm_no_tweet_tr3 = xrp_train_no_tweet_PCA[['PC1','PC2']].values
xrp_data_svm_no_tweet_tst3 = xrp_test_no_tweet_PCA[['PC1','PC2']].values
model_svm = svm.SVC(probability = True,kernel = 'linear',C= 100,class_weight = 'balanced', gamma = 0.001)
model_svm.fit(xrp_data_svm_no_tweet_tr3, xrp_target_train)
pred_model_svm = model_svm.predict(xrp_data_svm_no_tweet_tst3)
pred_probs_model_svm = model_svm.predict_proba(xrp_data_svm_no_tweet_tst3)
print ('ROC TR AUC:     SVM2 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_svm.predict_proba(xrp_data_svm_no_tweet_tr3)[:, 1]))
print ('ROC TST AUC:    SVM2 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_svm[:, 1]))


####  SVM Model Parameters

#### With Tweet Data

In [None]:
# Final with-tweet SVM: RBF kernel, C=1, gamma=1, balanced class weights,
# with probability estimates enabled so predict_proba is available.
svm_params = dict(kernel = 'rbf', C = 1, gamma = 1, class_weight = 'balanced', probability = True)
model_svm = svm.SVC(**svm_params)
model_svm.fit(xrp_data_svm_tr2, xrp_target_train)

#### Without Tweet Data

In [None]:
# Final no-tweet SVM: linear kernel, C=100, gamma=0.001, balanced class weights,
# with probability estimates enabled so predict_proba is available.
svm_no_tweet_params = dict(kernel = 'linear', C = 100, gamma = 0.001,
                           class_weight = 'balanced', probability = True)
model_svm_no_tweet = svm.SVC(**svm_no_tweet_params)
model_svm_no_tweet.fit(xrp_data_svm_no_tweet_tr3, xrp_target_train)

#### Apply Model to Test Set

#### With Tweet Data

In [None]:
# Score the test set with the with-tweet SVM:
# hard class labels first, then the per-class probability matrix.
pred_model_svm, pred_probs_model_svm = (
    model_svm.predict(xrp_data_svm_tst2),
    model_svm.predict_proba(xrp_data_svm_tst2),
)

In [None]:
# Accuracy and ROC AUC of the with-tweet SVM on train and test data.
# The test ROC AUC reuses pred_probs_model_svm, which must already be populated.
svm_acc_tr = model_svm.score(xrp_data_svm_tr2, xrp_target_train)
print('Training Accuracy:  SVM - %0.3f' % svm_acc_tr)
svm_acc_tst = model_svm.score(xrp_data_svm_tst2, xrp_target_test)
print('Test Accuracy:      SVM - %0.3f' % svm_acc_tst)
svm_auc_tr = metrics.roc_auc_score(xrp_target_train, model_svm.predict_proba(xrp_data_svm_tr2)[:, 1])
print ('Training ROC AUC:  SVM - %0.3f' % svm_auc_tr)
svm_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_svm[:, 1])
print ('Testing ROC AUC:   SVM - %0.3f' % svm_auc_tst)

#### Without Tweet Data

In [None]:
# Score the test set with the no-tweet SVM:
# hard class labels first, then the per-class probability matrix.
pred_model_svm_no_tweet, pred_probs_model_svm_no_tweet = (
    model_svm_no_tweet.predict(xrp_data_svm_no_tweet_tst3),
    model_svm_no_tweet.predict_proba(xrp_data_svm_no_tweet_tst3),
)

In [None]:
# Accuracy and ROC AUC of the no-tweet SVM on train and test data.
# The test ROC AUC reuses pred_probs_model_svm_no_tweet, which must already be populated.
svm_nt_acc_tr = model_svm_no_tweet.score(xrp_data_svm_no_tweet_tr3, xrp_target_train)
print('Training Accuracy:  SVM - %0.3f' % svm_nt_acc_tr)
svm_nt_acc_tst = model_svm_no_tweet.score(xrp_data_svm_no_tweet_tst3, xrp_target_test)
print('Test Accuracy:      SVM - %0.3f' % svm_nt_acc_tst)
svm_nt_auc_tr = metrics.roc_auc_score(xrp_target_train, model_svm_no_tweet.predict_proba(xrp_data_svm_no_tweet_tr3)[:, 1])
print ('Training ROC AUC:  SVM - %0.3f' % svm_nt_auc_tr)
svm_nt_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_svm_no_tweet[:, 1])
print ('Testing ROC AUC:   SVM - %0.3f' % svm_nt_auc_tst)

### Build Naive Bayes

####  Naive Bayes Model Parameters

#### With Tweet Data

In [None]:
# Final with-tweet Gaussian Naive Bayes model (default settings), fit on the
# xrp_data_nb_tr3 training matrix.
nb_estimator_cls = nb.GaussianNB
model_nb = nb_estimator_cls()
model_nb.fit(xrp_data_nb_tr3, xrp_target_train)

#### Without Tweet Data

In [None]:
# Final no-tweet Gaussian Naive Bayes model (default settings), fit on the
# xrp_data_nb_no_tweet_tr3 training matrix.
nb_no_tweet_estimator_cls = nb.GaussianNB
model_nb_no_tweet = nb_no_tweet_estimator_cls()
model_nb_no_tweet.fit(xrp_data_nb_no_tweet_tr3, xrp_target_train)

#### Apply Model to Test Set

#### With Tweet Data

In [None]:
# Score the test set with the with-tweet Naive Bayes model:
# hard class labels first, then the per-class probability matrix.
pred_model_nb, pred_probs_model_nb = (
    model_nb.predict(xrp_data_nb_tst3),
    model_nb.predict_proba(xrp_data_nb_tst3),
)

In [None]:
# Accuracy and ROC AUC of the with-tweet Naive Bayes model on train and test data.
# The test ROC AUC reuses pred_probs_model_nb, which must already be populated.
nb_acc_tr = model_nb.score(xrp_data_nb_tr3, xrp_target_train)
print('Training Accuracy:  NB - %0.3f' % nb_acc_tr)
nb_acc_tst = model_nb.score(xrp_data_nb_tst3, xrp_target_test)
print('Test Accuracy:      NB - %0.3f' % nb_acc_tst)
nb_auc_tr = metrics.roc_auc_score(xrp_target_train, model_nb.predict_proba(xrp_data_nb_tr3)[:, 1])
print ('Training ROC AUC:  NB - %0.3f' % nb_auc_tr)
nb_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_nb[:, 1])
print ('Testing ROC AUC:   NB - %0.3f' % nb_auc_tst)

#### Without Tweet Data

In [None]:
# Score the test set with the no-tweet Naive Bayes model:
# hard class labels first, then the per-class probability matrix.
pred_model_nb_no_tweet, pred_probs_model_nb_no_tweet = (
    model_nb_no_tweet.predict(xrp_data_nb_no_tweet_tst3),
    model_nb_no_tweet.predict_proba(xrp_data_nb_no_tweet_tst3),
)

In [None]:
# Accuracy and ROC AUC of the no-tweet Naive Bayes model on train and test data.
# The test ROC AUC reuses pred_probs_model_nb_no_tweet, which must already be populated.
nb_nt_acc_tr = model_nb_no_tweet.score(xrp_data_nb_no_tweet_tr3, xrp_target_train)
print('Training Accuracy:  NB - %0.3f' % nb_nt_acc_tr)
nb_nt_acc_tst = model_nb_no_tweet.score(xrp_data_nb_no_tweet_tst3, xrp_target_test)
print('Test Accuracy:      NB - %0.3f' % nb_nt_acc_tst)
nb_nt_auc_tr = metrics.roc_auc_score(xrp_target_train, model_nb_no_tweet.predict_proba(xrp_data_nb_no_tweet_tr3)[:, 1])
print ('Training ROC AUC:  NB - %0.3f' % nb_nt_auc_tr)
nb_nt_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_nb_no_tweet[:, 1])
print ('Testing ROC AUC:   NB - %0.3f' % nb_nt_auc_tst)

### Build Random Forest

#### Model evaluation on Test Set 

#### With Tweet Data

In [None]:
# Compare four candidate random forests (with tweets) by ROC AUC on the training
# and test sets. All share max_depth=5, min_samples_leaf=0.1, min_samples_split=0.1,
# balanced class weights and random_state=42; they differ in features, bootstrap
# and number of trees.
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and works on older pandas releases as well.

#model1: 13 scaled columns, no bootstrap, 100 trees
xrp_data_rf_tr2 = xrp_scaled_tr[['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h', 'avg_senti',
                                   'avg_senti_L12h', 'avg_senti_L2h', 'avg_senti_L6h', 'fav_percent',
                                   'max_senti', 'min_senti', 'num_tweets', 'retweet_percent',
                                   'top_quartile_num_tweets']]
xrp_data_rf_tst2 = xrp_scaled_tst[['avg_return_L12h', 'avg_return_L2h', 'avg_return_L6h', 'avg_senti',
                                   'avg_senti_L12h', 'avg_senti_L2h', 'avg_senti_L6h', 'fav_percent',
                                   'max_senti', 'min_senti', 'num_tweets', 'retweet_percent',
                                   'top_quartile_num_tweets']]
model_rf = rf(class_weight = 'balanced' , random_state = 42 ,bootstrap = False, max_depth = 5,
             min_samples_leaf = 0.1, min_samples_split = 0.1, n_estimators = 100)
model_rf.fit(xrp_data_rf_tr2, xrp_target_train)
pred_model_rf = model_rf.predict(xrp_data_rf_tst2)
pred_probs_model_rf = model_rf.predict_proba(xrp_data_rf_tst2)
print ('ROC TR AUC:     RF1 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_rf.predict_proba(xrp_data_rf_tr2)[:, 1]))
print ('ROC TST AUC:    RF1 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_rf[:, 1]))


#model2: 7 scaled columns, no bootstrap, 500 trees
xrp_data_rf_tr2 = xrp_scaled_tr[['avg_return_L12h', 'avg_return_L6h', 'avg_senti','avg_senti_L2h', 'avg_senti_L6h',
                                   'max_senti', 'num_tweets']]
xrp_data_rf_tst2 = xrp_scaled_tst[['avg_return_L12h', 'avg_return_L6h', 'avg_senti','avg_senti_L2h', 'avg_senti_L6h',
                                   'max_senti', 'num_tweets']]
model_rf = rf(class_weight = 'balanced' , random_state = 42 ,bootstrap = False, max_depth = 5,
            min_samples_leaf = 0.1, min_samples_split = 0.1, n_estimators = 500)
model_rf.fit(xrp_data_rf_tr2, xrp_target_train)
pred_model_rf = model_rf.predict(xrp_data_rf_tst2)
pred_probs_model_rf = model_rf.predict_proba(xrp_data_rf_tst2)
print ('ROC TR AUC:     RF2 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_rf.predict_proba(xrp_data_rf_tr2)[:, 1]))
print ('ROC TST AUC:    RF2 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_rf[:, 1]))



#model3: PC3 and PC4, bootstrap, 300 trees
xrp_data_rf_tr3 = xrp_trainPCA[['PC3','PC4']].values
xrp_data_rf_tst3 = xrp_testPCA[['PC3','PC4']].values
model_rf = rf(class_weight = 'balanced' , random_state = 42 ,bootstrap = True, max_depth = 5,
            min_samples_leaf = 0.1, min_samples_split = 0.1, n_estimators = 300)
model_rf.fit(xrp_data_rf_tr3, xrp_target_train)
pred_model_rf = model_rf.predict(xrp_data_rf_tst3)
pred_probs_model_rf = model_rf.predict_proba(xrp_data_rf_tst3)
print ('ROC TR AUC:     RF3 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_rf.predict_proba(xrp_data_rf_tr3)[:, 1]))
print ('ROC TST AUC:    RF3 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_rf[:, 1]))



#model4: PC4 only, bootstrap, 500 trees
xrp_data_rf_tr3 = xrp_trainPCA[['PC4']].values
xrp_data_rf_tst3 = xrp_testPCA[['PC4']].values
model_rf = rf(class_weight = 'balanced' , random_state = 42 ,bootstrap = True, max_depth = 5,
            min_samples_leaf = 0.1, min_samples_split = 0.1, n_estimators = 500)
model_rf.fit(xrp_data_rf_tr3, xrp_target_train)
pred_model_rf = model_rf.predict(xrp_data_rf_tst3)
pred_probs_model_rf = model_rf.predict_proba(xrp_data_rf_tst3)
print ('ROC TR AUC:     RF4 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_rf.predict_proba(xrp_data_rf_tr3)[:, 1]))
print ('ROC TST AUC:    RF4 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_rf[:, 1]))

#### Without Tweet Data

In [None]:
# Compare two candidate random forests (no tweets) by ROC AUC on the training
# and test sets.

#model1: PC1 only, no bootstrap, min_samples_split=0.1, 100 trees
# DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
# ndarray and works on older pandas releases as well.
xrp_data_rf_no_tweet_tr3 = xrp_train_no_tweet_PCA[['PC1']].values
xrp_data_rf_no_tweet_tst3 = xrp_test_no_tweet_PCA[['PC1']].values
model_rf = rf(class_weight = 'balanced' , random_state = 42 ,bootstrap = False, max_depth = 5,
            min_samples_leaf = 0.1, min_samples_split = 0.1, n_estimators = 100)
model_rf.fit(xrp_data_rf_no_tweet_tr3, xrp_target_train)
pred_model_rf = model_rf.predict(xrp_data_rf_no_tweet_tst3)
pred_probs_model_rf = model_rf.predict_proba(xrp_data_rf_no_tweet_tst3)
print ('ROC TR AUC:     RF1 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_rf.predict_proba(xrp_data_rf_no_tweet_tr3)[:, 1]))
print ('ROC TST AUC:    RF1 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_rf[:, 1]))


#model2: the three scaled avg_return_* columns, no bootstrap, min_samples_split=0.3, 300 trees
xrp_data_rf_no_tweet_tr2 = xrp_scaled_no_tweet_tr[['avg_return_L6h','avg_return_L12h','avg_return_L2h']]
xrp_data_rf_no_tweet_tst2 = xrp_scaled_no_tweet_tst[['avg_return_L6h','avg_return_L12h','avg_return_L2h']]
model_rf = rf(class_weight = 'balanced' , random_state = 42 ,bootstrap = False, max_depth = 5,
            min_samples_leaf = 0.1, min_samples_split = 0.3, n_estimators = 300)
model_rf.fit(xrp_data_rf_no_tweet_tr2, xrp_target_train)
pred_model_rf = model_rf.predict(xrp_data_rf_no_tweet_tst2)
pred_probs_model_rf = model_rf.predict_proba(xrp_data_rf_no_tweet_tst2)
print ('ROC TR AUC:     RF2 - %0.3f'%metrics.roc_auc_score(xrp_target_train, model_rf.predict_proba(xrp_data_rf_no_tweet_tr2)[:, 1]))
print ('ROC TST AUC:    RF2 - %0.3f'%metrics.roc_auc_score(xrp_target_test, pred_probs_model_rf[:, 1]))


#### Best features for Random Forest According to Evaluation

#### With Tweet Data

In [None]:
# Final random-forest inputs (with tweets): the three avg_return_* columns.
rf_final_cols = ['avg_return_L6h','avg_return_L12h','avg_return_L2h']
xrp_data_rf_tr2 = xrp_scaled_tr[rf_final_cols]
xrp_data_rf_tst2 = xrp_scaled_tst[rf_final_cols]

#### Without Tweet Data

In [None]:
# Final random-forest inputs (no tweets): the three avg_return_* columns.
rf_no_tweet_final_cols = ['avg_return_L6h','avg_return_L12h','avg_return_L2h']
xrp_data_rf_no_tweet_tr2 = xrp_scaled_no_tweet_tr[rf_no_tweet_final_cols]
xrp_data_rf_no_tweet_tst2 = xrp_scaled_no_tweet_tst[rf_no_tweet_final_cols]

####  Random Forest Model Parameters

#### With Tweet Data

In [None]:
# Final with-tweet random forest: 300 trees, no bootstrap, depth capped at 5,
# balanced class weights, fixed random_state for repeatability.
rf_params = dict(n_estimators = 300, bootstrap = False, max_depth = 5,
                 min_samples_leaf = 0.1, min_samples_split = 0.3,
                 class_weight = 'balanced', random_state = 42)
model_rf = rf(**rf_params)

model_rf.fit(xrp_data_rf_tr2, xrp_target_train)

#### Without Tweet Data

In [None]:
# Final no-tweet random forest: 300 trees, no bootstrap, depth capped at 5,
# balanced class weights, fixed random_state for repeatability.
rf_no_tweet_params = dict(n_estimators = 300, bootstrap = False, max_depth = 5,
                          min_samples_leaf = 0.1, min_samples_split = 0.3,
                          class_weight = 'balanced', random_state = 42)
model_rf_no_tweet = rf(**rf_no_tweet_params)

model_rf_no_tweet.fit(xrp_data_rf_no_tweet_tr2, xrp_target_train)

#### Apply Model to Test Set

#### With Tweet Data

In [None]:
# Score the test set with the with-tweet random forest:
# hard class labels first, then the per-class probability matrix.
pred_model_rf, pred_probs_model_rf = (
    model_rf.predict(xrp_data_rf_tst2),
    model_rf.predict_proba(xrp_data_rf_tst2),
)

In [None]:
# Accuracy and ROC AUC of the with-tweet random forest on train and test data.
# The test ROC AUC reuses pred_probs_model_rf, which must already be populated.
rf_acc_tr = model_rf.score(xrp_data_rf_tr2, xrp_target_train)
print('Training Accuracy:  RF - %0.3f' % rf_acc_tr)
rf_acc_tst = model_rf.score(xrp_data_rf_tst2, xrp_target_test)
print('Test Accuracy:      RF - %0.3f' % rf_acc_tst)
rf_auc_tr = metrics.roc_auc_score(xrp_target_train, model_rf.predict_proba(xrp_data_rf_tr2)[:, 1])
print ('Training ROC AUC:  RF - %0.3f' % rf_auc_tr)
rf_auc_tst = metrics.roc_auc_score(xrp_target_test, pred_probs_model_rf[:, 1])
print ('Testing ROC AUC:   RF - %0.3f' % rf_auc_tst)

#### Without Tweet Data

In [None]:
# Score the test set with the no-tweet random forest:
# hard class labels first, then the per-class probability matrix.
pred_model_rf_no_tweet, pred_probs_model_rf_no_tweet = (
    model_rf_no_tweet.predict(xrp_data_rf_no_tweet_tst2),
    model_rf_no_tweet.predict_proba(xrp_data_rf_no_tweet_tst2),
)

In [None]:
# Score the no-tweet Random Forest on both splits first, then print.
rf_no_tweet_train_acc = model_rf_no_tweet.score(xrp_data_rf_no_tweet_tr2, xrp_target_train)
rf_no_tweet_test_acc = model_rf_no_tweet.score(xrp_data_rf_no_tweet_tst2, xrp_target_test)
# ROC AUC is computed from column 1 of the predict_proba output.
rf_no_tweet_train_auc = metrics.roc_auc_score(
    xrp_target_train, model_rf_no_tweet.predict_proba(xrp_data_rf_no_tweet_tr2)[:, 1]
)
rf_no_tweet_test_auc = metrics.roc_auc_score(
    xrp_target_test, pred_probs_model_rf_no_tweet[:, 1]
)

print('Training Accuracy:  RF - %0.3f' % rf_no_tweet_train_acc)
print('Test Accuracy:      RF - %0.3f' % rf_no_tweet_test_acc)
print('Training ROC AUC:  RF - %0.3f' % rf_no_tweet_train_auc)
print('Testing ROC AUC:   RF - %0.3f' % rf_no_tweet_test_auc)

# Evaluation

### Prepare Data for ROC

In [None]:
def _positive_class_scores(model, features):
    """Return column 1 of ``model.predict_proba(features)`` (scores for target ind = 1)."""
    return model.predict_proba(features)[:, 1]


# ---- With tweets: per-model scores, then the ROC points against the test target ----
pred_probs_target_lr = _positive_class_scores(model_lr, xrp_data_lr_tst3)
fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(xrp_target_test, pred_probs_target_lr)

pred_probs_target_dt = _positive_class_scores(model_dt, xrp_data_dt_tst2)
fpr_dt, tpr_dt, thresholds_dt = metrics.roc_curve(xrp_target_test, pred_probs_target_dt)

# NOTE(review): the with-tweet SVM is scored on *_tst2 while the no-tweet SVM
# below is scored on *_tst3 -- confirm each matches the features its model was fit on.
pred_probs_target_svm = _positive_class_scores(model_svm, xrp_data_svm_tst2)
fpr_svm, tpr_svm, thresholds_svm = metrics.roc_curve(xrp_target_test, pred_probs_target_svm)

pred_probs_target_nb = _positive_class_scores(model_nb, xrp_data_nb_tst3)
fpr_nb, tpr_nb, thresholds_nb = metrics.roc_curve(xrp_target_test, pred_probs_target_nb)

pred_probs_target_rf = _positive_class_scores(model_rf, xrp_data_rf_tst2)
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(xrp_target_test, pred_probs_target_rf)

# ---- Without tweets: same two steps for the *_no_tweet models ----
pred_probs_target_lr_no_tweet = _positive_class_scores(model_lr_no_tweet, xrp_data_lr_no_tweet_tst3)
fpr_lr_no_tweet, tpr_lr_no_tweet, thresholds_lr_no_tweet = metrics.roc_curve(
    xrp_target_test, pred_probs_target_lr_no_tweet)

pred_probs_target_dt_no_tweet = _positive_class_scores(model_dt_no_tweet, xrp_data_dt_no_tweet_tst2)
fpr_dt_no_tweet, tpr_dt_no_tweet, thresholds_dt_no_tweet = metrics.roc_curve(
    xrp_target_test, pred_probs_target_dt_no_tweet)

pred_probs_target_svm_no_tweet = _positive_class_scores(model_svm_no_tweet, xrp_data_svm_no_tweet_tst3)
fpr_svm_no_tweet, tpr_svm_no_tweet, thresholds_svm_no_tweet = metrics.roc_curve(
    xrp_target_test, pred_probs_target_svm_no_tweet)

pred_probs_target_nb_no_tweet = _positive_class_scores(model_nb_no_tweet, xrp_data_nb_no_tweet_tst3)
fpr_nb_no_tweet, tpr_nb_no_tweet, thresholds_nb_no_tweet = metrics.roc_curve(
    xrp_target_test, pred_probs_target_nb_no_tweet)

pred_probs_target_rf_no_tweet = _positive_class_scores(model_rf_no_tweet, xrp_data_rf_no_tweet_tst2)
fpr_rf_no_tweet, tpr_rf_no_tweet, thresholds_rf_no_tweet = metrics.roc_curve(
    xrp_target_test, pred_probs_target_rf_no_tweet)

### ROC Curve

In [None]:
sns.set_style('white')


def plot_roc_pair(model_tag, fpr_tweet, tpr_tweet, fpr_plain, tpr_plain,
                  color_tweet, color_plain, base_font=12):
    """Draw one ROC figure comparing a model with and without tweet features.

    Parameters
    ----------
    model_tag : str
        Short model name used in the legend (e.g. 'LR').
    fpr_tweet, tpr_tweet : array-like
        ROC points of the with-tweet model (legend prefix 'Exp3').
    fpr_plain, tpr_plain : array-like
        ROC points of the no-tweet model (legend prefix 'Exp4').
    color_tweet, color_plain : str
        Line colours for the two curves.
    base_font : int, default 12
        Font size of legend and tick labels; axis labels use ``base_font + 1``.
        The title is always drawn at size 14.
    """
    plt.figure(figsize=(6, 5))
    plt.plot(fpr_tweet, tpr_tweet, linewidth=2.5, linestyle='--', color=color_tweet,
             label='Exp3: %s AUC - %0.3f' % (model_tag, metrics.auc(fpr_tweet, tpr_tweet)))
    plt.plot(fpr_plain, tpr_plain, linewidth=2.5, linestyle='--', color=color_plain,
             label='Exp4: %s AUC - %0.3f' % (model_tag, metrics.auc(fpr_plain, tpr_plain)))
    # Chance diagonal. Drawn in grey from (0, 0) to (1, 1): the previous white
    # line was invisible on the 'white' seaborn style and painted white dashes
    # over the curves, and it borrowed the LR no-tweet FPR array as coordinates.
    plt.plot([0, 1], [0, 1], linewidth=1.5, linestyle='--', color='grey')
    # plot details
    plt.legend(loc="lower right", borderaxespad=0., fontsize=base_font)
    plt.xlim(-0.02, 1.02)
    plt.ylim(-0.02, 1.02)
    plt.title('Ripple ROC Curve', fontsize=14)
    plt.xlabel('False Positive Rate', fontsize=base_font + 1)
    plt.ylabel('True Positive Rate', fontsize=base_font + 1)
    plt.xticks(fontsize=base_font)
    plt.yticks(fontsize=base_font)
    plt.show()


# One figure per model, with-tweet curve vs. no-tweet curve.
# Logistic Regression (kept at its original, slightly smaller font sizes)
plot_roc_pair('LR', fpr_lr, tpr_lr, fpr_lr_no_tweet, tpr_lr_no_tweet,
              '#9b59b6', '#350050', base_font=11)
# Decision Tree
plot_roc_pair('DT', fpr_dt, tpr_dt, fpr_dt_no_tweet, tpr_dt_no_tweet,
              '#3498db', '#00195C')
# Support Vector Machine
plot_roc_pair('SVM', fpr_svm, tpr_svm, fpr_svm_no_tweet, tpr_svm_no_tweet,
              '#e74c3c', '#4e0000')
# Naive Bayes
plot_roc_pair('NB', fpr_nb, tpr_nb, fpr_nb_no_tweet, tpr_nb_no_tweet,
              '#34495e', '#cde2f7')
# Random Forest (title previously read 'Ripple ROC Curve Curve')
plot_roc_pair('RF', fpr_rf, tpr_rf, fpr_rf_no_tweet, tpr_rf_no_tweet,
              '#2ecc71', '#00660b')


## Confusion Matrix

#### With Tweet Data

In [None]:
def _print_model_report(title, y_pred, y_prob, tn, fp, fn, tp):
    """Print one model's test-set summary, scored against ``xrp_target_test``.

    Emits, in order: the title, ROC AUC (from column 1 of ``y_prob``), accuracy,
    the four confusion-matrix counts, and the classification report.
    """
    print(title)
    print('ROC AUC: %0.3f' % metrics.roc_auc_score(xrp_target_test, y_prob[:, 1]))
    print('Accuracy: %0.3f' % metrics.accuracy_score(xrp_target_test, y_pred))
    print('TN: %d' % tn, 'FP: %d' % fp, 'FN: %d' % fn, 'TP: %d' % tp)
    print(metrics.classification_report(xrp_target_test, y_pred))


# Flattened confusion matrices for the with-tweet models.
# NOTE(review): `confusion_matrix` is defined outside this cell -- presumably
# sklearn.metrics.confusion_matrix; the tn/fp/fn/tp unpacking assumes a binary target.
tn_lr, fp_lr, fn_lr, tp_lr = confusion_matrix(xrp_target_test, pred_model_lr).ravel()
tn_dt, fp_dt, fn_dt, tp_dt = confusion_matrix(xrp_target_test, pred_model_dt).ravel()
tn_svm, fp_svm, fn_svm, tp_svm = confusion_matrix(xrp_target_test, pred_model_svm).ravel()
tn_nb, fp_nb, fn_nb, tp_nb = confusion_matrix(xrp_target_test, pred_model_nb).ravel()
tn_rf, fp_rf, fn_rf, tp_rf = confusion_matrix(xrp_target_test, pred_model_rf).ravel()

# Mean of the test target (the positive share, assuming 0/1 encoding -- TODO confirm).
print('Prevalence = %0.3f' % (sum(xrp_target_test) / len(xrp_target_test)))
print(' ')

_print_model_report('Logistic Regression Model', pred_model_lr, pred_probs_model_lr,
                    tn_lr, fp_lr, fn_lr, tp_lr)
_print_model_report('Decision Tree Model', pred_model_dt, pred_probs_model_dt,
                    tn_dt, fp_dt, fn_dt, tp_dt)
_print_model_report('SVM Model', pred_model_svm, pred_probs_model_svm,
                    tn_svm, fp_svm, fn_svm, tp_svm)
_print_model_report('Naive Bayes Model', pred_model_nb, pred_probs_model_nb,
                    tn_nb, fp_nb, fn_nb, tp_nb)
_print_model_report('Random Forest Model', pred_model_rf, pred_probs_model_rf,
                    tn_rf, fp_rf, fn_rf, tp_rf)

#### Without Tweet Data

In [None]:
def _print_model_report(title, y_pred, y_prob, tn, fp, fn, tp):
    """Print one model's test-set summary, scored against ``xrp_target_test``.

    Emits, in order: the title, ROC AUC (from column 1 of ``y_prob``), accuracy,
    the four confusion-matrix counts, and the classification report.
    """
    print(title)
    print('ROC AUC: %0.3f' % metrics.roc_auc_score(xrp_target_test, y_prob[:, 1]))
    print('Accuracy: %0.3f' % metrics.accuracy_score(xrp_target_test, y_pred))
    print('TN: %d' % tn, 'FP: %d' % fp, 'FN: %d' % fn, 'TP: %d' % tp)
    print(metrics.classification_report(xrp_target_test, y_pred))


# Flattened confusion matrices for the no-tweet models.
# NOTE(review): `confusion_matrix` is defined outside this cell -- presumably
# sklearn.metrics.confusion_matrix; the tn/fp/fn/tp unpacking assumes a binary target.
tn_lr_no_tweet, fp_lr_no_tweet, fn_lr_no_tweet, tp_lr_no_tweet = confusion_matrix(
    xrp_target_test, pred_model_lr_no_tweet).ravel()
tn_dt_no_tweet, fp_dt_no_tweet, fn_dt_no_tweet, tp_dt_no_tweet = confusion_matrix(
    xrp_target_test, pred_model_dt_no_tweet).ravel()
tn_svm_no_tweet, fp_svm_no_tweet, fn_svm_no_tweet, tp_svm_no_tweet = confusion_matrix(
    xrp_target_test, pred_model_svm_no_tweet).ravel()
tn_nb_no_tweet, fp_nb_no_tweet, fn_nb_no_tweet, tp_nb_no_tweet = confusion_matrix(
    xrp_target_test, pred_model_nb_no_tweet).ravel()
tn_rf_no_tweet, fp_rf_no_tweet, fn_rf_no_tweet, tp_rf_no_tweet = confusion_matrix(
    xrp_target_test, pred_model_rf_no_tweet).ravel()

# Mean of the test target (the positive share, assuming 0/1 encoding -- TODO confirm).
print('Prevalence = %0.3f' % (sum(xrp_target_test) / len(xrp_target_test)))
print(' ')

_print_model_report('Logistic Regression Model',
                    pred_model_lr_no_tweet, pred_probs_model_lr_no_tweet,
                    tn_lr_no_tweet, fp_lr_no_tweet, fn_lr_no_tweet, tp_lr_no_tweet)
_print_model_report('Decision Tree Model',
                    pred_model_dt_no_tweet, pred_probs_model_dt_no_tweet,
                    tn_dt_no_tweet, fp_dt_no_tweet, fn_dt_no_tweet, tp_dt_no_tweet)
_print_model_report('SVM Model',
                    pred_model_svm_no_tweet, pred_probs_model_svm_no_tweet,
                    tn_svm_no_tweet, fp_svm_no_tweet, fn_svm_no_tweet, tp_svm_no_tweet)
_print_model_report('Naive Bayes Model',
                    pred_model_nb_no_tweet, pred_probs_model_nb_no_tweet,
                    tn_nb_no_tweet, fp_nb_no_tweet, fn_nb_no_tweet, tp_nb_no_tweet)
_print_model_report('Random Forest Model',
                    pred_model_rf_no_tweet, pred_probs_model_rf_no_tweet,
                    tn_rf_no_tweet, fp_rf_no_tweet, fn_rf_no_tweet, tp_rf_no_tweet)