In [27]:
pd.set_option("display.max_rows", 400)

In [63]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn import ensemble
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

#os.chdir("../data")
dataset = pd.read_csv("Dataset.csv")
dataset.drop('Unnamed: 0', axis=1, inplace=True)
#dataset.set_index("date")

#add label
price = dataset["price"]
pre_price = price.shift(-1)

labels = []
up = 1
down = 0
for i in range(len(dataset)):
    if price[i] <= pre_price[i]:
        labels.append(up)
    else:
        labels.append(down)

dataset["label"] = labels

#パラメータ初期値
dt_param1 = 'gini'
dt_param2 = 1
rf_param1 = 'gini'
rf_param2 = 1
rf_param3 = 12
sgdc_param1 = 0.0001
sgdc_param2 = 'log'
sgdc_param3 = 43
sgdc_param4 = 'elasticnet'
sgdc_param5 = False
svm_param1 = 1
svm_param2 = 2
svm_param3 = 0.001
svm_param4 = 'poly'

#Model定義
clf_dt = tree.DecisionTreeClassifier(criterion=dt_param1, max_depth=dt_param2)
clf_rf = ensemble.RandomForestClassifier(criterion=rf_param1, max_depth=rf_param2, n_estimators=rf_param2)
clf_sgdc = SGDClassifier(alpha=sgdc_param1, loss=sgdc_param2, max_iter=sgdc_param3, penalty=sgdc_param4, shuffle=sgdc_param5)
clf_svm = SVC(C=svm_param1, degree=svm_param2, gamma=svm_param3, kernel=svm_param4)

#X, y定義
X = dataset.drop("label", axis=1)
y = dataset["label"]
X = X.set_index('date')

#正規化
X_array = np.array(X)
def zscore(X, axis=None):
    xmean = X.mean(axis=axis, keepdims=True)
    xstd = np.std(X, axis=axis, keepdims=True)
    zscore = (X-xmean)/xstd
    return zscore

X_norm = zscore(X_array)

#データ分割
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2)#random_state=0

#パラメータ最適化/Decision Tree
def dtbestparam(X, y):
    features = X
    targets = y
    #試行するパラメータを並べる
    params = {
        'max_depth' : list(range(1, 20)),
        'criterion' : ['gini', 'entropy'],
        }
    grid_search = GridSearchCV(clf_dt, #分類器を渡す
                                param_grid=params, #試行して欲しいパラメータを渡す
                                cv=10, # 10-Fold CVで汎化性能を調べる
                                )
    grid_search.fit(features, targets)
    print(grid_search.best_score_, grid_search.best_params_)
    return grid_search.best_params_

#パラメータ最適化/ensemble.RandomForestClassifier
def rfbestparam(X, y):
    features = X
    targets = y
    #試行するパラメータを並べる
    params = {
        'max_depth' : list(range(1, 20)),
        'criterion' : ['gini', 'entropy'],
        'n_estimators' : list(range(1, 20)),
        }
    grid_search = GridSearchCV(clf_rf, #分類器を渡す
                                param_grid=params, #試行して欲しいパラメータを渡す
                                cv=10, # 10-Fold CVで汎化性能を調べる
                                )
    grid_search.fit(features, targets)
    print(grid_search.best_score_, grid_search.best_params_)
    return grid_search.best_params_

#パラメータ最適化/SGDClassifier
def sgdcbestparam(X, y):
    features = X
    targets = y
    #試行するパラメータを並べる
    params = {
        'loss' : ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive',
        'squared_epsilon_insensitive'],
        'penalty' : ['None', 'l2', 'l1', 'elasticnet'],
        'alpha' : list(np.arange(0.0001, 0.1, 10)),
        'max_iter' : list(range(5, 100)),
        'shuffle' : [False, True],
        }
    grid_search = GridSearchCV(clf_sgdc, #分類器を渡す
                                param_grid=params, #試行して欲しいパラメータを渡す
                                cv=10, # 10-Fold CVで汎化性能を調べる
                                )
    grid_search.fit(features, targets)
    print(grid_search.best_score_, grid_search.best_params_)
    return grid_search.best_params_

#パラメータ最適化/SVM
def svmbestparam(X, y):
    features = X
    targets = y
    #試行するパラメータを並べる
    params = {
        'C' : [1, 10, 100, 1000],
        'kernel' : ['poly', 'rbf', 'sigmoid'],
        'degree' : [2, 3, 4],
        'gamma' : [0.001, 0.0001],
        }
    grid_search = GridSearchCV(clf_svm, #分類器を渡す
                                param_grid=params, #試行して欲しいパラメータを渡す
                                cv=10, # 10-Fold CVで汎化性能を調べる
                                )
    grid_search.fit(features, targets)
    print(grid_search.best_score_, grid_search.best_params_)
    return grid_search.best_params_

#Xとx_norm(正規化されたX)を各モデルに入れ、ベストパラメータを表示
#dtbestparam(X, y)
dtbestparam(X_norm, y)
#取得したベストパラメータに代入、モデルのパラメータを更新(X_normの学習値を使用)
dt_bestparam = dtbestparam(X_norm, y)
dt_param1 = dt_bestparam['criterion']
dt_param2 = dt_bestparam['max_depth']
#Xとx_norm(正規化されたX)を各モデルに入れ、ベストパラメータを表示
#rfbestparam(X, y)
rfbestparam(X_norm, y)
#取得したベストパラメータに代入、モデルのパラメータを更新(X_normの学習値を使用)
rf_bestparam = rfbestparam(X_norm, y)
rf_param1 = rf_bestparam['criterion']
rf_param2 = rf_bestparam['max_depth']
rf_param3 = rf_bestparam['n_estimators']
#Xとx_norm(正規化されたX)を各モデルに入れ、ベストパラメータを表示
#sgdcbestparam(X, y)
sgdcbestparam(X_norm, y)
#取得したベストパラメータに代入、モデルのパラメータを更新(X_normの学習値を使用)
sgdc_bestparam = sgdcbestparam(X_norm, y)
sgdc_param1 = sgdc_bestparam['alpha']
sgdc_param2 = sgdc_bestparam['loss']
sgdc_param3 = sgdc_bestparam['max_iter']
sgdc_param4 = sgdc_bestparam['penalty']
sgdc_param5 = sgdc_bestparam['shuffle']
#Xとx_norm(正規化されたX)を各モデルに入れ、ベストパラメータを表示
#svmbestparam(X, y)
svmbestparam(X_norm, y)
#取得したベストパラメータに代入、モデルのパラメータを更新(X_normの学習値を使用)
svm_bestparam = svmbestparam(X_norm, y)
svm_param1 = svm_bestparam['C']
svm_param2 = svm_bestparam['degree']
svm_param3 = svm_bestparam['gamma']
svm_param4 = svm_bestparam['kernel']

#Model再定義  
clf_dt = tree.DecisionTreeClassifier(criterion=dt_param1, max_depth=dt_param2)
clf_rf = ensemble.RandomForestClassifier(criterion=rf_param1, max_depth=rf_param2, n_estimators=rf_param2)
clf_sgdc = SGDClassifier(alpha=sgdc_param1, loss=sgdc_param2, max_iter=sgdc_param3, penalty=sgdc_param4, shuffle=sgdc_param5)
clf_svm = SVC(C=svm_param1, degree=svm_param2, gamma=svm_param3, kernel=svm_param4)

#学習
clf_dt.fit(X_train, y_train)
clf_rf.fit(X_train, y_train)
clf_sgdc.fit(X_train, y_train)
clf_svm.fit(X_train, y_train)

#予測
score_dt = clf_dt.score(X_test, y_test)
score_rf = clf_rf.score(X_test, y_test)
score_sgdc = clf_sgdc.score(X_test, y_test)
score_svm = clf_svm.score(X_test, y_test)
print("score_dt:", score_dt)
print("score_rf:", score_rf)
print("score_sgdc:", score_sgdc)
print("score_svm:", score_svm)

0.608219178082 {'criterion': 'gini', 'max_depth': 1}
0.608219178082 {'criterion': 'gini', 'max_depth': 1}
0.630136986301 {'criterion': 'gini', 'max_depth': 1, 'n_estimators': 9}
0.627397260274 {'criterion': 'gini', 'max_depth': 2, 'n_estimators': 3}
0.66301369863 {'alpha': 0.0001, 'loss': 'huber', 'max_iter': 41, 'penalty': 'l2', 'shuffle': True}
0.657534246575 {'alpha': 0.0001, 'loss': 'huber', 'max_iter': 22, 'penalty': 'l1', 'shuffle': True}
0.624657534247 {'C': 1, 'degree': 2, 'gamma': 0.001, 'kernel': 'poly'}
0.624657534247 {'C': 1, 'degree': 2, 'gamma': 0.001, 'kernel': 'poly'}
score_dt: 0.643835616438
score_rf: 0.643835616438
score_sgdc: 0.616438356164
score_svm: 0.671232876712


In [64]:
'''Emotion'''
import pandas as pd
import numpy as np
import glob
import os

path = "/Users/user/jupyter/DiveIntoCode/DIC05_BTC/final_tweet_data.csv"
tweet = pd.read_csv(path)
tweet = tweet[["created_at", "text"]][:-1]
#tweet["text"] = tweet["text"].replace(np.nan, 0)
text = tweet["text"]
text.reset_index(drop=True, inplace=True)
text.replace(np.nan, "1", inplace=True)

from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types

client = language.LanguageServiceClient()
result = []
for i in range(len(text)):
    word = text[i]
    document = types.Document(
        content=word,
        type=enums.Document.Type.PLAIN_TEXT
    )
    sentiment = client.analyze_sentiment(document=document).document_sentiment
    result.append(sentiment.score)
# print("Text : {}".format(word))
# print("Sentiment : {},{}".format(sentiment.score, sentiment.magnitude))

tweet["score"] = result
tweet["label"] = labels

Unnamed: 0,created_at,text,score,label
0,2017-01-06 16:23:22,PBOC Meets With Leading Chinese #Bitcoin Excha...,0.3,1
1,,1,0.0,0
2,2017-01-09 19:19:10,Barry Silbert Shares DGC’s Perspective on #Eth...,0.0,1
3,2017-01-10 09:23:52,イーサリアムの新構想「エンタープライズ版イーサリアム」がお披露目へ - https://t....,-0.1,0
4,2017-01-11 14:49:03,Qtum: Connecting #Blockchain Technology With t...,0.2,1
5,2017-01-12 21:31:59,Here’s Why India’s GBMiners Mining Pool Switch...,0.0,1
6,2017-01-13 23:49:33,Ethereum Classic Hard Forks; Diffuses ‘Difficu...,0.7,0
7,2017-01-14 00:39:08,Feature Interview: North American Bitcoin Conf...,0.1,1
8,,1,0.0,1
9,2017-01-16 20:07:16,"Don Tapscott Predicts ""Blockchain Davos"" at Wo...",0.0,1


In [62]:
# date = dataset["date"]
# tweet = tweet.set_index(date)
# tweet.drop(["created_at"],axis=1,inplace=True)
# dataset.index = dataset.pop("date")

Unnamed: 0_level_0,average block size,difficulty,estimated transaction volumes in usd,estimated transaction volumes,hash rate,market capitalization,miners revenues,number of orphaned blocks,number of transactions per blocks,number of unique addresses,total bitcoins,trade volumes,transaction fees,price,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2017-01-07 00:00:00,0.987337,317688400000.0,310258500.0,344066.519218,2195140.0,14507900000.0,1583005.0,1583005.0,2048.338129,503340.0,16088787.5,16088787.5,107.136643,896.830375,1
2017-01-08 00:00:00,0.810419,317688400000.0,211475100.0,230638.883902,2637327.0,14753880000.0,1915883.0,1915883.0,1557.491018,455738.0,16090875.0,16090875.0,91.241348,908.149037,0
2017-01-09 00:00:00,0.923003,317688400000.0,267769200.0,297521.311414,2147763.0,14483320000.0,1548900.0,1548900.0,1984.904412,475571.0,16092575.0,16092575.0,102.045927,894.18025,1
2017-01-10 00:00:00,0.95972,319489500000.0,284258000.0,311898.393375,2679582.0,14668270000.0,1842810.0,1842810.0,2024.3,541167.0,16094575.0,16094575.0,119.940871,906.056914,0
2017-01-11 00:00:00,0.923016,336899900000.0,389299100.0,495840.25635,2562350.0,12637840000.0,1522760.0,1522760.0,1980.771242,491692.0,16096487.5,16096487.5,111.820875,785.223737,1
2017-01-12 00:00:00,0.802814,336899900000.0,317369300.0,394742.846166,2897298.0,12943140000.0,1750688.0,1750688.0,1721.132948,483077.0,16098637.5,16098637.5,106.356373,803.372375,1
2017-01-13 00:00:00,0.841531,336899900000.0,256962400.0,310296.050079,2528855.0,13333160000.0,1577155.0,1577155.0,1822.178808,466560.0,16100512.5,16100512.5,97.90854,826.295663,0
2017-01-14 00:00:00,0.742767,336899900000.0,154658300.0,189694.960647,2880550.0,13128500000.0,1759417.0,1759417.0,1408.540698,446468.0,16102662.5,16102662.5,86.856662,817.912875,1
2017-01-15 00:00:00,0.677068,336899900000.0,159034200.0,193769.248545,2796814.0,13217810000.0,1716578.0,1716578.0,1413.167665,417278.0,16104750.0,16104750.0,82.224321,822.2076,1
2017-01-16 00:00:00,0.758335,336899900000.0,207038800.0,249414.310177,2830308.0,13370310000.0,1769358.0,1769358.0,1583.668639,473646.0,16106862.5,16106862.5,103.642703,830.505125,1


In [69]:
#os.chdir("../TextData")
#pd.read_csv("btcnews_jp_20180113.csv")

Unnamed: 0,id,created_at,text,fav,RT
0,951708768626200576,2018-01-12 06:53:46,韓国法務部、ビットコイン取引所の全面禁止法案　方針曲げず（BTCN） - https://t...,103,90
1,951378432197365760,2018-01-11 09:01:08,投資の神様バフェット氏が再び仮想通貨を攻撃　「悲惨な最後を迎える」（BTCN） - http...,43,16
2,951316997291298817,2018-01-11 04:57:01,韓国取引所禁止法と中国マイニング禁止令、連鎖する報道でビットコイン急落-10%（BTCN） ...,116,108
3,951036145588162561,2018-01-10 10:21:01,「ブロックサイズを引き上げることへのリスクとは？」Jimmy Song氏　動画インタビュー ...,26,9
4,951016021531549696,2018-01-10 09:01:03,仮想通貨トロン　ホワイトペーパーにコピペ疑惑（BTCN） - https://t.co/d5...,123,78
5,950925669311176704,2018-01-10 03:02:01,週間ビットコイン相場 2018/1/9 −　ビットコイン　下降トレンドから脱出か（BTCN）...,16,8
6,950910315545456642,2018-01-10 02:01:00,「ソフトフォークとハードフォークの違いとは？」Jimmy Song氏　動画インタビュー Vo...,10,1
7,950653640095739905,2018-01-09 09:01:04,中国マイニングファームViaBTC　クラウドマイニング市場の提供を停止、当局の規制が影響か（...,48,39
8,949204079514763265,2018-01-05 09:01:02,元リップルCEOの資産がマーク・ザッカーバーグを超える（BTCN） - https://t....,114,63
9,949187862875996160,2018-01-05 07:56:36,中国PBoC、ビットコインのマイニング業者への優遇政策を廃止へ（BTCN） - https:...,46,36


In [None]:
#add label
"""
price = dataset["price"]
pre_price = price.shift(-1)

labels = []
up = 1
down = 0
for i in range(len(dataset)):
    if price[i] <= pre_price[i]:
        labels.append(up)
    else:
        labels.append(down)
    
dataset["label"] = labels

#Model定義
clf_dt = tree.DecisionTreeClassifier()
clf_rf = ensemble.RandomForestClassifier()
clf_sgdc = SGDClassifier()
"""