# YahooFinanceAPI_株価予測モデル作成_v1
## 基礎：株価を取得する

In [17]:
from yahoo_finance_api2 import share
from yahoo_finance_api2.exceptions import YahooFinanceError
import pandas as pd

code = 3323 # target: numeric ticker code; kabuka() appends '.T' to build the symbol
S_year = 10 # period length passed to get_historical with share.PERIOD_TYPE_YEAR
S_day = 1 # frequency passed to get_historical with share.FREQUENCY_TYPE_DAY

# Build the price table from which the target variable is taken.
def kabuka():
    """Download the price history for the configured ticker.

    Reads the module-level settings ``code``, ``S_year`` and ``S_day`` and
    queries ``share.Share(...).get_historical`` with a year-based period and a
    day-based frequency.

    Returns:
        tuple: ``(company_code, df_base)`` where ``company_code`` is
        ``str(code) + '.T'`` and ``df_base`` is a DataFrame with one column per
        key of the returned payload, a default integer index, and the
        ``timestamp`` column converted from epoch milliseconds to datetimes.

    Raises:
        SystemExit: with status 1 when the API raises ``YahooFinanceError`` or
            returns an empty payload.
    """
    # Imported locally because sys.exit is used below and the file's visible
    # import block does not bring ``sys`` into scope.
    import sys

    company_code = str(code) + '.T'
    my_share = share.Share(company_code)

    try:
        symbol_data = my_share.get_historical(share.PERIOD_TYPE_YEAR,
                                              S_year,
                                              share.FREQUENCY_TYPE_DAY,
                                              S_day)
    except YahooFinanceError as e:
        print(e.message)
        sys.exit(1)

    # Without this guard an empty/None payload fails later with an opaque
    # AttributeError on ``.values()``.
    if not symbol_data:
        print('No historical data returned for ' + company_code)
        sys.exit(1)

    # One row per payload key, then transpose so that each key becomes a
    # column; the resulting index is the default 0..n-1 range.
    df_base = pd.DataFrame(symbol_data.values(), index=symbol_data.keys()).T
    df_base['timestamp'] = pd.to_datetime(df_base['timestamp'], unit='ms')

    return company_code, df_base

# Fetch once; ``result`` is the (ticker symbol, price table) pair.
result = kabuka()

# Rebind the price table under the name the following cells use.
df_base = result[1]

# Report which symbol was fetched and the table's shape.
print(str(result[0]), df_base.shape)

df_base.head()

3323.T (2467, 6)


Unnamed: 0,timestamp,open,high,low,close,volume
0,2011-06-22,29.0,30.0,27.0,30.0,1095400.0
1,2011-06-23,28.299999,29.6,26.110001,28.51,332800.0
2,2011-06-24,27.51,30.5,25.719999,26.299999,169200.0
3,2011-06-27,26.040001,26.1,24.9,26.1,80500.0
4,2011-06-28,27.1,27.1,25.549999,26.4,49000.0


## 前処理：予測するためのデータ加工

今回は、open、high、low、volumeデータを説明変数とし、目的変数をclose（終値）とする。  
ただし、説明変数を1日ずらし、前日のデータから翌日の終値を予測できる形にする。

In [11]:
# Split the table into the target and the explanatory variables.
# Target variable: the close column, without its first row.
y = df_base['close'].iloc[1:]
y

1        28.510000
2        26.299999
3        26.100000
4        26.400000
5        25.350000
           ...    
2462    105.000000
2463    105.000000
2464    105.000000
2465    105.000000
2466    105.000000
Name: close, Length: 2466, dtype: float64

In [12]:
import datetime as dt

# Add the weekday name of each row's timestamp as a categorical column.
df_base['day_of_week'] = df_base['timestamp'].dt.day_name()

# One-hot encode the weekday; dummy_na=True also emits a day_of_week_nan column.
# dtype=float is pinned because the default dummy dtype depends on the pandas
# version (bool in pandas 2.x), and shifting non-float dummies would not
# reliably produce plain 0.0/1.0 float columns.
ohe_columns = ['day_of_week']
df_base = pd.get_dummies(df_base,
                         dummy_na=True,
                         columns=ohe_columns,
                         dtype=float)

# Remove timestamp and close so that only explanatory columns remain.
df_base = df_base.drop(columns=['timestamp', 'close'])

# Feature frame for the models: shift every column down one row so that row i
# holds the values of row i-1, then drop the first row, which the shift left
# empty.
X = df_base.shift().iloc[1:]
X

Unnamed: 0,open,high,low,volume,day_of_week_Friday,day_of_week_Monday,day_of_week_Thursday,day_of_week_Tuesday,day_of_week_Wednesday,day_of_week_nan
1,29.000000,30.0,27.000000,1095400.0,0.0,0.0,0.0,0.0,1.0,0.0
2,28.299999,29.6,26.110001,332800.0,0.0,0.0,1.0,0.0,0.0,0.0
3,27.510000,30.5,25.719999,169200.0,1.0,0.0,0.0,0.0,0.0,0.0
4,26.040001,26.1,24.900000,80500.0,0.0,1.0,0.0,0.0,0.0,0.0
5,27.100000,27.1,25.549999,49000.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2462,107.000000,108.0,106.000000,234300.0,1.0,0.0,0.0,0.0,0.0,0.0
2463,108.000000,108.0,105.000000,369300.0,0.0,1.0,0.0,0.0,0.0,0.0
2464,105.000000,108.0,102.000000,1337700.0,0.0,0.0,0.0,1.0,0.0,0.0
2465,105.000000,106.0,105.000000,258800.0,0.0,0.0,0.0,0.0,1.0,0.0


# 予測モデルを作成する
* 標準化する
* 複数のモデルを利用し、決定係数（R²）で予測精度を確認する

In [13]:
# 前処理用
from sklearn.preprocessing import StandardScaler

#データを切り分ける
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression,Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

#便利ツール
from sklearn.pipeline import Pipeline

#評価方法
from sklearn.metrics import r2_score

# Hold-out split of X and y: 25% of the rows go to the test set.
# NOTE(review): train_test_split presumably shuffles rows by default; the rows
# here are time-ordered, so confirm that a shuffled split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [14]:
# Print the shapes of the feature splits, a separator, then the target splits.
print('検証Xの分割')
for label, part in (('X_train data:', X_train), ('X_test data:', X_test)):
    print(label, part.shape)
print('---------')
print('検証yの分割')
for label, part in (('y_train data:', y_train), ('y_test data:', y_test)):
    print(label, part.shape)

検証Xの分割
X_train data: (1849, 10)
X_test data: (617, 10)
---------
検証yの分割
y_train data: (1849,)
y_test data: (617,)


In [15]:
# Pipeline setup: every candidate regressor is preceded by its own
# StandardScaler step ('scl'); the regressor itself is the 'est' step.
estimators = {
    'ols': LinearRegression(),
    'ridge': Ridge(random_state=0),
    'tree': DecisionTreeRegressor(random_state=0),
    'rf': RandomForestRegressor(random_state=0),
    'gbr1': GradientBoostingRegressor(random_state=0),
    'gbr2': GradientBoostingRegressor(n_estimators=200, random_state=0),
}

pipelines = {
    name: Pipeline([('scl', StandardScaler()), ('est', estimator)])
    for name, estimator in estimators.items()
}

In [16]:
# Fit every pipeline on the training split and record its R^2 score on both
# the training and the test split, keyed by (pipeline name, split name).
scores = {}
for pipe_name, pipeline in pipelines.items():
    pipeline.fit(X_train, y_train)
    for split_name, features, target in (('train', X_train, y_train),
                                         ('test', X_test, y_test)):
        predicted = pipeline.predict(features)
        scores[(pipe_name, split_name)] = r2_score(target, predicted)

# One row per pipeline, one column per split.
pd.Series(scores).unstack()

Unnamed: 0,test,train
gbr1,0.990814,0.996731
gbr2,0.990329,0.997916
ols,0.99372,0.991175
rf,0.990727,0.998562
ridge,0.993754,0.99083
tree,0.984228,1.0
