# 4차시: 대전광역시공용자전거(타슈)데이터 결정트리

## 2023. 06. 07. 14:10 ~ 16:00 (50분×2)
1. 결정트리 기초 이론 (Decision Tree)
1. Fisher's Iris data set
1. Iris decision tree
1. 대전광역시 타슈 결정트리

### 참고자료
- [파이썬 3 표준 문서](https://docs.python.org/3/index.html)
- [대전광역시 시설관리공단](https://www.djsiseol.or.kr/)
- [공공데이터포털](https://www.data.go.kr/)
- [기상청 기상자료개방포털](https://data.kma.go.kr/)
- [scikit-learn Decision Trees](https://scikit-learn.org/stable/modules/tree.html)

In [None]:
import datetime

import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.model_selection import train_test_split

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

### Iris data decision tree

In [None]:
# Load the Iris data set bundled with scikit-learn and print its description.
iris = load_iris()
print(iris['DESCR'])

In [None]:
# Wrap the raw measurement matrix in a DataFrame with named feature columns.
df = pd.DataFrame(
    data=iris['data'],
    columns=iris['feature_names'],
)
df

In [None]:
# Turn the integer class codes into a categorical column labelled with the
# species names, then attach it to the feature table.
sy = (
    pd.Series(iris.target, dtype='category')
    .cat.rename_categories(iris.target_names)
)
df['species'] = sy
df

In [None]:
# Pairwise scatter plots of the feature columns, coloured by class code.
scatter_matrix(
    df[iris.feature_names],
    figsize=(7, 7),
    alpha=0.5,
    c=iris.target,
)

In [None]:
# Features are the measurement columns; the target is the species label.
X, y = df[iris.feature_names], df['species']

In [None]:
# Fit a decision tree on the full Iris data (fit() returns the estimator itself).
clf = tree.DecisionTreeClassifier().fit(X, y)

In [None]:
# Draw the fitted Iris tree at full depth.
# iris.target_names is a NumPy array; some scikit-learn releases (e.g. 1.3.0)
# accept only a plain list for class_names/feature_names, so both are
# converted explicitly to keep the call portable across versions.
fig = plt.figure(figsize=(16, 12))
ax = fig.add_subplot()
_ = tree.plot_tree(clf,
                   class_names=list(iris.target_names),
                   feature_names=list(iris.feature_names),
                   max_depth=None, filled=True, ax=ax, fontsize='medium')

In [None]:
# Text rendering of the same tree, one rule per line.
tree_rules = tree.export_text(clf, feature_names=iris.feature_names)
print(tree_rules)

### 타슈 데이터셋 결정트리

In [None]:
# Keep the pickle files in the same location as this code.
# On Colab, upload the .pkl files first (`.../content/`).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_rent = pd.read_pickle('tashu_dataset-rental_history_2021.pkl')
# Parse both timestamp columns from the compact YYYYmmddHHMMSS layout.
for time_column in ('대여일시', '반납일시'):
    df_rent[time_column] = pd.to_datetime(df_rent[time_column], format='%Y%m%d%H%M%S')
df_rent

In [None]:
# Load the station table; a later cell looks up '대여소명' (station name) by index.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df_station = pd.read_pickle('tashu_dataset-station_information.pkl')
df_station

In [None]:
# Work on the rental history under the shorter name `df`.
df = df_rent
# Per rental station: number of rentals that have a recorded return station.
df_rent_by_station_count = df.groupby('대여스테이션')['반납스테이션'].count()
df_rent_by_station_count

In [None]:
# Per rental station: number of rentals returned to the station they started from.
is_round_trip = df['대여스테이션'] == df['반납스테이션']
df_return_count = (
    df.loc[is_round_trip, '반납스테이션']
    .groupby(by=df['대여스테이션'])
    .count()
)
df_return_count

In [None]:
# Share of each station's rentals that were returned to the same station.
# Stations with no same-station return never appear in df_return_count, so a
# plain division would give NaN for them; reindexing onto every rental station
# with fill_value=0 makes their ratio 0 instead.
df_return_ratio = (
    df_return_count.reindex(df_rent_by_station_count.index, fill_value=0)
    / df_rent_by_station_count
)
df_return_ratio

In [None]:
# Bar chart of the return ratio per station, with the y-axis fixed to [0, 1].
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x=df_return_ratio.index, height=df_return_ratio)
ax.set_xlabel('Station number', fontsize='large')
ax.set_ylabel('Return ratio', fontsize='large')
ax.set_title('Return ratio by station', fontsize='x-large')
ax.set_ylim(0, 1)

In [None]:
# Print the stations with the highest return ratio, best first.
print_max = 30
# head() caps the listing at exactly print_max rows; a manual counter that is
# checked after the increment lets one extra row through.
top_return_ratio = df_return_ratio.sort_values(ascending=False).head(print_max)
for index, row in top_return_ratio.items():
    matching_names = df_station.loc[df_station.index == index, '대여소명'].values
    # A station seen in the rental history may be absent from the station
    # table; print a placeholder instead of failing with IndexError.
    station_name = matching_names[0] if len(matching_names) else '(알 수 없음)'
    print(f'정류장 번호: {index}, 정류장 이름: {station_name}, 반납 비율: {row}')

In [None]:
# Distribution of the per-station return ratio.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(x=df_return_ratio)
ax.set_title('Return ratio histogram', fontsize='x-large')
ax.set_xlabel('Return ratio', fontsize='large')
ax.set_ylabel('Count', fontsize='large')

In [None]:
# Feature table: rental station, hour of day and weekday of the rental, plus a
# boolean target telling whether the bike came back to the same station.
rented_at = df['대여일시'].dt
df_feature = pd.DataFrame({
    '대여스테이션': df['대여스테이션'],
    '대여시': rented_at.hour,
    '대여요일': rented_at.dayofweek,
    '귀환': df['대여스테이션'] == df['반납스테이션'],
})
df_feature

In [None]:
# Model inputs are the three rental attributes; the target is the return flag.
features = ['대여스테이션', '대여시', '대여요일']
X, y = df_feature[features], df_feature['귀환']

In [None]:
# Hold out 20% of the rentals for evaluation.
# A fixed random_state makes the shuffle, and therefore the scores reported
# in the following cells, reproducible from run to run.
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42)
print(f'학습에 사용할 피처 크기: {X_train.shape}')
print(f'예측에 사용할 피처 크기: {X_test.shape}')

In [None]:
# Train a tree with default settings (no depth limit) on the training split.
clf = tree.DecisionTreeClassifier()
clf.fit(X=X_train, y=y_train)

In [None]:
# Accuracy on the data the tree was fitted on versus the held-out split.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f'학습 정확도: {train_accuracy}')
print(f'예측 정확도: {test_accuracy}')

In [None]:
# List each feature next to the importance the fitted tree assigned to it.
importances = clf.feature_importances_
for feature_name, weight in zip(features, importances):
    print(f'{feature_name} = {weight}')

In [None]:
# Draw the tree, truncated to depth 10, using ASCII labels for the features.
fig, ax = plt.subplots(figsize=(16, 12))
_ = tree.plot_tree(
    clf,
    feature_names=['RentStation', 'RentHour', 'RentWeekday'],
    max_depth=10,
    filled=True,
    fontsize='medium',
    ax=ax,
)

In [None]:
# Retrain with the tree limited to depth 5.
clf = tree.DecisionTreeClassifier(max_depth=5)
clf.fit(X=X_train, y=y_train)

In [None]:
# Mean accuracy of the depth-limited tree on the training and held-out splits.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f'학습 평균 정확도: {train_accuracy}')
print(f'예측 평균 정확도: {test_accuracy}')

In [None]:
# List each feature next to the importance assigned by the depth-limited tree.
importances = clf.feature_importances_
for feature_name, weight in zip(features, importances):
    print(f'{feature_name} = {weight}')

In [None]:
# Draw the whole depth-limited tree, using ASCII labels for the features.
fig, ax = plt.subplots(figsize=(16, 12))
_ = tree.plot_tree(
    clf,
    feature_names=['RentStation', 'RentHour', 'RentWeekday'],
    max_depth=None,
    filled=True,
    fontsize='medium',
    ax=ax,
)