# 타슈 데이터 분석

## 공유 자전거 데이터 랜덤포레스트 반납 예측

In [None]:
import datetime

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib widget

In [None]:
# Load the raw rental-history table from a local pickle file.
# NOTE(review): unpickling can run arbitrary code; only load files from a trusted source.
df_rent = pd.read_pickle('tashu_dataset-rental_history.pkl')

In [None]:
# Peek at ten randomly chosen rental rows.
df_rent.sample(n=10)

In [None]:
# Reload the rental history and convert both timestamp columns
# (stored as 'YYYYmmddHHMMSS' values) into datetimes.
df_rent = pd.read_pickle('tashu_dataset-rental_history.pkl')
for timestamp_column in ('대여일시', '반납일시'):
    df_rent[timestamp_column] = pd.to_datetime(
        df_rent[timestamp_column], format='%Y%m%d%H%M%S'
    )
df_rent

In [None]:
# Show the dtype of every column in the rental table.
df_rent.dtypes

In [None]:
# Load the station-information table from a local pickle file and display it.
df_station = pd.read_pickle('tashu_dataset-station_information.pkl')
df_station

In [None]:
# Peek at ten randomly chosen station rows.
df_station.sample(n=10)

In [None]:
# Show the dtype of every column in the station table.
df_station.dtypes

# 대여 - 반납 분석

In [None]:
# `df` is bound to the same object as df_rent; no copy is made.
df = df_rent

In [None]:
# Rentals per origin station; count() skips rows whose return station is missing.
df_rent_by_station_count = df.groupby('대여스테이션')['반납스테이션'].count()
df_rent_by_station_count

In [None]:
# Keep only the trips that ended at the station they started from,
# then count them per origin station.
df_return_count = (
    df[df['대여스테이션'] == df['반납스테이션']]
    .groupby('대여스테이션')['반납스테이션']
    .count()
)
df_return_count

In [None]:
# Share of each station's rentals that were returned to the same station.
# df_return_count has no entry for a station without any same-station return,
# so align it to the full station list and treat the missing counts as 0;
# a plain division would yield NaN for those stations instead of 0.
df_return_ratio = (
    df_return_count.reindex(df_rent_by_station_count.index, fill_value=0)
    / df_rent_by_station_count
)
df_return_ratio

In [None]:
# Bar chart of the same-station return ratio for every station.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(x=df_return_ratio.index, height=df_return_ratio)
ax.set_xlabel('Station number', fontsize='large')
ax.set_ylabel('Return ratio', fontsize='large')
ax.set_title('Return ratio by station', fontsize='x-large')
# A ratio always lies in [0, 1], so pin the y axis to that range.
ax.set_ylim(0, 1)

In [None]:
# Print the stations with the highest same-station return ratio.
print_max = 15
# head() limits the listing to exactly print_max rows.
top_ratios = df_return_ratio.sort_values(ascending=False).head(print_max)
for station_id, ratio in top_ratios.items():
    # Station IDs that have no row in df_station get a placeholder name.
    matching_names = df_station.loc[df_station.index == station_id, '대여소명']
    if len(matching_names):
        station_name = matching_names.iloc[0]
    else:
        station_name = '공개되지 않음'
    print(f'정류장 번호: {station_id}, 정류장 이름: {station_name}, 반납 비율: {ratio}')

In [None]:
# Distribution of the per-station return ratio.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(x=df_return_ratio)
ax.set_xlabel('Return ratio', fontsize='large')
ax.set_title('Return ratio histogram', fontsize='x-large')
ax.set_ylabel('Count', fontsize='large')

# 날씨 데이터 준비

In [None]:
# Load the daily weather table and parse its 'YYYY-mm-dd' date column in one chain.
df_weather = pd.read_pickle('weather.pkl').assign(
    **{'일시': lambda frame: pd.to_datetime(frame['일시'], format='%Y-%m-%d')}
)
df_weather

In [None]:
# Attach the daily weather record to every rental by calendar date.
# dt.normalize() truncates each timestamp to midnight directly, so no round trip
# through strings is needed, and assign() puts the temporary join key on copies,
# so df_rent and df_weather are never modified.
# merge() defaults to an inner join: rentals on a date with no weather row are dropped.
df = (
    df_rent.assign(**{'연월일': df_rent['대여일시'].dt.normalize()})
    .merge(
        df_weather.assign(**{'연월일': df_weather['일시'].dt.normalize()}),
        on=['연월일'],
    )
    .rename(columns={'평균기온(°C)': '평균기온', '일강수량(mm)': '일강수량'})
)
df

In [None]:
# Daily rental counts, keyed by 'YYYY-MM-DD' strings.
# The grouping key is built from the merged frame `df` itself: an Index passed to
# groupby is matched to rows by position, so it has to come from the frame being
# grouped. A key built from df_rent only lines up if the merge kept every row in
# its original order.
rent_index = pd.DatetimeIndex(df['대여일시'])
daily_df_rent = df['대여일시'].groupby(by=rent_index.strftime('%Y-%m-%d')).count()
daily_df_rent

In [None]:
# Average of the '평균기온' column per rental date ('YYYY-MM-DD' keys from rent_index).
daily_temperature = (
    df['평균기온']
    .groupby(rent_index.strftime('%Y-%m-%d'))
    .mean()
)
daily_temperature

In [None]:
# Average of the '일강수량' column per rental date ('YYYY-MM-DD' keys from rent_index).
daily_rain = (
    df['일강수량']
    .groupby(rent_index.strftime('%Y-%m-%d'))
    .mean()
)
daily_rain

In [None]:
# Daily rental counts as bars on the left axis; temperature (line) and
# rainfall (dots) share a second y axis on the right.
fig, ax1 = plt.subplots(figsize=(8, 6))
ax1.bar(x=daily_df_rent.index, height=daily_df_rent, color='black')
ax2 = ax1.twinx()
ax2.plot(daily_temperature.index, daily_temperature, color='orange')
ax2.scatter(x=daily_rain.index, y=daily_rain, color='blue', marker='.')

In [None]:
# Same overlay as the full-range chart, with the x axis limited to
# 2020-01-01 .. 2021-12-31.
fig, ax1 = plt.subplots(figsize=(8, 6))
ax1.bar(x=daily_df_rent.index, height=daily_df_rent, color='black')
ax1.set_xlim('2020-01-01', '2021-12-31')
ax2 = ax1.twinx()
ax2.plot(daily_temperature.index, daily_temperature, color='orange')
ax2.scatter(x=daily_rain.index, y=daily_rain, color='blue', marker='.')

# 특징 데이터 정리

In [None]:
# Assemble the modelling table in one step: rental station, hour and weekday of
# the rental, member type, the two weather columns, and the boolean target
# '귀환' (True when the bike came back to the station it was rented from).
df_feature = pd.DataFrame({
    '대여스테이션': df['대여스테이션'],
    '대여시': df['대여일시'].dt.hour,
    '대여요일': df['대여일시'].dt.dayofweek,
    '회원구분': df['회원구분'],
    '평균기온': df['평균기온'],
    '일강수량': df['일강수량'],
    '귀환': df['대여스테이션'] == df['반납스테이션'],
})
df_feature

In [None]:
# Model inputs (the member-type column '회원구분' is not among them) and the boolean target.
features = ['대여스테이션', '대여시', '대여요일', '평균기온', '일강수량']
X, y = df_feature[features], df_feature['귀환']

In [None]:
# Hold out 20% of the rows for evaluation.
test_size = 0.2
# Seed the shuffle so the split, and every score computed from it, is the same
# on each run; an unseeded split gives different accuracies every time.
split_seed = 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=split_seed
)
print(f'학습에 사용할 피처 크기: {X_train.shape}')
print(f'예측에 사용할 피처 크기: {X_test.shape}')

In [None]:
# Build a random forest with default hyperparameters and a fixed seed.
random_state = 0
clf = RandomForestClassifier(random_state=random_state)

In [None]:
# Train the forest on the training split.
clf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the data the model was fitted on and on the held-out data.
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f'학습 평균 정확도: {train_accuracy}')
print(f'예측 평균 정확도: {test_accuracy}')

In [None]:
# List each input column next to the importance the fitted forest assigns to it.
for feature_name, weight in zip(features, clf.feature_importances_):
    print(f'{feature_name} = {weight}')

In [None]:
# Draw the top levels (max_depth=2) of the forest's tree at index 5.
fig, ax = plt.subplots(figsize=(8, 6))
annotations = tree.plot_tree(
    clf.estimators_[5],
    feature_names=features,
    max_depth=2,
    ax=ax,
    fontsize='medium',
)
# Switch every node label to the NanumGothic font (the feature names are Korean).
for node_label in annotations:
    node_label.set_font("nanumgothic")

### 이게 최선일까?