<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_01_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Locations of the two CSV parts of the highway traffic data set
# (going by the file names: months 01-07 and months 08-12).
url_1 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/highway_01_07.csv'
url_2 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/highway_08_12.csv'

# Download and parse both parts with pandas' default CSV settings.
data_1, data_2 = (pd.read_csv(url) for url in (url_1, url_2))

# change Korean StartPoint to English StartPoint
def changeLang(x):
    """Translate a Korean region name into its English spelling.

    Parameters
    ----------
    x : object
        Value to translate; compared with ``==`` against the known
        Korean region names.

    Returns
    -------
    str
        The English region name, or ``'ERROR'`` when ``x`` matches none of
        the five known regions.
    """
    known_regions = (
        ('경기', 'Gyeonggi'),
        ('충청', 'Chungcheong'),
        ('전라', 'Jeolla'),
        ('경상', 'Gyeongsang'),
        ('강원', 'Gangwon'),
    )
    # First matching pair wins; anything unrecognised maps to 'ERROR'.
    return next((english for korean, english in known_regions if x == korean),
                'ERROR')

# Replace the Korean StartPoint labels of the second file with their English
# names (values changeLang does not recognise become 'ERROR').
data_2 = data_2.assign(StartPoint=data_2['StartPoint'].map(changeLang))

# Stack both parts row-wise into one dataframe (data_1 rows first).
# The original row labels of each part are kept as they are.
data = pd.concat([data_1, data_2], axis=0)


In [123]:
# [1] quantile

# Every column after the first two is treated as a destination column.
dest = data.columns[2:]

# Keep only the rows whose trip starts in Gangwon.
from_GW = data[data['StartPoint'] == 'Gangwon']

# Pool the traffic of all destination columns into one long Series
# (columns are appended one after another, in column order).
traffic = pd.concat([from_GW[column] for column in dest], axis=0)

# Quartiles of the pooled traffic values.
q = traffic.quantile([0.25, 0.5, 0.75])
print(f'Answer [1] : q1 = {q[0.25]:.0f}, meadian = {q[0.5]:.0f}, q3 = {q[0.75]:.0f}')

Answer [1] : q1 = 988, meadian = 5455, q3 = 54882


In [124]:
# [2] ttest_ind

from scipy import stats

# Significance level used to decide whether H0 is rejected.
ALPHA = 0.05

# Traffic in the two opposite directions between Gyeonggi and Gyeongsang.
GG_GS = data[data['StartPoint'] == 'Gyeonggi']['Gyeongsang']
GS_GG = data[data['StartPoint'] == 'Gyeongsang']['Gyeonggi']

# Two-sample t-test for independent samples (scipy defaults).
# H0: both directions have the same mean traffic.
t_val, p_val = stats.ttest_ind(GG_GS, GS_GG)

# Reject H0 only when the p-value is below the significance level.
reject = 'YES' if p_val < ALPHA else 'NO'

print(f'Answer [2] : P-value = {p_val:.4f}, Reject H0 : {reject}')

Answer [2] : P-value = 0.0013, Reject H0 : YES


In [125]:
# [3] datetime, KMeans,
#     Array(unique values and counts, index with certain value, zfill)

import datetime  # not needed in this cell; kept because other cells may rely on it
from sklearn.cluster import KMeans
import numpy as np

# Traffic from Chungcheong to Gangwon, together with its date.
CC_GW = data[data['StartPoint'] == 'Chungcheong'][['date', 'Gangwon']]

# 'date' holds YYYYMMDD numbers. Cast to int before str so that a float
# column does not render as 'YYYYMMDD.0', then parse in one vectorised call.
CC_GW = CC_GW.assign(
    dt_date=pd.to_datetime(CC_GW['date'].astype('int64').astype(str),
                           format='%Y%m%d'))

# Zero-padded month ('01'..'12') and day of week as strings ('0' = Sunday).
CC_GW = CC_GW.assign(month=CC_GW['dt_date'].dt.strftime('%m'),
                     wd=CC_GW['dt_date'].dt.strftime('%w'))

# Mean traffic for every (day of week, month) combination.
wd_m = CC_GW.groupby(['wd', 'month']).agg({'Gangwon': 'mean'}).reset_index()

# Smallest and largest monthly mean for each day of week.
by_wd = wd_m.groupby('wd')['Gangwon']
wd_min = by_wd.min().to_frame('min')
wd_max = by_wd.max().to_frame('max')
wd_minmax = pd.concat([wd_min, wd_max], axis=1).reset_index()

# Attach each weekday's min/max to its rows.
wd_m = pd.merge(wd_m, wd_minmax, how='inner', on='wd')

# Min-max normalise within each day of week.
# (A weekday whose monthly means are all identical yields 0/0 = NaN here.)
wd_m = wd_m.assign(norm_gw=
                   (wd_m['Gangwon'] - wd_m['min']) / (wd_m['max'] - wd_m['min']))

# Average normalised traffic per month; the index holds the month strings.
month_ave = wd_m.groupby('month').agg({'norm_gw': 'mean'})

# Cluster the months into three groups.
cluster = KMeans(n_clusters=3, n_init=3, max_iter=100, random_state=1234)
pred = cluster.fit(month_ave)

# A_group is October's cluster. October is located through the index label
# instead of a fixed position, so the lookup stays correct even when some
# month is absent from the data (and fails loudly if October itself is).
A_group = pred.labels_[month_ave.index.get_loc('10')]

# Number of months assigned to each cluster label.
labels, counts = np.unique(pred.labels_, return_counts=True)
cnt = pd.DataFrame(data=counts, index=labels, columns=['counts'])

# Size of October's cluster.
A_count = cnt.loc[A_group, 'counts']

# Months in October's cluster, read from the index for the same reason:
# as zero-padded strings ('08') and as integers (8).
A_months_str = list(month_ave.index[pred.labels_ == A_group])
A_months = np.array([int(m) for m in A_months_str])

# Keep the A_group months, Mondays only (wd == '1').
wd_m_filter = wd_m[(wd_m['month'].isin(A_months_str)) & (wd_m['wd'] == '1')]

# Average normalised Monday traffic over the A_group months.
ave = wd_m_filter['norm_gw'].mean()

print(f'Answer [3] : No of A group = {A_count}, Average = {ave:.2f}')

Answer [3] : No of A group = 3, Average = 0.75


In [126]:
# [4] LinearRegression, datetime

from sklearn.linear_model import LinearRegression


def _sunday_features(frame):
    """Build the Sunday regression table from daily rows of a single origin.

    Every Sunday row is paired with the Saturday immediately before it by
    calendar date (Saturday + 1 day == Sunday) rather than by row position,
    so a missing or out-of-order day cannot shift the pairing. Sundays
    without a matching Saturday in ``frame`` are dropped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Needs the columns ``dt_date`` (datetime), ``wd`` (0 = Sunday,
        6 = Saturday), ``Chungcheong``, ``Gyeongsang``, ``Gangwon`` and
        ``Jeolla``.

    Returns
    -------
    (X, y) : tuple of pandas.DataFrame
        ``X`` has the Sunday ``Chungcheong``, ``Gyeongsang`` and ``Gangwon``
        traffic plus the preceding Saturday's ``Jeolla`` traffic (column
        name ``Jeolla``); ``y`` has the Sunday ``Jeolla`` traffic.
    """
    sunday = frame.loc[frame['wd'] == 0,
                       ['dt_date', 'Chungcheong', 'Gyeongsang', 'Gangwon',
                        'Jeolla']]
    saturday = frame.loc[frame['wd'] == 6, ['dt_date', 'Jeolla']]
    # Move each Saturday onto the following Sunday's date so both sides share
    # a merge key.
    saturday = saturday.assign(
        dt_date=saturday['dt_date'] + pd.Timedelta(days=1)
    ).rename(columns={'Jeolla': 'Jeolla_sat'})

    # An inner merge keeps the Sunday row order.
    paired = sunday.merge(saturday, on='dt_date', how='inner')

    X = paired[['Chungcheong', 'Gyeongsang', 'Gangwon', 'Jeolla_sat']]
    X = X.rename(columns={'Jeolla_sat': 'Jeolla'})
    y = paired[['Jeolla']]
    return X, y


# Parse the YYYYMMDD numbers in 'date' (int cast first, so a float column
# does not render as 'YYYYMMDD.0').
# NOTE: `data` gains the columns dt_date / wd / week here; code that selects
# columns by position (e.g. data.columns[2:]) will see them if it is re-run
# after this cell.
data = data.assign(
    dt_date=pd.to_datetime(data['date'].astype('int64').astype(str),
                           format='%Y%m%d'))

# Day of week (0 = Sunday) and week number of the year (weeks start on
# Sunday). 'week' is not used below; it is kept so `data` has the same
# columns as before.
data = data.assign(wd=data['dt_date'].dt.strftime('%w').astype(int),
                   week=data['dt_date'].dt.strftime('%U').astype(int))

# Only trips starting in Gyeonggi are modelled.
gyeonggi = data[data['StartPoint'] == 'Gyeonggi']

# make train set: 2014-01-01 (inclusive) to 2014-07-01 (exclusive)
train = gyeonggi[(gyeonggi['dt_date'] >= pd.Timestamp('2014-01-01')) &
                 (gyeonggi['dt_date'] < pd.Timestamp('2014-07-01'))]
train_X, train_y = _sunday_features(train)

# make test set: three Saturday/Sunday pairs in July 2014
test_days = pd.to_datetime(['2014-07-05', '2014-07-06',
                            '2014-07-12', '2014-07-13',
                            '2014-07-19', '2014-07-20'])
test = gyeonggi[gyeonggi['dt_date'].isin(test_days)]
test_X, test_y = _sunday_features(test)

# train
lr = LinearRegression()
model = lr.fit(train_X, train_y)

# predict (2-D result: one row per test Sunday, one column for Jeolla)
pred = model.predict(test_X)

print(f'Answer [4] : {pred[0][0]:.1f}, {pred[1][0]:.1f}, {pred[2][0]:.1f}')

Answer [4] : 2962.8, 3161.4, 3027.5
