In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Raw CSV locations. Judging by the file names the data is split into
# months 01-07 and 08-12 (TODO confirm against the files themselves).
url_1 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/highway_01_07.csv'
url_2 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/highway_08_12.csv'

# Read both parts with pandas' default parsing options, first url_1 then url_2.
data_1, data_2 = (pd.read_csv(src) for src in (url_1, url_2))

### [1]

In [3]:
def convert(kr):
    """Normalise a region name to its romanised spelling.

    Accepts either the Korean name or the already-romanised name of one of
    the five known regions and returns the romanised name. Any other value
    yields the sentinel string 'ERROR'.
    """
    known_regions = (
        ('강원', 'Gangwon'),
        ('경기', 'Gyeonggi'),
        ('경상', 'Gyeongsang'),
        ('전라', 'Jeolla'),
        ('충청', 'Chungcheong'),
    )
    for korean, romanised in known_regions:
        # Accepting the romanised form too makes the function a no-op on
        # values that were already converted.
        if kr == korean or kr == romanised:
            return romanised
    return 'ERROR'

# Rewrite the StartPoint column of the second file through convert().
data_2 = data_2.assign(StartPoint=data_2['StartPoint'].map(convert))

In [4]:
# Stack the two files into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each file keeps its own row labels, so the
# combined frame would carry duplicate labels.
d = pd.concat([data_1, data_2], axis=0, ignore_index=True)

In [5]:
# Keep the rows whose StartPoint is Gangwon, then stack the five destination
# columns end to end into a single Series.
d1 = d.loc[d.StartPoint == 'Gangwon']
d1 = pd.concat(
    [d1[col] for col in ('Gyeonggi', 'Chungcheong', 'Jeolla', 'Gyeongsang', 'Gangwon')],
    axis=0,
)

In [6]:
# First quartile, median and third quartile of the stacked values.
q1, med, q3 = d1.quantile([0.25, 0.5, 0.75]).tolist()

In [7]:
# Report the three quartile values, each formatted with zero decimal places.
print(f"{q1:.0f}, {med:.0f}, {q3:.0f}")

988, 5455, 54882


### [2]

In [8]:
# d2_gs: the Gyeongsang column for rows starting in Gyeonggi.
# d2_sg: the Gyeonggi column for rows starting in Gyeongsang.
d2_gs = d.loc[d.StartPoint == 'Gyeonggi', 'Gyeongsang']
d2_sg = d.loc[d.StartPoint == 'Gyeongsang', 'Gyeonggi']

In [9]:
from scipy.stats import ttest_ind

# Independent two-sample t-test on the two direction samples.
# equal_var=True is scipy's default and is spelled out here only to make the
# pooled-variance assumption visible.
t_val, p_val = ttest_ind(d2_gs, d2_sg, equal_var=True)

In [10]:
# Report the t-test p-value to four decimal places.
print(f"{p_val:.4f}")
# NOTE(review): the verdict below is a hard-coded literal; it is not derived
# from p_val, so it will not change if the data or the test result changes.
print(f"YES")

0.0013
YES


### [3]

In [11]:
# Rows starting in Chungcheong, keeping only the date and the Gangwon column.
d3 = d.loc[d.StartPoint == 'Chungcheong', ['date', 'Gangwon']]

In [12]:
# Parse the YYYYMMDD-formatted date column into datetimes.
d3 = d3.assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d'))

In [13]:
# Add calendar features: month number and day of week (pandas: Monday=0).
d3 = d3.assign(
    month=d3['date'].dt.month,
    dow=d3['date'].dt.dayofweek,
)

In [14]:
# Mean Gangwon value for every (day-of-week, month) combination.
d3 = d3.groupby(['dow', 'month'])[['Gangwon']].mean().reset_index()

In [15]:
# Per day-of-week extremes of the monthly means, used for min-max scaling.
dowMin = d3.groupby('dow')[['Gangwon']].min().reset_index()
dowMax = d3.groupby('dow')[['Gangwon']].max().reset_index()

In [16]:
# Attach the per-dow minimum and maximum to every row. The first merge has a
# 'Gangwon' column on both sides, so pandas suffixes them as Gangwon_x /
# Gangwon_y; the second merge then brings in dowMax's column as plain 'Gangwon'.
d3 = (
    d3
    .merge(dowMin, how='inner', on='dow')
    .merge(dowMax, how='inner', on='dow')
)

In [17]:
# Give the merge-generated columns meaningful names:
# Gangwon_x -> Dist, Gangwon_y -> Min, Gangwon -> Max.
d3 = d3.rename(columns=dict(Gangwon_x='Dist', Gangwon_y='Min', Gangwon='Max'))

In [18]:
# Min-max scale each value within its day of week: (Dist - Min) / (Max - Min).
d3 = d3.assign(
    norm=lambda frame: (frame['Dist'] - frame['Min']) / (frame['Max'] - frame['Min'])
)

In [19]:
# Average the scaled values per month (result is indexed by month).
d3_month = d3.groupby('month')[['norm']].mean()

In [20]:
from sklearn.cluster import KMeans

# Cluster the per-month mean of the scaled values into three groups.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto', so leaving it implicit can yield different cluster assignments
# depending on the installed version, even with a fixed random_state.
cluster = KMeans(n_clusters=3, n_init=10, random_state=1234, max_iter=100).fit(d3_month)

In [21]:
# Cluster label assigned to October. The row is located by its month label
# (10) in d3_month's index instead of the hard-coded position 9, which is only
# correct when every earlier month is present; a missing October now raises
# KeyError instead of silently picking another month.
oct_cl = cluster.labels_[d3_month.index.get_loc(10)]

In [22]:
# Row positions (0-based) of every month that shares oct_cl's cluster label,
# and how many such months there are.
idx_A = np.flatnonzero(cluster.labels_ == oct_cl).tolist()
no_of_A = len(idx_A)

In [23]:
# Report how many months carry the same cluster label as oct_cl.
print(f"{no_of_A}")

3


In [24]:
# Translate the row positions in idx_A into month numbers by reading the
# labels from d3_month's index. The previous `position + 1` arithmetic was only
# right when the index is exactly 1..12 with no gaps.
month_A = list(d3_month.index[idx_A])

In [25]:
# Mean scaled value over the dow == 0 rows (pandas: Monday) of the months in
# month_A, printed with two decimals.
print(f"{d3.loc[(d3.dow == 0) & d3.month.isin(month_A), 'norm'].mean():.2f}")

0.75


### [4]

In [26]:
# Work on a copy of the combined data with the YYYYMMDD date column parsed.
d4 = d.assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y%m%d'))

In [27]:
# Add day of week (pandas: Monday=0) and ISO week number, then keep only the
# rows that start in Gyeonggi.
d4 = d4.assign(
    dow=d4['date'].dt.dayofweek,
    woy=d4['date'].dt.isocalendar().week,
)
d4 = d4.loc[d4.StartPoint == 'Gyeonggi']

In [28]:
# StartPoint is now constant and the Gyeonggi column is not used below.
d4 = d4.drop(['StartPoint', 'Gyeonggi'], axis=1)

In [29]:
def get_yesterday(woy):
    """Return the Jeolla value of the dow == 5 row for the given ISO week.

    Looks the value up in the module-level frame ``d4``. With pandas'
    Monday=0 convention dow 5 is Saturday, i.e. the day before a dow == 6
    (Sunday) row of the same ISO week.

    Weeks numbered 30 or higher are not looked up and yield None.
    If several rows match, the first one is used; if none match, the
    positional lookup raises IndexError.
    """
    if not woy < 30:
        return None
    saturday_rows = d4[(d4.dow == 5) & (d4.woy == woy)]
    return saturday_rows.Jeolla.iloc[0]

# YJL: for every row, the value get_yesterday() returns for that row's ISO week.
d4 = d4.assign(YJL=d4['woy'].apply(get_yesterday))

In [30]:
# Keep only the dow == 6 rows (pandas: Sunday).
d4 = d4.loc[d4['dow'].eq(6)]

In [31]:
from sklearn.linear_model import LinearRegression

# Predictors: three destination columns plus the previous-day Jeolla value.
X_var = ['Chungcheong', 'Gyeongsang', 'Gangwon', 'YJL']

# Training window: every remaining row dated on or before 2014-06-30.
d4_train = d4.loc[d4['date'] <= '2014-06-30']

# Ordinary least squares with Jeolla as the target.
model = LinearRegression()
model = model.fit(d4_train[X_var], d4_train['Jeolla'])

In [32]:
# Hold-out rows: the three dates to predict.
d4_test = d4[d4.date.isin(pd.to_datetime(['2014-07-06', '2014-07-13', '2014-07-20']))]

In [33]:
# Predict Jeolla for the test rows and round each estimate to one decimal.
pred = pd.DataFrame({'esti': model.predict(d4_test[X_var])})
pred = pred.assign(esti=pred['esti'].apply(lambda value: round(value, 1)))

In [34]:
# Display the rounded estimates, one row per d4_test row in the same order.
pred

Unnamed: 0,esti
0,2962.8
1,3161.4
2,3027.5
