<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_01.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Locations of the two CSV files that make up the dataset
# (file names suggest months 01-07 and 08-12 -- TODO confirm).
url1 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/highway_01_07.csv'
url2 = 'https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/highway_08_12.csv'

# Read each file into its own DataFrame, first file first.
data_1st, data_2nd = (pd.read_csv(csv_url) for csv_url in (url1, url2))

def conv(x):
    """Translate a Korean region name into its romanized English name.

    The recognized inputs are '경기', '충청', '전라', '경상' and '강원'.
    Any other value yields the sentinel string 'ERROR'.
    """
    translations = (
        ('경기', 'Gyeonggi'),
        ('충청', 'Chungcheong'),
        ('전라', 'Jeolla'),
        ('경상', 'Gyeongsang'),
        ('강원', 'Gangwon'),
    )
    # Compare with == in the listed order; fall back to the sentinel when
    # none of the known names matches.
    return next((english for korean, english in translations if x == korean),
                'ERROR')

# Translate the Korean StartPoint names of the second file with conv().
# Series.map calls conv once per value, which is all the previous row-wise
# DataFrame.apply(axis=1) did. (Presumably the first file already carries the
# English names -- TODO confirm.)
data_2nd['StartPoint'] = data_2nd['StartPoint'].map(conv)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames. As with append's defaults, the original row labels
# are kept, so index labels may repeat in `data`.
data = pd.concat([data_1st, data_2nd])

In [0]:
# [1]

# Keep only the rows whose StartPoint is Gangwon.
is_gangwon_start = data['StartPoint'] == 'Gangwon'
data1 = data[is_gangwon_start]

# Pool the values of the five region columns into one long Series.
region_columns = ['Gyeonggi', 'Chungcheong', 'Jeolla', 'Gyeongsang', 'Gangwon']
data1_1 = pd.concat([data1[region] for region in region_columns],
                    ignore_index=True)

# Quartiles of the pooled values, indexed by the requested probabilities.
quantile = data1_1.quantile(q=[0.25, 0.5, 0.75])

answer_1 = (f'Answer [1] : q1 = {quantile[0.25]:.0f}, '
            f'median = {quantile[0.50]:.0f}, '
            f'q3 = {quantile[0.75]:.0f}')
print(answer_1)

Answer [1] : q1 = 988, median = 5455, q3 = 54882


In [0]:
# [2]

# Import the public scipy.stats namespace. The former
# `from scipy.stats import stats` reached into scipy.stats.stats, a deprecated
# private module; ttest_ind is available under the same name here.
from scipy import stats

# Sample 1: 'Gyeongsang' column of the rows that start in Gyeonggi.
# Sample 2: 'Gyeonggi' column of the rows that start in Gyeongsang.
G_S = data.loc[data['StartPoint'] == 'Gyeonggi', 'Gyeongsang']
S_G = data.loc[data['StartPoint'] == 'Gyeongsang', 'Gyeonggi']

# Independent two-sample t-test on the two samples.
t_value, p_value = stats.ttest_ind(G_S, S_G)

# The null hypothesis is rejected when the p-value is below 0.05.
reject = 'YES' if p_value < 0.05 else 'NO'

print(f'Answer[2] : P-value = {p_value:.4f}, Reject = {reject}')

Answer[2] : P-value = 0.0013, Reject = YES


In [0]:
# [3]

import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
import numpy as np

# Rows that start in Chungcheong, reduced to the date and the Gangwon column.
data3 = data.loc[data['StartPoint'] == 'Chungcheong', ['date', 'Gangwon']]

# Parse the numeric YYYYMMDD dates in one vectorized pass instead of calling
# strptime/strftime once per row through DataFrame.apply(axis=1).
date_text = data3['date'].astype(int).astype(str)
parsed_date = pd.to_datetime(date_text, format='%Y%m%d')
data3 = data3.assign(
    strdate=date_text,
    conv_date=parsed_date,
    # %w numbering: 0 = Sunday, 1 = Monday, ..., 6 = Saturday.
    weekday=parsed_date.dt.strftime('%w').astype(int),
    month=parsed_date.dt.month,
)

# Mean Gangwon value per (weekday, month). as_index=False returns the group
# keys as ordinary columns, so no in-place reset_index is needed afterwards.
data3 = data3.groupby(['weekday', 'month'], as_index=False).agg({'Gangwon': 'mean'})

# Min-max scale the mean Gangwon values separately within each weekday.
# Iterating over the weekdays that actually occur (instead of a fixed
# range(7)) avoids passing an empty array to MinMaxScaler, which it rejects,
# when some weekday has no rows. Groups come back sorted by weekday, so the
# order of the pieces is unchanged.
norm_data = []
for _, weekday_rows in data3.groupby('weekday'):
    # Double brackets keep a 2-D (n_rows, 1) array, the shape the scaler expects.
    scaled = MinMaxScaler().fit_transform(weekday_rows[['Gangwon']].to_numpy())
    norm_data.append(pd.DataFrame(scaled, index=weekday_rows.index,
                                  columns=['norm_traffic']))

# Stitch the per-weekday pieces together and attach them to data3 by row label.
norm_data_df = pd.concat(objs=norm_data)
data3 = pd.merge(left=data3, right=norm_data_df, left_index=True, right_index=True)

# Mean normalized traffic per month; this one-column frame (indexed by month)
# is the clustering input.
data3_month = data3.groupby('month').agg({'norm_traffic': 'mean'})

cluster = KMeans(n_clusters=3, n_init=3, max_iter=100, random_state=1234)
pred = cluster.fit(data3_month)
km_result = pred.predict(data3_month)  # one cluster id per row of data3_month

# Turn the month index into a column. Row order is untouched, so row i of
# data3_month still corresponds to km_result[i].
data3_month = data3_month.reset_index()

# finding October's cluster
oct_index = data3_month[data3_month['month'] == 10].index.values[0]
A_group = cluster.labels_[oct_index]

# counting number of months in the October's cluster
in_a_group = km_result == A_group
no_months = int(in_a_group.sum())

# finding months in the October's cluster: read the month numbers from the
# month column rather than assuming "row position + 1 == month", which only
# holds when every month from 1 to 12 is present.
a_group_month = data3_month.loc[in_a_group, 'month'].to_numpy()

# Rows of those months with weekday == 1; every selected row shares that
# weekday, so the plain mean equals the former single-group groupby mean.
a_monday = data3[(data3['month'].isin(a_group_month)) & (data3['weekday'] == 1)]
ans = a_monday['norm_traffic'].mean()

print(f'Answer [3] : No of months = {no_months:.0f}, Mean value = {ans:.2f}')

Answer [3] : No of months = 3, Mean value = 0.75


In [0]:
# [4]

import datetime
from sklearn.linear_model import LinearRegression

# Rows that start in Gyeonggi.
data5 = data[data['StartPoint'] == 'Gyeonggi']

# Parse the numeric YYYYMMDD dates in one vectorized pass instead of calling
# strptime/strftime once per row through DataFrame.apply(axis=1).
date_text = data5['date'].astype(int).astype(str)
parsed_date = pd.to_datetime(date_text, format='%Y%m%d')
data5 = data5.assign(
    strdate=date_text,
    conv_date=parsed_date,
    month=parsed_date.dt.month,
    # %U: week of the year with Sunday as the first day of the week.
    week=parsed_date.dt.strftime('%U').astype(int),
    # %w: 0 = Sunday, 1 = Monday, ..., 6 = Saturday.
    weekday=parsed_date.dt.strftime('%w').astype(int),
)

def yesterdayDepart(x):
    """Return the 'Jeolla' value that ``data5`` holds for the day before ``x``.

    Parameters
    ----------
    x : datetime-like
        A date comparable with the values in ``data5['conv_date']``.

    Returns
    -------
    The 'Jeolla' value of the first ``data5`` row dated exactly one calendar
    day before ``x``, or 0 when ``data5`` has no row for that day.
    """
    # Step back one calendar day directly. The earlier week-number/weekday
    # arithmetic could pick a row from a different year that shares the same
    # (week, weekday) pair, and it scanned data5 three times per call.
    previous_day = x - pd.Timedelta(days=1)
    traffic = data5.loc[data5['conv_date'] == previous_day, 'Jeolla']

    # No row for the previous day (e.g. the earliest date in the data): use 0.
    return 0 if traffic.empty else traffic.iloc[0]
    
# Add, for every row, the value yesterdayDepart() returns for its conv_date.
yesterday_values = [yesterdayDepart(day) for day in data5['conv_date']]
data5 = data5.assign(yesterday=yesterday_values)

# Predictor columns shared by the training and the test matrices.
feature_columns = ['Chungcheong', 'Gyeongsang', 'Gangwon', 'yesterday']

# Training rows: months 1 through 6, weekday 0 (Sunday in %w numbering).
in_first_half = data5['month'].between(1, 6)
on_weekday_zero = data5['weekday'] == 0
train = data5[in_first_half & on_weekday_zero]
train_X = train[feature_columns]
train_y = train['Jeolla']

# Test rows: the three dates to predict.
test = data5[data5['date'].isin([20140706, 20140713, 20140720])]
test_X = test[feature_columns]

# Ordinary least-squares fit on the training rows, then predict the test rows.
lr = LinearRegression()
model = lr.fit(train_X, train_y)
pred = model.predict(test_X)

print(f'Answer [4] : {pred[0]:.1f}, {pred[1]:.1f}, {pred[2]:.1f}')

Answer [4] : 2962.8, 3161.4, 3027.5
