In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# junseo - data visualization library
import plotly.graph_objs as go
import plotly.express as px

In [3]:
# Global matplotlib text settings, applied before any figure is drawn.
# Keep the ASCII hyphen for negative tick labels instead of the Unicode minus sign.
plt.rc("axes", unicode_minus=False)
# Draw all figure text with the "Malgun Gothic" family
# (presumably chosen so the Korean titles/labels used later render - confirm the
# font is installed on the machine running the notebook).
plt.rc("font", family="Malgun Gothic")

In [None]:
# Project-local loader module; `jt` is only used here to fetch the two datasets.
import jeju_traffic_data as jt

# `jt.load()` yields two objects that are unpacked as `train` and `test`.
# Later cells call `.shape`, `.isnull()`, `.describe()` and column indexing on them,
# so they are presumably pandas DataFrames - confirm against the loader.
train, test = jt.load()

In [6]:
# junseo - data information


def print_missing_summary(frame, label):
    """Print the per-column and total missing-value counts of ``frame``.

    Args:
        frame: DataFrame to inspect.
        label: Dataset name inserted into the "Missing <label> Data" header.
    """
    # Scan the frame for nulls once and reuse the per-column counts for the total,
    # instead of calling isnull().sum() a second time on the whole frame.
    missing_per_column = frame.isnull().sum()
    print(f"Missing {label} Data")
    print(missing_per_column)
    # Vectorised Series.sum() rather than the built-in sum() iterating the Series.
    print("Total :", missing_per_column.sum())


# data size
print("Train Data Size :", train.shape)
print("Test Data Size :", test.shape)

# missing data
print_missing_summary(train, "Train")
print()
print_missing_summary(test, "Test")

# data central tendency
print(train.describe())

Train Data Size : (4426519, 39)
Test Data Size : (274695, 39)
Missing Train Data
id                       0
base_date                0
day_of_week              0
base_hour                0
lane_count               0
road_rating              0
road_name                0
multi_linked             0
connect_code             0
maximum_speed_limit      0
vehicle_restricted       0
weight_restricted        0
height_restricted        0
road_type                0
start_node_name          0
start_latitude           0
start_longitude          0
start_turn_restricted    0
end_node_name            0
end_latitude             0
end_longitude            0
end_turn_restricted      0
target                   0
year                     0
month                    0
day                      0
is_holiday               0
season_autumn            0
season_spring            0
season_summer            0
season_winter            0
cos_time                 0
sin_time                 0
hour_mean_target         0
w

In [None]:
# junseo - data distribution

# Display order for the bars of the day_of_week histogram.
# NOTE(review): another cell in this notebook orders the same column with Korean
# weekday labels - confirm which label set train["day_of_week"] actually holds.
category_orders_dict = dict(day_of_week=["Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"])

# One (column, plot kind) entry per figure, listed in display order:
#   "box"  -> matplotlib box plot titled with the column name
#   "hist" -> plotly express histogram of the column
distribution_plots = [
    ("base_date", "box"),
    ("day_of_week", "hist"),
    ("base_hour", "hist"),
    ("lane_count", "hist"),
    ("road_rating", "hist"),
    ("road_name", "hist"),
    ("multi_linked", "hist"),
    ("connect_code", "hist"),
    ("maximum_speed_limit", "hist"),
    ("weight_restricted", "hist"),
    ("road_type", "hist"),
    ("start_node_name", "hist"),
    ("start_latitude", "box"),
    ("start_longitude", "box"),
    ("end_node_name", "hist"),
    ("end_latitude", "box"),
    ("end_longitude", "box"),
]

# NOTE(review): every histogram is built from the full `train` frame; if rendering
# is slow, consider plotting pre-computed value counts instead.
for column, kind in distribution_plots:
    if kind == "box":
        plt.boxplot(train[[column]])
        plt.title(column)
        plt.show()
    else:
        # Only columns listed in category_orders_dict get a custom bar order;
        # every other histogram keeps the px default (None).
        orders = category_orders_dict if column in category_orders_dict else None
        fig = px.histogram(train, x=column, category_orders=orders)
        fig.show()

In [None]:
# Box plot of the target column, titled with the column name.
target_column = "target"
plt.boxplot(train[[target_column]])
plt.title(target_column)
plt.show()

In [None]:
# junseo - data correlation

# Group means are computed once with pandas and seaborn receives one row per group.
# The plotted means are the same, but seaborn no longer has to estimate a
# confidence interval over every raw row, and the bar plot no longer needs the
# `ci` argument that newer seaborn releases deprecate.
# Consequence: the hourly line is drawn without a confidence band.
mean_by_day = train.groupby("day_of_week", as_index=False)["target"].mean()
mean_by_hour = train.groupby("base_hour", as_index=False)["target"].mean()

# Monday-first weekday orders for the label sets used in this notebook.
# The first set that shares at least one label with the data wins; if neither
# matches, `None` lets seaborn pick the order instead of drawing an empty plot.
weekday_label_sets = (
    ["월", "화", "수", "목", "금", "토", "일"],
    ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
)
observed_days = set(mean_by_day["day_of_week"])
day_order = next(
    (labels for labels in weekday_label_sets if not observed_days.isdisjoint(labels)),
    None,
)

# Average speed by day of week
plt.figure(figsize=(8, 5))
sns.barplot(data=mean_by_day, x="day_of_week", y="target", order=day_order)
plt.title("요일별 평균 속도")
plt.ylabel("평균 속도 (km/h)")
plt.xlabel("요일")
plt.tight_layout()
plt.show()

# Average speed by hour of day
plt.figure(figsize=(10, 5))
sns.lineplot(data=mean_by_hour, x="base_hour", y="target", marker="o")
plt.title("시간대별 평균 속도")
plt.ylabel("평균 속도 (km/h)")
plt.xlabel("시간대")
plt.xticks(range(0, 24))  # one tick per hour
plt.tight_layout()
plt.show()

# Speed distribution by road rating (needs the raw rows, so no pre-aggregation)
plt.figure(figsize=(8, 5))
sns.boxplot(data=train, x="road_rating", y="target")
plt.title("도로 등급별 평균 속도 분포")
plt.ylabel("평균 속도")
plt.xlabel("도로 등급")
plt.tight_layout()
plt.show()

# Scatter plots of the target against lane count and against the speed limit.
# Each entry: (x column, title, x-axis label, y-axis label).
# NOTE(review): these draw one marker per row of `train`; consider sampling if
# rendering is too slow.
scatter_plots = (
    ("lane_count", "차로 수와 평균 속도 관계", "차로 수", "평균 속도"),
    ("maximum_speed_limit", "제한 속도와 평균 속도 관계", "제한 속도 (km/h)", "평균 속도 (km/h)"),
)
for x_column, title, x_label, y_label in scatter_plots:
    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=train, x=x_column, y="target", alpha=0.6)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.tight_layout()
    plt.show()