# Colab 지도 시각화

1. 외부 드라이브 연결 : https://colab.research.google.com/notebooks/io.ipynb#scrollTo=XDg9OBaYqRMd
2. geopandas : https://colab.research.google.com/github/shakasom/GDS/blob/master/Part1%20-%20Introduction.ipynb#scrollTo=Ck2S7UY9M9w8

In [None]:
%%time

# Important library for many geopython libraries
!apt install gdal-bin python-gdal python3-gdal 
# Install rtree - Geopandas requirment
!apt install python3-rtree 
# Install Geopandas
!pip install git+git://github.com/geopandas/geopandas.git
# Install descartes - Geopandas requirment
!pip install descartes 
# Install Folium for Geographic data visualization
!pip install folium
# Install plotlyExpress
!pip install plotly_express

In [1]:
import warnings
warnings.filterwarnings(action='ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import folium
import geopandas as gpd

import tqdm
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
path = '/content/drive/My Drive/dacon_jeju_sales'

In [3]:
os.listdir(path)

['data.zip',
 '사이트 주소.txt',
 '개요.txt',
 'jeju datahub 데이터',
 '재난지원금 관련 보고서',
 '기타 보고서',
 'data',
 '.ipynb_checkpoints',
 '공공데이터포털 데이터',
 'nsdi_dataset',
 'kosis_dataset',
 '201222-기본 EDA.ipynb',
 'bc_dataset',
 'dg_dataset',
 '201224-기본 EDA(내용정리)',
 '201219-지도시각화.ipynb',
 '201225-좌표계변경.ipynb']

## 파이썬으로 좌표계 변경하기

In [4]:
# Load the four monthly CSV files (data5 .. data8) from the project's data
# folder. The folder is derived from `path` (set in an earlier cell) so the
# project location is only written down once.
data_dir = os.path.join(path, 'data')
df5 = pd.read_csv(os.path.join(data_dir, 'KRI-DAC_Jeju_data5.csv'))
df6 = pd.read_csv(os.path.join(data_dir, 'KRI-DAC_Jeju_data6.csv'))
df7 = pd.read_csv(os.path.join(data_dir, 'KRI-DAC_Jeju_data7.csv'))
df8 = pd.read_csv(os.path.join(data_dir, 'KRI-DAC_Jeju_data8.csv'))

In [5]:
# Stack the four monthly frames into one table. ignore_index=True gives every
# row a unique label instead of repeating each file's own 0..n-1 index.
df = pd.concat([df5, df6, df7, df8], ignore_index=True)
# Remove the (old) X/Y coordinate columns, which only the July data contains.
df = df.drop(columns=['X', 'Y'])

In [6]:
df.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y
0,1,1,202005,제주특별자치도,제주시,영세,일반한식,00시,363000,66500,10,2,877005.9834,1479766.0
1,2,2,202005,제주특별자치도,제주시,영세,단란주점,00시,1180000,0,3,0,877005.7447,1479816.0
2,3,3,202005,제주특별자치도,제주시,중소1,편의점,00시,157670,6850,20,2,877056.6756,1479616.0
3,4,4,202005,제주특별자치도,제주시,영세,편의점,00시,46600,0,2,0,877055.9593,1479766.0
4,5,5,202005,제주특별자치도,제주시,영세,주점,00시,66000,0,2,0,877055.4817,1479866.0


In [8]:
df.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y
0,1,1,202005,제주특별자치도,제주시,영세,일반한식,00시,363000,66500,10,2,877005.9834,1479766.0
1,2,2,202005,제주특별자치도,제주시,영세,단란주점,00시,1180000,0,3,0,877005.7447,1479816.0
2,3,3,202005,제주특별자치도,제주시,중소1,편의점,00시,157670,6850,20,2,877056.6756,1479616.0
3,4,4,202005,제주특별자치도,제주시,영세,편의점,00시,46600,0,2,0,877055.9593,1479766.0
4,5,5,202005,제주특별자치도,제주시,영세,주점,00시,66000,0,2,0,877055.4817,1479866.0


읍면동을 생성하기 위해서 POINT_X, POINT_Y 좌표데이터와 공간정보포털에서 제공하고 있는 읍면동 경계데이터를 사용합니다.

1. X,Y 좌표에서 점(POINT) 좌표를 생성합니다
2. 데이터의 좌표계와 경계 데이터의 좌표계를 일치시켜줍니다. 여기서는 WGS 84 좌표계로 통일합니다
3. 특정 점을 포함하고 있는 경계를 찾고 해당 읍면동 명칭을 입력합니다

In [9]:
from pyproj import Proj, transform

In [10]:
# Convert POINT_X / POINT_Y from EPSG:5179 to WGS 84 (EPSG:4326).
# (Author's note: pyproj converts faster than geopandas for this step.)
# Transformer replaces the deprecated Proj(init=...) + transform() API.
# always_xy=True keeps the (x = longitude, y = latitude) output order that the
# init= syntax produced, matching the x/y columns shown in the output below.
from pyproj import Transformer

to_wgs84 = Transformer.from_crs('EPSG:5179', 'EPSG:4326', always_xy=True)
df['x'], df['y'] = to_wgs84.transform(df['POINT_X'].to_numpy(), df['POINT_Y'].to_numpy())
# Build one point geometry per row from the converted coordinates.
#gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.x, df.y))
df['geometry'] = gpd.points_from_xy(df.x, df.y)

In [11]:
df.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y,x,y,geometry
0,1,1,202005,제주특별자치도,제주시,영세,일반한식,00시,363000,66500,10,2,877005.9834,1479766.0,126.178894,33.302315,POINT (126.17889 33.30232)
1,2,2,202005,제주특별자치도,제주시,영세,단란주점,00시,1180000,0,3,0,877005.7447,1479816.0,126.178884,33.302766,POINT (126.17888 33.30277)
2,3,3,202005,제주특별자치도,제주시,중소1,편의점,00시,157670,6850,20,2,877056.6756,1479616.0,126.179458,33.300971,POINT (126.17946 33.30097)
3,4,4,202005,제주특별자치도,제주시,영세,편의점,00시,46600,0,2,0,877055.9593,1479766.0,126.17943,33.302323,POINT (126.17943 33.30232)
4,5,5,202005,제주특별자치도,제주시,영세,주점,00시,66000,0,2,0,877055.4817,1479866.0,126.179412,33.303224,POINT (126.17941 33.30322)


경계 데이터도 변환합니다.

In [12]:
b_path = '/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/LSMD_ADM_SECT_UMD_제주/LSMD_ADM_SECT_UMD_50_202012.shp'
# Read the boundary shapefile; its attribute text is decoded as EUC-KR.
# `crs` is not a read_file parameter, so the CRS is declared with set_crs below.
jeju_bound = gpd.read_file(b_path, encoding='euc-kr')
# Declare the source coordinate system of the boundary data.
jeju_bound = jeju_bound.set_crs(epsg=5174)
# Reproject the whole frame (not just the geometry column) so the geometries
# and the frame's CRS metadata change to WGS 84 together.
jeju_bound = jeju_bound.to_crs(epsg=4326)

In [13]:
jeju_bound.head()

Unnamed: 0,EMD_CD,EMD_NM,SGG_OID,COL_ADM_SE,geometry
0,50110250,한림읍,1425,50110,"MULTIPOLYGON (((126.27237 33.43499, 126.27245 ..."
1,50110320,추자면,1745,50110,"MULTIPOLYGON (((126.30404 33.94992, 126.30411 ..."
2,50110134,오등동,97,50110,"POLYGON ((126.52685 33.47778, 126.52703 33.477..."
3,50110253,애월읍,98,50110,"MULTIPOLYGON (((126.29777 33.44650, 126.29788 ..."
4,50110115,삼양삼동,99,50110,"POLYGON ((126.57829 33.52136, 126.57828 33.521..."


경계값을 이용해서 읍면동에 할당합니다. 

In [38]:
def find_dong_with_boundray(jeju_boundary, df):
  """Add a ``dong`` column naming the boundary that contains each row's point.

  Every distinct point in ``df['geometry']`` is tested against each boundary
  shape. If a point lies within several shapes, the first boundary in
  ``jeju_boundary`` wins. Points that lie within no shape take the name of
  the nearest point that already has a name.

  Args:
    jeju_boundary: Table with a ``geometry`` column of boundary shapes and an
      ``EMD_NM`` column holding the name of each boundary.
    df: Table with a ``geometry`` column of points.

  Returns:
    A tuple ``(df, null_df, coor_df)``:
      * ``df`` - merged copy of the input with the ``dong`` column added.
      * ``null_df`` - ``geometry`` values of the rows that matched no boundary
        (before the nearest-point fill).
      * ``coor_df`` - the point -> ``dong`` lookup table, including the
        nearest-point fills.
  """
  # Test each distinct point once; many rows share the same coordinate.
  coors = df['geometry'].unique()

  print('Start Boundary Loop')
  dong_names, dong_points = [], []
  # Iterate the boundaries positionally so the loop does not depend on the
  # boundary table having a 0..n-1 index.
  boundaries = zip(jeju_boundary['geometry'], jeju_boundary['EMD_NM'])
  for bound, dong_name in tqdm.tqdm_notebook(boundaries, total=len(jeju_boundary)):
    inside = coors[coors.within(bound)]  # points contained in this boundary
    dong_points.extend(inside)
    dong_names.extend([dong_name] * len(inside))

  # Build the lookup table in one step (DataFrame.append no longer exists in
  # pandas 2.0) and keep only the first boundary found for each point.
  coor_df = gpd.GeoDataFrame({'dong': dong_names}, geometry=dong_points)
  coor_df = coor_df.drop_duplicates(subset='geometry', keep='first').reset_index(drop=True)
  df = df.merge(coor_df, how='left', on='geometry')

  # Some points are not contained in any boundary shape.
  null_df = df.loc[df['dong'].isnull(), 'geometry']
  print("Null data : {}".format(len(null_df)))
  print(len(coor_df))

  # Give each unmatched point the name of its nearest named point. The lookup
  # table grows inside the loop, so a point filled here can itself be the
  # nearest neighbour of a later unmatched point.
  for xy in tqdm.tqdm_notebook(null_df.unique()):
    nearest = coor_df['geometry'].distance(xy).argmin()  # positional index
    filled = gpd.GeoDataFrame({'dong': [coor_df['dong'].iloc[nearest]]}, geometry=[xy])
    coor_df = pd.concat([coor_df, filled], ignore_index=True)

  # Redo the merge so the filled points receive their names as well.
  df = df.drop(columns='dong').merge(coor_df, how='left', on='geometry')
  return df, null_df, coor_df

In [39]:
df2, null_d, cd = find_dong_with_boundray(jeju_bound, df)

Start Boundary Loop


HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))


Null data : 11076
15191


HBox(children=(FloatProgress(value=0.0, max=178.0), HTML(value='')))




In [45]:
df2.dong.isnull().sum()

0

In [44]:
#assert sum(df['dong'].isnull()) == 0
df2[['OBJECTID', 'YM', 'geometry', 'dong']].to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도_전체_읍면동추가2.csv', index=False)

In [None]:
# Create the empty frame that will hold each matched point and its name.
# The columns are given as a list: a set has no defined order, and recent
# pandas versions reject a set for `columns`.
coor_df = gpd.GeoDataFrame(columns=['dong', 'geometry'])

In [None]:
# Keep only the distinct coordinates so that duplicates are tested once.
# (A stray `z` in front of this comment used to raise NameError.)
coors = df5['geometry'].unique()

# Exclude coordinates that were already processed (disabled).
#coors = gpd.GeoDataFrame([xy for xy in coors if xy not in coor_df['geometry']])
#coors = gpd.array.GeometryArray(xy for xy in coors if xy not in coor_df['geometry'])

In [None]:
%%time

# `jeju_bound` is the boundary frame loaded earlier in this notebook; the
# name `jeju_boundary` is never assigned at notebook level.
for bound, dong_name in tqdm.tqdm_notebook(
    zip(jeju_bound['geometry'], jeju_bound['EMD_NM']), total=len(jeju_bound)):
  # Select the points that lie within this boundary.
  matched = coors[coors.within(bound)]

  # Label the matched points with the boundary name and add them to coor_df.
  # pd.concat is used because DataFrame.append no longer exists in pandas 2.0.
  piece = gpd.GeoDataFrame({'dong': [dong_name] * len(matched), 'geometry': matched})
  coor_df = pd.concat([coor_df, piece])
  # Keep a single row per point: the first boundary that contained it.
  coor_df = coor_df.drop_duplicates(subset='geometry', keep='first').reset_index(drop=True)

HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))


CPU times: user 14.1 s, sys: 69.7 ms, total: 14.2 s
Wall time: 14.3 s


In [None]:
coor_df.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 13658 entries, 0 to 13657
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   geometry  13658 non-null  geometry
 1   dong      13658 non-null  object  
dtypes: geometry(1), object(1)
memory usage: 213.5+ KB


In [None]:
# Keep the de-duplicated frame in `c`, which the following cell inspects with
# c.info(); previously the result was discarded and `c` was never defined.
c = coor_df.drop_duplicates(subset='geometry', keep='first')

In [None]:
c.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 13658 entries, 0 to 13657
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   geometry  13658 non-null  geometry
 1   dong      13658 non-null  object  
dtypes: geometry(1), object(1)
memory usage: 320.1+ KB


In [None]:
# Check that every coordinate was assigned to exactly one dong.
temp = pd.DataFrame(coor_df.copy())#.groupby('geometry')['dong'].count()
# Group by the string form of each geometry.
temp['geometry'] = temp['geometry'].astype(str)

# Count the coordinates whose number of distinct dong names is not 1.
print(sum(temp.groupby('geometry')['dong'].nunique() != 1))

# Reset the index (disabled).
#coor_df = coor_df.reset_index(drop=True)
# NOTE: the string below is disabled code; as the last expression of the cell
# it is also what the cell displays.
'''# 중복된 좌표값을 제거하고 인덱스를 정렬한다.
coor_df['temp'] = coor_df['geometry'].astype(str)
coor_df.drop_duplicates('temp').drop(columns='temp').reset_index(drop=True, inplace=True)
# 최종적으로 13658개의 좌표가 생성된다.
print(len(coor_df))'''

0


"# 중복된 좌표값을 제거하고 인덱스를 정렬한다.\ncoor_df['temp'] = coor_df['geometry'].astype(str)\ncoor_df.drop_duplicates('temp').drop(columns='temp').reset_index(drop=True, inplace=True)\n# 최종적으로 13658개의 좌표가 생성된다.\nprint(len(coor_df))"

In [None]:
# 기존에 존재하는 df에 merge에서 dong을 넣는다.
df5 = df5.merge(coor_df, how='left', on='geometry')

In [None]:
df5.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y,geometry,dong
0,1,1,202005,제주특별자치도,제주시,영세,일반한식,00시,363000,66500,10,2,877005.9834,1479766.0,POINT (126.17889 33.30232),한경면
1,2,2,202005,제주특별자치도,제주시,영세,단란주점,00시,1180000,0,3,0,877005.7447,1479816.0,POINT (126.17888 33.30277),한경면
2,3,3,202005,제주특별자치도,제주시,중소1,편의점,00시,157670,6850,20,2,877056.6756,1479616.0,POINT (126.17946 33.30097),한경면
3,4,4,202005,제주특별자치도,제주시,영세,편의점,00시,46600,0,2,0,877055.9593,1479766.0,POINT (126.17943 33.30232),한경면
4,5,5,202005,제주특별자치도,제주시,영세,주점,00시,66000,0,2,0,877055.4817,1479866.0,POINT (126.17941 33.30322),한경면


In [None]:
# 특정 좌표의 경우, 경계값에 포함되지 않는 경우가 있다
null_df = df5.loc[df5['dong'].isnull(), 'geometry']
print(len(null_df))

2592


In [None]:
# Rows whose point fell inside no boundary take the dong name of the nearest
# point that has one. Series.items() replaces iteritems(), which was removed
# in pandas 2.0; `total` lets the progress bar show real progress.
for num, xy in tqdm.tqdm_notebook(null_df.items(), total=len(null_df)):
  idx = coor_df['geometry'].distance(xy).argmin()
  df5.loc[num, 'dong'] = coor_df.loc[idx, 'dong']

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [None]:
assert sum(df5['dong'].isnull()) == 0
df5.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도_5월_읍면동추가.csv', index=False)
null_df.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/5월_null값.csv', index=True)

In [None]:

for xy in null_df.unique():
  idx = coor_df.distance(xy).argmin()
  break

In [None]:
def find_dong_with_boundray(jeju_boundary, df):
  """Add a ``dong`` column naming the boundary that contains each row's point.

  Every distinct point in ``df['geometry']`` is tested against each boundary
  shape. If a point lies within several shapes, the first boundary in
  ``jeju_boundary`` wins. Rows whose point lies within no shape take the name
  of the nearest point that was matched to a boundary.

  Args:
    jeju_boundary: Table with a ``geometry`` column of boundary shapes and an
      ``EMD_NM`` column holding the name of each boundary.
    df: Table with a ``geometry`` column of points.

  Returns:
    A tuple ``(df, null_df)``: the merged copy of the input with the ``dong``
    column filled in, and the ``geometry`` values of the rows that matched no
    boundary before the nearest-point fill.
  """
  # Test each distinct point once; many rows share the same coordinate.
  coors = df['geometry'].unique()

  print('Start Boundary Loop')
  dong_names, dong_points = [], []
  # Iterate the boundaries positionally so the loop does not depend on the
  # boundary table having a 0..n-1 index.
  boundaries = zip(jeju_boundary['geometry'], jeju_boundary['EMD_NM'])
  for bound, dong_name in tqdm.tqdm_notebook(boundaries, total=len(jeju_boundary)):
    inside = coors[coors.within(bound)]  # points contained in this boundary
    dong_points.extend(inside)
    dong_names.extend([dong_name] * len(inside))

  # Build the lookup table in one step (DataFrame.append no longer exists in
  # pandas 2.0) and keep only the first boundary found for each point.
  coor_df = gpd.GeoDataFrame({'dong': dong_names}, geometry=dong_points)
  coor_df = coor_df.drop_duplicates(subset='geometry', keep='first').reset_index(drop=True)
  df = df.merge(coor_df, how='left', on='geometry')

  # Some points are not contained in any boundary shape.
  null_df = df.loc[df['dong'].isnull(), 'geometry']
  print("Null data : {}".format(len(null_df)))

  # Fill the unmatched rows with the name of the nearest matched point.
  print('Start fill Null values')
  matched_points = coor_df['geometry']
  nearest_name = {}  # WKB of a point -> name found for it
  # Series.items() replaces iteritems(), which was removed in pandas 2.0.
  for num, xy in tqdm.tqdm_notebook(null_df.items(), total=len(null_df)):
    key = xy.wkb
    # Search once per distinct point; rows sharing a coordinate reuse it.
    if key not in nearest_name:
      nearest_name[key] = coor_df['dong'].iloc[matched_points.distance(xy).argmin()]
    df.loc[num, 'dong'] = nearest_name[key]
  return df, null_df

In [None]:
df.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y,x,y,geometry
0,1,1,202005,제주특별자치도,제주시,영세,일반한식,00시,363000,66500,10,2,877005.9834,1479766.0,126.178894,33.302315,POINT (126.17889 33.30232)
1,2,2,202005,제주특별자치도,제주시,영세,단란주점,00시,1180000,0,3,0,877005.7447,1479816.0,126.178884,33.302766,POINT (126.17888 33.30277)
2,3,3,202005,제주특별자치도,제주시,중소1,편의점,00시,157670,6850,20,2,877056.6756,1479616.0,126.179458,33.300971,POINT (126.17946 33.30097)
3,4,4,202005,제주특별자치도,제주시,영세,편의점,00시,46600,0,2,0,877055.9593,1479766.0,126.17943,33.302323,POINT (126.17943 33.30232)
4,5,5,202005,제주특별자치도,제주시,영세,주점,00시,66000,0,2,0,877055.4817,1479866.0,126.179412,33.303224,POINT (126.17941 33.30322)


In [None]:
# NOTE(review): this call passes no arguments, but find_dong_with_boundray is
# defined with two required parameters (jeju_boundary, df), so running this
# cell raises TypeError. Looks like a leftover scratch cell - confirm and remove.
find_dong_with_boundray()

In [None]:
df6 = gpd.read_file('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도 6월 WGS84.geojson')

## 6월 데이터

In [None]:
%%time
# Pass `jeju_bound`, the boundary frame loaded earlier in this notebook;
# the name `jeju_boundary` is never assigned at notebook level.
df6, null6 = find_dong_with_boundray(jeju_bound, df6)

Start Boundary Loop


HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))


2713
Start fill Null values


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


CPU times: user 6min 20s, sys: 1.49 s, total: 6min 22s
Wall time: 6min 26s


In [None]:
assert sum(df6['dong'].isnull()) == 0
df6.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y,geometry,dong
0,1,1,202006,제주특별자치도,제주시,영세,일반한식,00시,502000,0,10,0,877005.9834,1479766.0,POINT (126.17889 33.30232),한경면
1,2,2,202006,제주특별자치도,제주시,영세,단란주점,00시,1520000,0,8,0,877005.7447,1479816.0,POINT (126.17888 33.30277),한경면
2,3,3,202006,제주특별자치도,제주시,중소1,편의점,00시,482310,0,35,0,877056.6756,1479616.0,POINT (126.17946 33.30097),한경면
3,4,4,202006,제주특별자치도,제주시,영세,편의점,00시,38050,5450,3,1,877055.9593,1479766.0,POINT (126.17943 33.30232),한경면
4,5,5,202006,제주특별자치도,제주시,영세,일반한식,00시,32000,32000,1,1,877055.4817,1479866.0,POINT (126.17941 33.30322),한경면


In [None]:
df6.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도_6월_읍면동추가.csv', index=False)
null6.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/6월_null값.csv', index=True)

## 7월 데이터

In [None]:
df7 = gpd.read_file('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도 7월 WGS84.geojson')
df7.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y,geometry
0,1,1,202007,제주특별자치도,제주시,영세,일반한식,00시,85500,0,4,0,877005.9834,1479766.0,POINT (126.17889 33.30232)
1,2,2,202007,제주특별자치도,제주시,영세,단란주점,00시,1960000,0,4,0,877005.7447,1479816.0,POINT (126.17888 33.30277)
2,3,3,202007,제주특별자치도,제주시,중소1,편의점,00시,475650,0,43,0,877056.6756,1479616.0,POINT (126.17946 33.30097)
3,4,4,202007,제주특별자치도,제주시,영세,편의점,00시,15650,0,1,0,877055.9593,1479766.0,POINT (126.17943 33.30232)
4,5,5,202007,제주특별자치도,제주시,영세,주점,00시,82500,0,1,0,877055.4817,1479866.0,POINT (126.17941 33.30322)


In [None]:
%%time
# Pass `jeju_bound`, the boundary frame loaded earlier in this notebook;
# the name `jeju_boundary` is never assigned at notebook level.
df7, null7 = find_dong_with_boundray(jeju_bound, df7)

Start Boundary Loop


HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))


2848
Start fill Null values


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


CPU times: user 6min 48s, sys: 1.58 s, total: 6min 50s
Wall time: 6min 56s


In [None]:
assert sum(df7['dong'].isnull()) == 0
df7.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y,geometry,dong
0,1,1,202007,제주특별자치도,제주시,영세,일반한식,00시,85500,0,4,0,877005.9834,1479766.0,POINT (126.17889 33.30232),한경면
1,2,2,202007,제주특별자치도,제주시,영세,단란주점,00시,1960000,0,4,0,877005.7447,1479816.0,POINT (126.17888 33.30277),한경면
2,3,3,202007,제주특별자치도,제주시,중소1,편의점,00시,475650,0,43,0,877056.6756,1479616.0,POINT (126.17946 33.30097),한경면
3,4,4,202007,제주특별자치도,제주시,영세,편의점,00시,15650,0,1,0,877055.9593,1479766.0,POINT (126.17943 33.30232),한경면
4,5,5,202007,제주특별자치도,제주시,영세,주점,00시,82500,0,1,0,877055.4817,1479866.0,POINT (126.17941 33.30322),한경면


In [None]:
df7.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도_7월_읍면동추가.csv', index=False)
null7.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/7월_null값.csv', index=True)

## 8월

In [None]:
df8 = gpd.read_file('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도 8월 WGS84.geojson')

In [None]:
%%time
# Pass `jeju_bound`, the boundary frame loaded earlier in this notebook;
# the name `jeju_boundary` is never assigned at notebook level.
df8, null8 = find_dong_with_boundray(jeju_bound, df8)

Start Boundary Loop


HBox(children=(FloatProgress(value=0.0, max=74.0), HTML(value='')))


2923
Start fill Null values


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


CPU times: user 6min 54s, sys: 1.72 s, total: 6min 56s
Wall time: 7min 2s


In [None]:
assert sum(df8['dong'].isnull()) == 0
df8.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y,geometry,dong
0,1,1,202008,제주특별자치도,제주시,영세,일반한식,00시,249500,0,7,0,877005.9834,1479766.0,POINT (126.17889 33.30232),한경면
1,2,2,202008,제주특별자치도,제주시,영세,단란주점,00시,2010000,0,7,0,877005.7447,1479816.0,POINT (126.17888 33.30277),한경면
2,3,3,202008,제주특별자치도,제주시,중소1,편의점,00시,401210,0,41,0,877056.6756,1479616.0,POINT (126.17946 33.30097),한경면
3,4,4,202008,제주특별자치도,제주시,영세,편의점,00시,12250,0,1,0,877055.9593,1479766.0,POINT (126.17943 33.30232),한경면
4,5,5,202008,제주특별자치도,제주시,영세,주점,00시,33000,0,1,0,877055.4817,1479866.0,POINT (126.17941 33.30322),한경면


In [None]:
df8.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/제주도_8월_읍면동추가.csv', index=False)
null8.to_csv('/content/drive/My Drive/dacon_jeju_sales/nsdi_dataset/8월_null값.csv', index=True)

## EDA

In [None]:
df5 = pd.read_csv('./data/KRI-DAC_Jeju_data5.csv')
df6 = pd.read_csv('./data/KRI-DAC_Jeju_data6.csv')
df7 = pd.read_csv('./data/KRI-DAC_Jeju_data7.csv')
df8 = pd.read_csv('./data/KRI-DAC_Jeju_data8.csv')

In [None]:
df_list = [df5, df6, df7, df8]

In [None]:
len(df5), len(df6), len(df7), len(df8)

(273183, 281896, 284265, 280085)

In [None]:
df5.head()

Unnamed: 0,OBJECTID,Field1,YM,SIDO,SIGUNGU,FranClass,Type,Time,TotalSpent,DisSpent,NumofSpent,NumofDisSpent,POINT_X,POINT_Y
0,1,1,202005,제주특별자치도,제주시,영세,일반한식,00시,363000,66500,10,2,877005.9834,1479766.0
1,2,2,202005,제주특별자치도,제주시,영세,단란주점,00시,1180000,0,3,0,877005.7447,1479816.0
2,3,3,202005,제주특별자치도,제주시,중소1,편의점,00시,157670,6850,20,2,877056.6756,1479616.0
3,4,4,202005,제주특별자치도,제주시,영세,편의점,00시,46600,0,2,0,877055.9593,1479766.0
4,5,5,202005,제주특별자치도,제주시,영세,주점,00시,66000,0,2,0,877055.4817,1479866.0


## 1. 좌표 unique() 추출

`nunique` 값이 만 개가 넘는다. 개별 marker를 생성하기보다는 읍면동 단위로 묶도록 한다.

In [None]:
# Build one key per location by joining the X and Y values as "X_Y" text.
x_text = df5['POINT_X'].astype(str)
y_text = df5['POINT_Y'].astype(str)
df5['store_xy'] = x_text.str.cat(y_text, sep='_')

In [None]:
df5['store_xy'].nunique()

13820

## 1. 좌표계 변환

기존에 있는 x, y 좌표는 ITRF 2000 좌표계(EPSG:4385)를 사용한다. 일반적으로 많이 사용하는 WGS84(EPSG:4326)로 변경한다. (단, 아래 코드에서는 원본 좌표계로 EPSG:5178을 지정하고 있으므로 EPSG 코드는 확인이 필요하다.)

In [None]:
from pyproj import Proj, transform

In [None]:
# Source and target projections used by the transform() calls in the cells below.
# NOTE(review): the source here is EPSG:5178, while the conversion cell near
# the top of this notebook uses EPSG:5179 and the text above cites EPSG:4385 -
# confirm which code matches the data.
# NOTE(review): Proj(init=...) is deprecated syntax in pyproj 2 and later.
proj_ITRF = Proj(init='epsg:5178')
proj_WGS = Proj(init='epsg:4326')

In [None]:
# 좌표 변환 예시
coor = transform(proj_ITRF, proj_WGS, 877005.9834, 1479766.0730)
print(coor)

(126.17674597473471, 33.305089670045035)


주의할 점은 좌표를 변환하면 x, y 좌표의 순서가 서로 바뀐다는 것이다. 이 점을 수정해주자.

In [None]:
# Convert every POINT_X / POINT_Y pair in one call. transform() returns two
# sequences; in the single-point example above the first value is about 126
# and the second about 33.
point_x = df5['POINT_X'].tolist()
point_y = df5['POINT_Y'].tolist()
xy = transform(proj_ITRF, proj_WGS, point_x, point_y)

# Store the results swapped: 'x' takes the second sequence, 'y' the first.
df5['x'], df5['y'] = xy[1], xy[0]

## 2. 기본 map 생성

In [None]:
#center = [df5['x'].mean(), df5['y'].mean()]
center = [33.37367575592206, 126.54599150955575] #한라산 위치 좌표

In [None]:
m = folium.Map(location=center, zoom_start=11) #crs='EPS4326')

In [None]:
m

In [None]:
'''# 위치 마커 생성
in_marker = list()
for y,x in np.array(xy).T[:10]:
    xy_str = str(x) + '_'+ str(y) 
    if not xy_str in in_marker:
        folium.Marker(location=[x,y]).add_to(m)'''