In [None]:
# --- Imports: standard library first, then third-party ---
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

# --- pandas: show up to 500 rows/columns and silence chained-assignment warnings ---
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.options.mode.chained_assignment = None  # pandas default is 'warn'

# --- matplotlib: font family / size and minus-sign rendering ---
plt.rcParams['font.family'] = 'Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False
font = {'size': 16}
matplotlib.rc('font', **font)

# Register tqdm's progress_apply / progress_map on pandas objects.
tqdm.pandas()

In [None]:
def read_coor_data(basedir='./위치정보관련/위치정보요약DB/', encoding='cp949'):
    """Read every ``.txt`` file under ``basedir`` into one de-duplicated DataFrame.

    Each file is parsed as header-less, ``|``-delimited text and labelled with
    the 18 column names defined below. The per-file frames are concatenated,
    exact duplicate rows are dropped and the index is reset.

    Parameters
    ----------
    basedir : str, optional
        Directory that is scanned (non-recursively) for ``*.txt`` files.
    encoding : str, optional
        Text encoding handed to ``pd.read_csv``. ``'cp949'`` is used instead of
        ``'ansi'`` because ``'ansi'`` is a Windows-only codec alias and fails
        with ``LookupError`` on other platforms.
        NOTE(review): assumes the files are CP949, i.e. what ``'ansi'``
        resolves to on a Korean-locale Windows machine - confirm.

    Returns
    -------
    pandas.DataFrame
        All rows of all matching files, without exact duplicates.

    Raises
    ------
    FileNotFoundError
        If ``basedir`` does not exist (raised by ``os.listdir``).
    ValueError
        If no ``.txt`` file is found (``pd.concat`` rejects an empty list).
    """
    col_names = [
        '시군구코드', '출입구일련번호', '법정동코드', '시도명', '시군구명', '읍면동명', '도로명코드', '도로명', '지하여부', '건물본번',
        '건물부번', '건물명', '우편번호', '건물용도분류', '건물군여부', '관할행정동', 'X좌표', 'Y좌표'
    ]

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the concatenated frame reproducible across machines.
    filenames = sorted(f for f in os.listdir(basedir) if f.endswith('.txt'))
    print('filenames:', filenames)

    dfs_list = []
    for file in tqdm(filenames):
        df = pd.read_csv(os.path.join(basedir, file), sep='|', encoding=encoding, names=col_names)
        dfs_list.append(df)

    concat_df = pd.concat(dfs_list).drop_duplicates().reset_index(drop=True)

    return concat_df

In [None]:
# Load the location files via read_coor_data() and preview the result.
coor_df = read_coor_data()
print(coor_df.shape)
coor_df.head()

In [None]:
# Rows whose 지하여부 flag equals 1.
underground_df = coor_df.loc[coor_df['지하여부'].eq(1)]
print(underground_df.shape)
underground_df.head()

In [None]:
def read_addr_data(basedir='./위치정보관련/주소DB/', encoding=None):
    """Read every ``주소_*.txt`` file under ``basedir`` into one de-duplicated DataFrame.

    Each file is parsed as header-less, ``|``-delimited text and labelled with
    the 11 column names defined below. The per-file frames are concatenated,
    exact duplicate rows are dropped and the index is reset.

    Parameters
    ----------
    basedir : str, optional
        Directory that is scanned (non-recursively) for matching files.
    encoding : str or None, optional
        Text encoding handed to ``pd.read_csv``. ``None`` keeps pandas'
        default, which is what this reader has always used.
        NOTE(review): the other readers in this notebook pass an explicit
        legacy encoding - confirm the default is right for these files.

    Returns
    -------
    pandas.DataFrame
        All rows of all matching files, without exact duplicates.

    Raises
    ------
    FileNotFoundError
        If ``basedir`` does not exist (raised by ``os.listdir``).
    ValueError
        If no matching file is found (``pd.concat`` rejects an empty list).
    """
    col_names = [
        '관리번호', '도로명코드', '읍면동일련번호', '지하여부', '건물본번', '건물부번', '기초구역번호', '변경사유코드', '고시일자',
        '변경전도로명주소', '상세주소부여 여부'
    ]

    # Logical `and` (not bitwise `&`) for plain booleans; sorted so that the
    # row order of the concatenated frame does not depend on os.listdir order.
    filenames = sorted(
        f for f in os.listdir(basedir)
        if f.startswith('주소_') and f.endswith('.txt')
    )
    print('filenames:', filenames)

    dfs_list = []
    for file in tqdm(filenames):
        df = pd.read_csv(os.path.join(basedir, file), sep='|', encoding=encoding, names=col_names)
        dfs_list.append(df)

    concat_df = pd.concat(dfs_list).drop_duplicates().reset_index(drop=True)

    return concat_df

In [None]:
# Load the 주소_*.txt files via read_addr_data() and preview the result.
addr_df = read_addr_data()
print(addr_df.shape)
addr_df.head()

In [None]:
def read_landnum_data(basedir='./위치정보관련/주소DB/', encoding='cp949'):
    """Read every ``지번_*.txt`` file under ``basedir`` into one de-duplicated DataFrame.

    Each file is parsed as header-less, ``|``-delimited text and labelled with
    the 11 column names defined below. The per-file frames are concatenated,
    exact duplicate rows are dropped and the index is reset.

    Parameters
    ----------
    basedir : str, optional
        Directory that is scanned (non-recursively) for matching files.
    encoding : str, optional
        Text encoding handed to ``pd.read_csv``. ``'cp949'`` is used instead of
        ``'ansi'`` because ``'ansi'`` is a Windows-only codec alias and fails
        with ``LookupError`` on other platforms.
        NOTE(review): assumes the files are CP949, i.e. what ``'ansi'``
        resolves to on a Korean-locale Windows machine - confirm.

    Returns
    -------
    pandas.DataFrame
        All rows of all matching files, without exact duplicates.

    Raises
    ------
    FileNotFoundError
        If ``basedir`` does not exist (raised by ``os.listdir``).
    ValueError
        If no matching file is found (``pd.concat`` rejects an empty list).
    """
    col_names = [
        '관리번호', '일련번호', '법정동코드', '시도명', '시군구명', '법정읍면동명', '법정리명', '산여부', '지번본번(번지)',
        '지번부번(호)', '대표번호'
    ]

    # Logical `and` (not bitwise `&`) for plain booleans; sorted so that the
    # row order of the concatenated frame does not depend on os.listdir order.
    filenames = sorted(
        f for f in os.listdir(basedir)
        if f.startswith('지번_') and f.endswith('.txt')
    )
    print('filenames:', filenames)

    dfs_list = []
    for file in tqdm(filenames):
        df = pd.read_csv(os.path.join(basedir, file), sep='|', encoding=encoding, names=col_names)
        dfs_list.append(df)

    concat_df = pd.concat(dfs_list).drop_duplicates().reset_index(drop=True)

    return concat_df

In [None]:
# Load the 지번_*.txt files via read_landnum_data() and preview the result.
landnum_df = read_landnum_data()
print(landnum_df.shape)
landnum_df.head()

In [None]:
# Turn the 산여부 flag into a text label: 0 -> '일반', anything else -> '산'.
# Values that are already one of the two labels pass through unchanged, so
# re-running this cell no longer relabels every row as '산'.
landnum_df['산여부'] = landnum_df['산여부'].apply(
    lambda x: x if x in {'일반', '산'} else ('일반' if x == 0 else '산')
)

In [None]:
# Preview the first rows of landnum_df.
landnum_df.head()

In [None]:
# Number of missing values in the 법정리명 column.
landnum_df['법정리명'].isna().sum()

In [None]:
# Remove the 법정리명 and 대표번호 columns.
landnum_df = landnum_df.drop(['법정리명', '대표번호'], axis='columns')

In [None]:
# Preview the first rows of landnum_df.
landnum_df.head()

In [None]:
# Number of distinct 관리번호 values.
landnum_df['관리번호'].nunique()

In [None]:
# Column dtypes.
landnum_df.dtypes

In [None]:
# Missing-value count per column.
landnum_df.isna().sum()

In [None]:
# Frequency of each 산여부 value.
landnum_df['산여부'].value_counts()

In [None]:
# Assemble the lot-number address string:
# "<시도명> <시군구명> <법정읍면동명> <산여부> <지번본번>-<지번부번>"
landnum_df['지번주소'] = (
    landnum_df['시도명'] + ' '
    + landnum_df['시군구명'] + ' '
    + landnum_df['법정읍면동명'] + ' '
    + landnum_df['산여부'] + ' '
    + landnum_df['지번본번(번지)'].map(str) + '-'
    + landnum_df['지번부번(호)'].map(str)
)

In [None]:
# Preview landnum_df, including the 지번주소 column.
landnum_df.head()

In [None]:
# The address components are no longer needed once 지번주소 exists.
landnum_df = landnum_df.drop(
    ['시도명', '시군구명', '법정읍면동명', '산여부', '지번본번(번지)', '지번부번(호)'],
    axis='columns',
)
print(landnum_df.shape)
landnum_df.head()

In [None]:
# Number of distinct 지번주소 values in landnum_df.
landnum_df['지번주소'].nunique()

In [None]:
# Number of distinct 관리번호 values in landnum_df.
landnum_df['관리번호'].nunique()

In [None]:
# Preview the first rows of addr_df.
addr_df.head()

In [None]:
# Number of distinct 관리번호 values in addr_df.
addr_df['관리번호'].nunique()

In [None]:
# Remove the change-history / detail columns from addr_df.
addr_df = addr_df.drop(
    ['변경사유코드', '고시일자', '변경전도로명주소', '상세주소부여 여부', '기초구역번호'],
    axis='columns',
)
print(addr_df.shape)
addr_df.head()

In [None]:
# Rename 읍면동일련번호 to 일련번호.
addr_df = addr_df.rename({'읍면동일련번호': '일련번호'}, axis='columns')
print(addr_df.shape)
addr_df.head()

In [None]:
# Remove 일련번호 from both frames.
addr_df = addr_df.drop('일련번호', axis='columns')
landnum_df = landnum_df.drop('일련번호', axis='columns')

In [None]:
# Inner-join the lot-number rows with the road-address rows on 관리번호.
addr_landnum_df = pd.merge(landnum_df, addr_df, how='inner', on=['관리번호'])
print(addr_landnum_df.shape)
addr_landnum_df.head()

In [None]:
# Preview the first rows of coor_df.
coor_df.head()

In [None]:
# Remove the columns of coor_df that later cells do not use.
coor_df = coor_df.drop(
    [
        '시군구코드', '출입구일련번호', '시도명', '시군구명', '읍면동명',
        '도로명', '건물명', '우편번호', '관할행정동',
    ],
    axis='columns',
)

In [None]:
# Shape and first rows of coor_df.
print(coor_df.shape)
coor_df.head()

In [None]:
#a

In [None]:
# List the files available in './Prepped Data/'.
os.listdir('./Prepped Data/')

In [None]:
# Load the baseline main-data CSV and preview it.
# NOTE(review): this reads from './Prepped Data/' while the exports in this
# notebook write to './prepped_data/' - confirm these are meant to be two
# different directories.
main_data_df = pd.read_csv('./Prepped Data/main_data_baseline_20211202.csv')
print(main_data_df.shape)
main_data_df.head()

In [None]:
# Number of missing values in 도로명.
main_data_df['도로명'].isna().sum()

In [None]:
# Frequency of each 도로명 value.
main_data_df['도로명'].value_counts()

In [None]:
# Rows whose 도로명 is exactly a single space character.
inv_df = main_data_df.loc[main_data_df['도로명'].eq(' ')]
print(inv_df.shape)
inv_df.head()

In [None]:
# Shape and first rows of addr_landnum_df.
print(addr_landnum_df.shape)
addr_landnum_df.head()

In [None]:
# Remove 법정동코드 from both frames; it is not one of the join keys used next.
addr_landnum_df = addr_landnum_df.drop('법정동코드', axis='columns')
coor_df = coor_df.drop('법정동코드', axis='columns')

In [None]:
# Inner-join addresses with coordinates on road code, underground flag and
# building main/sub numbers.
merge_df = pd.merge(
    addr_landnum_df,
    coor_df,
    how='inner',
    on=['도로명코드', '지하여부', '건물본번', '건물부번'],
)
print(merge_df.shape)
merge_df.head()

In [None]:
# Number of distinct 지번주소 values in merge_df.
merge_df['지번주소'].nunique()

In [None]:
# Order rows by address, then underground flag, then building numbers
# (ascending).
merge_df = merge_df.sort_values(
    by=['지번주소', '지하여부', '건물본번', '건물부번'],
    ascending=True,
)
print(merge_df.shape)
merge_df.head()

In [None]:
# Keep only the first row of every 지번주소 and renumber the index.
is_repeat = merge_df.duplicated(subset=['지번주소'], keep='first')
merge_df = merge_df[~is_repeat].reset_index(drop=True)
del is_repeat
print(merge_df.shape)
merge_df.head()

In [None]:
%%time
# Write merge_df to CSV without the index column.
merge_df.to_csv('./prepped_data/지번_도로명_좌표_20210917.csv', index=False)

In [None]:
# Write land_specs_df to CSV without the index column.
# NOTE(review): `land_specs_df` is not assigned anywhere above this cell in the
# visible notebook (its first assignment is the read_csv cell further down), so
# on a fresh kernel this presumably raises NameError - confirm the intended
# cell order and which frame was meant to be exported.
land_specs_df.to_csv('./prepped_data/토지특성정보_last_20210819.csv', index=False)

In [None]:
# Frequency of each 토지이동상황 value (also uses `land_specs_df` before its
# first visible assignment).
land_specs_df['토지이동상황'].value_counts()

In [None]:
%%time
land_specs_df = pd.read_csv('./prepped_data/land_specs_ver_4.csv').sort_values(['지번주소', '년']).reset_index(drop=True).drop(columns=['공시지가'])
print(land_specs_df.shape)
land_specs_df.head()

In [None]:
# For every 지번주소 keep only its last row in the current row order.
last_df = land_specs_df[~land_specs_df.duplicated(subset=['지번주소'], keep='last')]
print(last_df.shape)
last_df.head()

In [None]:
# Frequency of each 토지이동상황 value among the last-per-address rows.
last_df['토지이동상황'].value_counts()

In [None]:
# Load the address/coordinate table from CSV and preview it.
addr_coor_df = pd.read_csv('./prepped_data/addr_coor_data_ver_1.csv')
print(addr_coor_df.shape)
addr_coor_df.head()

In [None]:
# Inner-join the last-per-address land specs with the coordinates on 지번주소.
# Note: this rebinds `merge_df` to a different table than the earlier cells.
merge_df = pd.merge(last_df, addr_coor_df, how='inner', on=['지번주소'])
print(merge_df.shape)
merge_df.head()

In [None]:
# Frequency of each 토지이동상황 value after the merge.
merge_df['토지이동상황'].value_counts()

In [None]:
dandok_df = merge_df[merge_df['토지이동상황'] == '단독']
print(dandok_df.shape)

In [None]:
dasede_df = merge_df[merge_df['토지이동상황'] == '다세대']

In [None]:
apart_df = merge_df[merge_df['토지이동상황'] == '아파트']

In [None]:
commercial_df = merge_df[merge_df['토지이동상황'] == '상업용']

In [None]:
jusang_df = merge_df[merge_df['토지이동상황'] == '주상용']

In [None]:
not_dandok_df = merge_df[merge_df['토지이동상황'] != '단독']

In [None]:
# Scatter of the '단독' and '다세대' parcels over their X/Y coordinates on one
# equal-aspect axes.
f, ax = plt.subplots(figsize=(100,100))
ax.axis('equal')
#plt.title('{} {}'.format('서울전체', '지번평균'))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. `ax=ax` draws on the axes created above explicitly.
sns.scatterplot(x=dandok_df['X좌표'], y=dandok_df['Y좌표'], s=5, ax=ax)
sns.scatterplot(x=dasede_df['X좌표'], y=dasede_df['Y좌표'], s=5, ax=ax)
#plt.savefig(basedir + '{} {}.png'.format('서울전체', '지번평균'))

plt.show()
# Close (rather than merely clear) the figure so its memory is released.
plt.close(f)

In [None]:
# 지번주소 is space-separated; token 1 becomes 구 and token 2 becomes 동.
merge_df['구'] = [address.split(' ')[1] for address in merge_df['지번주소']]
merge_df['동'] = [address.split(' ')[2] for address in merge_df['지번주소']]

In [None]:
# Distinct 구 values present in merge_df.
merge_df['구'].unique()

In [None]:
# Output directory used by the savefig calls in the plotting cells.
basedir = './국토교통부_실거래가_공개시스템/땅값분석/단독다가구_and_다세대/'

In [None]:
# One figure per 구: scatter of its '단독' and '다세대' parcels, saved as
# '<구>.png' under `basedir` and then displayed.
# Note: the loop rebinds the notebook-level `dandok_df` / `dasede_df` names.
for gu in merge_df['구'].unique():
    print(gu)

    gu_df = merge_df[merge_df['구'] == gu]

    dandok_df = gu_df[gu_df['토지이동상황'] == '단독']
    dasede_df = gu_df[gu_df['토지이동상황'] == '다세대']

    f, ax = plt.subplots(figsize=(30,30))
    ax.axis('equal')
    ax.set_title('{}'.format(gu))
    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally. `ax=ax` draws on the axes created above explicitly.
    sns.scatterplot(x=dandok_df['X좌표'], y=dandok_df['Y좌표'], s=5, ax=ax)
    sns.scatterplot(x=dasede_df['X좌표'], y=dasede_df['Y좌표'], s=5, ax=ax)
    f.savefig(basedir + '{}.png'.format(gu))

    plt.show()
    # Close the figure each iteration; clearing alone keeps every 30x30 inch
    # figure alive until the loop ends.
    plt.close(f)

In [None]:
dongjak_df = merge_df[merge_df['구'] == '동작구']
print(dongjak_df.shape)

In [None]:
dongjak_df.head()

In [None]:
# Scatter of the '단독' and '다세대' parcels within 동작구.
dandok_df = dongjak_df[dongjak_df['토지이동상황'] == '단독']
dasede_df = dongjak_df[dongjak_df['토지이동상황'] == '다세대']

f, ax = plt.subplots(figsize=(30,30))
ax.axis('equal')
ax.set_title('{}'.format('동작구'))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. `ax=ax` draws on the axes created above explicitly.
sns.scatterplot(x=dandok_df['X좌표'], y=dandok_df['Y좌표'], s=5, ax=ax)
sns.scatterplot(x=dasede_df['X좌표'], y=dasede_df['Y좌표'], s=5, ax=ax)
#plt.savefig(basedir + '{}.png'.format(gu))

plt.show()
# Close (rather than merely clear) the figure so its memory is released.
plt.close(f)

In [None]:
# '단독' rows inside the rectangle X in [950100, 950700],
# Y in [1.9455e6, 1.946e6]; both ends of each range are inclusive.
target_df = dandok_df[
    dandok_df['X좌표'].between(950100, 950700)
    & dandok_df['Y좌표'].between(1.9455*1000000, 1.946*1000000)
]
print(target_df.shape)

In [None]:
# Scatter of the rows selected into target_df.
f, ax = plt.subplots(figsize=(30,30))
ax.axis('equal')
ax.set_title('{}'.format('동작구'))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally. `ax=ax` draws on the axes created above explicitly.
sns.scatterplot(x=target_df['X좌표'], y=target_df['Y좌표'], s=5, ax=ax)
#plt.savefig(basedir + '{}.png'.format(gu))

plt.show()
# Close (rather than merely clear) the figure so its memory is released.
plt.close(f)

In [None]:
# Show up to the first 100 rows of target_df.
target_df.head(100)

In [None]:
# Rows whose 동 contains the text '천호동'.
chunho_df = merge_df.loc[merge_df['동'].str.contains('천호동')]
print(chunho_df.shape)
chunho_df.head()

In [None]:
# (rows, columns) of merge_df.
merge_df.shape

In [None]:
# Frequency of each 토지이동상황 value in merge_df.
merge_df['토지이동상황'].value_counts()

In [None]:
# '단독' rows within the 천호동 subset. The comparison is required: indexing
# with the raw string column would be treated as a list of column labels and
# raise KeyError.
dandok_df = chunho_df[chunho_df['토지이동상황'] == '단독']