# **부동산 데이터: 최적의 자취방 구하기**

## **크롤링**

In [1]:
from user_agent import generate_user_agent, generate_navigator
user_agent = generate_user_agent()
user_agent

'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:48.0) Gecko/20100101 Firefox/48.0'

In [2]:
import requests
import numpy as np
import pandas as pd

In [3]:
from tqdm.notebook import tqdm
import time

In [4]:
article_list = []
for i in tqdm(range(1, 101)):
    try:
        url = f'https://m.land.naver.com/cluster/ajax/articleList?itemId=&mapKey=&lgeo=&showR0=&rletTpCd=OPST%3AVL%3AOR&tradTpCd=B2&z=12&lat=37.481021&lon=126.951601&btm=37.3398975&lft=126.6762562&top=37.6218785&rgt=127.2269458&totCnt=8360&cortarNo=1162000000&sort=rank&page={i}'

        user_agent = generate_user_agent()
        headers = {'User-Agent':user_agent}

        res = requests.get(url, headers=headers)
        time.sleep(1)

        article_json = res.json()
        article_body = article_json['body']
        article_list.append(article_body)
    except:
        break

  0%|          | 0/100 [00:00<?, ?it/s]

In [5]:
article_list1 = [j for i in article_list for j in i]

In [6]:
data = pd.DataFrame(article_list1)

In [7]:
data = data[['atclNo','rletTpNm','flrInfo','rentPrc','hanPrc','spc1','spc2','direction','atclCfmYmd','repImgUrl','lat','lng','atclFetrDesc','tagList']]
data.columns = ['물건번호','구분','층수(물건층/전체층)','월세','보증금','계약면적(m2)','전용면적(m2)','방향','확인일자','이미지','위도','경도','설명','태그']
data

Unnamed: 0,물건번호,구분,층수(물건층/전체층),월세,보증금,계약면적(m2),전용면적(m2),방향,확인일자,이미지,위도,경도,설명,태그
0,2419918257,원룸,저/4,40,300,19,16.53,남서향,24.04.24.,/20240424_83/land_naver_1713952322825vWEhO_JPE...,37.484710,126.911921,2호선 신대방역 확인매물여성취향저격풀옵션치안걱정NO,"[4년이내, 융자금적은, 역세권]"
1,2419934436,원룸,1/5,40,2000,21,18,남동향,24.04.24.,/20240424_52/land_naver_1713951003188lUMzH_JPE...,37.480955,126.900701,구디역 이쁜 화이트 원룸현황1.5층깔끔 풀옵션 원룸월세,"[4년이내, 융자금적은, 1층]"
2,2419858607,원룸,10/12,105,5000,26,23.1,남동향,24.04.24.,/20240424_243/land_naver_1713952441796YW8Nf_JP...,37.478109,126.962245,낙성대역 5번출구 도보1분 초역세권 1.5룸 매우 깨끗하고 예쁜 방,"[2년이내, 융자금적은, 역세권]"
3,2419914550,원룸,저/3,32,500,19,19.8,남서향,24.04.24.,/20240424_158/land_naver_1713952082115BAm9q_JP...,37.474110,126.967600,2호선 역세권 가성비 좋은 풀옵션,"[25년이내, 역세권, 화장실한개]"
4,2419768826,원룸,B1/5,30,1000,18,15,서향,24.04.24.,/20240424_299/land_naver_1713953221116pxTcd_JP...,37.478232,126.961926,낙성대 초역세권 즉시입주가능,"[25년이내, 융자금적은, 역세권]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1775,2419458867,원룸,3/4,35,300,23,19.8,남동향,24.04.22.,/20240422_251/land_naver_1713745673469CcBwe_JP...,37.477821,126.935265,신림역 채광 NO.1역도보7분양창원룸채광최강,"[25년이내, 융자금적은, 역세권]"
1776,2419452608,원룸,고/4,40,300,26,23.14,남동향,24.04.22.,/20240422_141/land_naver_17137455375894ggHz_JP...,37.486339,126.915350,넓고 깔끔한 집입니다 초역세권,"[25년이내, 융자금적은, 역세권]"
1777,2419452139,원룸,3/4,35,500,26,23.14,남동향,24.04.22.,/20240422_47/land_naver_1713745220011NfDDB_JPE...,37.484921,126.919564,"신축, 이쁜방 전세자금대출가능","[2년이내, 융자금적은, 소형평수]"
1778,2419452826,원룸,1/3,54,1000,-,26.44,북동향,24.04.22.,/20240422_201/land_naver_1713745434912bAgDx_JP...,37.486167,126.931098,신림 역세권 넓은 지상층 분리형 원룸. 공간 넓게 쓰실 분,"[25년이상, 역세권, 1층]"


In [8]:
data.to_excel('Data/data.xlsx')

## **질문 만들기**

- 최적의 자취방 구하기
    - 내가 원하는 조건은?
        1. 보증금 3000만원 이하
        2. 월세는 저렴할수록 좋음
        3. 지하, 반지하, 꼭대기층은 선호하지 않음
        4. 전용면적이 클수록 좋음
        5. 북향은 선호하지 않음
        6. 연식이 오래되지 않을수록 좋음
        7. 지하철 역에서 가까울수록 좋음

## **데이터 전처리**

In [9]:
import plotly.express as px
import folium

import warnings
warnings.filterwarnings(action='ignore')

In [10]:
data = pd.read_excel('/content/drive/MyDrive/강의자료_황수현_Python/Part3) 파이썬 데이터 분석 프로젝트/data/실습6: 직접 크롤링하기 - 부동산 데이터 분석을 통해 최적의 자취방 구하기/data.xlsx')

OSError: [Errno 22] Invalid argument: '/content/drive/MyDrive/강의자료_황수현_Python/Part3) 파이썬 데이터 분석 프로젝트/data/실습6: 직접 크롤링하기 - 부동산 데이터 분석을 통해 최적의 자취방 구하기/data.xlsx'

In [11]:
data.head()

Unnamed: 0,물건번호,구분,층수(물건층/전체층),월세,보증금,계약면적(m2),전용면적(m2),방향,확인일자,이미지,위도,경도,설명,태그
0,2419918257,원룸,저/4,40,300,19,16.53,남서향,24.04.24.,/20240424_83/land_naver_1713952322825vWEhO_JPE...,37.48471,126.911921,2호선 신대방역 확인매물여성취향저격풀옵션치안걱정NO,"[4년이내, 융자금적은, 역세권]"
1,2419934436,원룸,1/5,40,2000,21,18.0,남동향,24.04.24.,/20240424_52/land_naver_1713951003188lUMzH_JPE...,37.480955,126.900701,구디역 이쁜 화이트 원룸현황1.5층깔끔 풀옵션 원룸월세,"[4년이내, 융자금적은, 1층]"
2,2419858607,원룸,10/12,105,5000,26,23.1,남동향,24.04.24.,/20240424_243/land_naver_1713952441796YW8Nf_JP...,37.478109,126.962245,낙성대역 5번출구 도보1분 초역세권 1.5룸 매우 깨끗하고 예쁜 방,"[2년이내, 융자금적은, 역세권]"
3,2419914550,원룸,저/3,32,500,19,19.8,남서향,24.04.24.,/20240424_158/land_naver_1713952082115BAm9q_JP...,37.47411,126.9676,2호선 역세권 가성비 좋은 풀옵션,"[25년이내, 역세권, 화장실한개]"
4,2419768826,원룸,B1/5,30,1000,18,15.0,서향,24.04.24.,/20240424_299/land_naver_1713953221116pxTcd_JP...,37.478232,126.961926,낙성대 초역세권 즉시입주가능,"[25년이내, 융자금적은, 역세권]"


In [12]:
data = data.iloc[:,1:]

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1780 entries, 0 to 1779
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   구분           1780 non-null   object 
 1   층수(물건층/전체층)  1780 non-null   object 
 2   월세           1780 non-null   int64  
 3   보증금          1778 non-null   object 
 4   계약면적(m2)     1780 non-null   object 
 5   전용면적(m2)     1780 non-null   object 
 6   방향           1780 non-null   object 
 7   확인일자         1780 non-null   object 
 8   이미지          1590 non-null   object 
 9   위도           1780 non-null   float64
 10  경도           1780 non-null   float64
 11  설명           1663 non-null   object 
 12  태그           1780 non-null   object 
dtypes: float64(2), int64(1), object(10)
memory usage: 180.9+ KB


- 월세 0원인 경우 제외

In [14]:
data = data.query('월세 > 0')

- 보증금 숫자로 변환

In [15]:
data['보증금'].unique()

array(['300', '2,000', '5,000', '500', '1,000', '100', '3,000', '200',
       '20', '7,000', '50', '30', nan, '4,000', '3,500', '8,200',
       '2억 7,000', '3억 2,000', '1억', '5,500', '2억', '1억 7,000', '2억 500',
       '1억 2,000', '2억 4,900', '35', '8,000', '1억 3,000', '1억 5,000',
       '9,500', '1억 6,700', '4억 6,000', '1억 4,000', '3억', '2억 3,000',
       '2억 6,000', '7,500', '1억 9,000', '1억 395', '6,000', '2억 2,000',
       '1억 7,500', '6,500', '3억 3,000', '1억 1,000', '1,500', '4억 5,000',
       '1억 4,300', '2,500', '1억 9,600', '9,000', '1억 2,500', '3,700',
       '1억 4,800', '2억 5,000', '2억 700', '3억 5,000', '1억 6,500', '400',
       '1억 1,800', '1억 700', '1억 900', '1억 8,000', '1억 6,000', '9,540',
       '9,990', '6,900', '7,300'], dtype=object)

In [19]:
data = data.query('~보증금.str.contains("억", na=False)')

In [20]:
data['보증금'].unique()

array(['300', '2,000', '5,000', '500', '1,000', '100', '3,000', '200',
       '20', '7,000', '50', '30', nan, '4,000', '3,500', '8,200', '5,500',
       '35', '8,000', '9,500', '7,500', '6,000', '6,500', '1,500',
       '2,500', '9,000', '3,700', '400', '9,540', '9,990', '6,900',
       '7,300'], dtype=object)

In [26]:
data['보증금'] = data['보증금'].dropna()

In [28]:
# Convert '보증금' column to string type and remove commas
data['보증금'] = data['보증금'].astype(str).str.replace(',', '')

# Convert string values to integers, ignoring errors (NaN values)
data['보증금'] = pd.to_numeric(data['보증금'], errors='coerce')

# Print unique values in the '보증금' column
print(data['보증금'].unique())


[ 300. 2000. 5000.  500. 1000.  100. 3000.  200.   20. 7000.   50.   30.
   nan 4000. 3500. 8200. 5500.   35. 8000. 9500. 7500. 6000. 6500. 1500.
 2500. 9000. 3700.  400. 9540. 9990. 6900. 7300.]


In [27]:
data['보증금'] = data['보증금'].str.replace(',','').astype(int)
data['보증금'].unique()

ValueError: cannot convert float NaN to integer

- 물건층과 전체층 분리 후 1층과 꼭대기층 유무 컬럼 만들기
    - 지하의 경우 마이너스로 표시

In [29]:
data[['물건층','전체층']] = data['층수(물건층/전체층)'].str.split('/', expand=True)

In [30]:
data.head()

Unnamed: 0,구분,층수(물건층/전체층),월세,보증금,계약면적(m2),전용면적(m2),방향,확인일자,이미지,위도,경도,설명,태그,물건층,전체층
0,원룸,저/4,40,300.0,19,16.53,남서향,24.04.24.,/20240424_83/land_naver_1713952322825vWEhO_JPE...,37.48471,126.911921,2호선 신대방역 확인매물여성취향저격풀옵션치안걱정NO,"[4년이내, 융자금적은, 역세권]",저,4
1,원룸,1/5,40,2000.0,21,18.0,남동향,24.04.24.,/20240424_52/land_naver_1713951003188lUMzH_JPE...,37.480955,126.900701,구디역 이쁜 화이트 원룸현황1.5층깔끔 풀옵션 원룸월세,"[4년이내, 융자금적은, 1층]",1,5
2,원룸,10/12,105,5000.0,26,23.1,남동향,24.04.24.,/20240424_243/land_naver_1713952441796YW8Nf_JP...,37.478109,126.962245,낙성대역 5번출구 도보1분 초역세권 1.5룸 매우 깨끗하고 예쁜 방,"[2년이내, 융자금적은, 역세권]",10,12
3,원룸,저/3,32,500.0,19,19.8,남서향,24.04.24.,/20240424_158/land_naver_1713952082115BAm9q_JP...,37.47411,126.9676,2호선 역세권 가성비 좋은 풀옵션,"[25년이내, 역세권, 화장실한개]",저,3
4,원룸,B1/5,30,1000.0,18,15.0,서향,24.04.24.,/20240424_299/land_naver_1713953221116pxTcd_JP...,37.478232,126.961926,낙성대 초역세권 즉시입주가능,"[25년이내, 융자금적은, 역세권]",B1,5


In [31]:
data['물건층'].unique()

array(['저', '1', '10', 'B1', '4', '2', '3', '6', '8', '고', '중', '5', '7',
       '9', '12', '20', 'B2', '11', '13', '15'], dtype=object)

In [32]:
def floor_info(target, total):
    try:
        if target in ['B1','B2']: #지하이면
            return 'y'
        elif int(target) == 1 or int(target)/int(total) == 1: #1층이거나 꼭대기층이면
            return 'y'
        else:
            return 'n'
    except ValueError:
        return 'n'

In [33]:
data['비선호층여부'] = data.apply(lambda x: floor_info(x['물건층'], x['전체층']), axis=1)

In [34]:
data.head()

Unnamed: 0,구분,층수(물건층/전체층),월세,보증금,계약면적(m2),전용면적(m2),방향,확인일자,이미지,위도,경도,설명,태그,물건층,전체층,비선호층여부
0,원룸,저/4,40,300.0,19,16.53,남서향,24.04.24.,/20240424_83/land_naver_1713952322825vWEhO_JPE...,37.48471,126.911921,2호선 신대방역 확인매물여성취향저격풀옵션치안걱정NO,"[4년이내, 융자금적은, 역세권]",저,4,n
1,원룸,1/5,40,2000.0,21,18.0,남동향,24.04.24.,/20240424_52/land_naver_1713951003188lUMzH_JPE...,37.480955,126.900701,구디역 이쁜 화이트 원룸현황1.5층깔끔 풀옵션 원룸월세,"[4년이내, 융자금적은, 1층]",1,5,y
2,원룸,10/12,105,5000.0,26,23.1,남동향,24.04.24.,/20240424_243/land_naver_1713952441796YW8Nf_JP...,37.478109,126.962245,낙성대역 5번출구 도보1분 초역세권 1.5룸 매우 깨끗하고 예쁜 방,"[2년이내, 융자금적은, 역세권]",10,12,n
3,원룸,저/3,32,500.0,19,19.8,남서향,24.04.24.,/20240424_158/land_naver_1713952082115BAm9q_JP...,37.47411,126.9676,2호선 역세권 가성비 좋은 풀옵션,"[25년이내, 역세권, 화장실한개]",저,3,n
4,원룸,B1/5,30,1000.0,18,15.0,서향,24.04.24.,/20240424_299/land_naver_1713953221116pxTcd_JP...,37.478232,126.961926,낙성대 초역세권 즉시입주가능,"[25년이내, 융자금적은, 역세권]",B1,5,y


- 제외 조건에 해당하는 매물 제외
    - 내가 원하는 조건은?
        1. 보증금 3000만원 이하
        2. 월세는 저렴할수록 좋음
        3. 지하, 반지하, 꼭대기층은 선호하지 않음
        4. 전용면적이 클수록 좋음
        5. 북향은 선호하지 않음
        6. 연식이 오래되지 않을수록 좋음
        7. 지하철 역에서 가까울수록 좋음

In [35]:
#보증금 3000만원 이하
#지하, 반지하, 꼭대기층은 선호하지 않음
#북향은 선호하지 않음
data_filtered = data.query('300 <= 보증금 <= 3000 and 비선호층여부 == "n" and 전체층 != "1" and ~방향.str.contains("북")')

In [36]:
data_filtered

Unnamed: 0,구분,층수(물건층/전체층),월세,보증금,계약면적(m2),전용면적(m2),방향,확인일자,이미지,위도,경도,설명,태그,물건층,전체층,비선호층여부
0,원룸,저/4,40,300.0,19,16.53,남서향,24.04.24.,/20240424_83/land_naver_1713952322825vWEhO_JPE...,37.484710,126.911921,2호선 신대방역 확인매물여성취향저격풀옵션치안걱정NO,"[4년이내, 융자금적은, 역세권]",저,4,n
3,원룸,저/3,32,500.0,19,19.8,남서향,24.04.24.,/20240424_158/land_naver_1713952082115BAm9q_JP...,37.474110,126.967600,2호선 역세권 가성비 좋은 풀옵션,"[25년이내, 역세권, 화장실한개]",저,3,n
11,원룸,저/4,30,500.0,19,19.8,남동향,24.04.24.,/20240425_96/land_naver_1714015501394lNLip_JPE...,37.469927,126.965559,2호선 역세권 가성비 좋은 신축 대형 풀옵션,"[10년이내, 화장실한개, 소형평수]",저,4,n
12,원룸,3/4,45,3000.0,23,23.14,서향,24.04.24.,/20240424_240/land_naver_1713951961790HmQA3_JP...,37.486506,126.952803,2호선 역세권 넓은 1.5룸 가성비 최고 넓은집 찾는다면 여기요,"[2년이내, 화장실한개, 소형평수]",3,4,n
13,원룸,6/8,50,1000.0,23,23.1,동향,24.04.24.,/20240424_181/land_naver_1713952142631HCCmx_JP...,37.480189,126.945238,역6분 넓은방 깔끔컨디션 채광좋음 풀옵션 즉시입주가능 엘베有,"[25년이내, 융자금없는, 역세권]",6,8,n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773,원룸,저/3,37,300.0,36,33.06,남동향,24.04.22.,/20240422_253/land_naver_1713745347339MhVf4_JP...,37.479770,126.928582,엄청 넓은 신축 원룸 입니다 퀸사이즈 침대가능,"[15년이내, 융자금적은, 역세권]",저,3,n
1775,원룸,3/4,35,300.0,23,19.8,남동향,24.04.22.,/20240422_251/land_naver_1713745673469CcBwe_JP...,37.477821,126.935265,신림역 채광 NO.1역도보7분양창원룸채광최강,"[25년이내, 융자금적은, 역세권]",3,4,n
1776,원룸,고/4,40,300.0,26,23.14,남동향,24.04.22.,/20240422_141/land_naver_17137455375894ggHz_JP...,37.486339,126.915350,넓고 깔끔한 집입니다 초역세권,"[25년이내, 융자금적은, 역세권]",고,4,n
1777,원룸,3/4,35,500.0,26,23.14,남동향,24.04.22.,/20240422_47/land_naver_1713745220011NfDDB_JPE...,37.484921,126.919564,"신축, 이쁜방 전세자금대출가능","[2년이내, 융자금적은, 소형평수]",3,4,n


- 태그 컬럼 분리

In [38]:
data_filtered['태그'] = data_filtered['태그'].astype(str)

data_filtered[['tag1','tag2','tag3','tag4']] = data_filtered['태그'].str.replace("\'|\[|\]","").str.split(', ', expand=True)

In [39]:
data_filtered.head()

Unnamed: 0,구분,층수(물건층/전체층),월세,보증금,계약면적(m2),전용면적(m2),방향,확인일자,이미지,위도,경도,설명,태그,물건층,전체층,비선호층여부,tag1,tag2,tag3,tag4
0,원룸,저/4,40,300.0,19,16.53,남서향,24.04.24.,/20240424_83/land_naver_1713952322825vWEhO_JPE...,37.48471,126.911921,2호선 신대방역 확인매물여성취향저격풀옵션치안걱정NO,"['4년이내', '융자금적은', '역세권']",저,4,n,4년이내,융자금적은,역세권,
3,원룸,저/3,32,500.0,19,19.8,남서향,24.04.24.,/20240424_158/land_naver_1713952082115BAm9q_JP...,37.47411,126.9676,2호선 역세권 가성비 좋은 풀옵션,"['25년이내', '역세권', '화장실한개']",저,3,n,25년이내,역세권,화장실한개,
11,원룸,저/4,30,500.0,19,19.8,남동향,24.04.24.,/20240425_96/land_naver_1714015501394lNLip_JPE...,37.469927,126.965559,2호선 역세권 가성비 좋은 신축 대형 풀옵션,"['10년이내', '화장실한개', '소형평수']",저,4,n,10년이내,화장실한개,소형평수,
12,원룸,3/4,45,3000.0,23,23.14,서향,24.04.24.,/20240424_240/land_naver_1713951961790HmQA3_JP...,37.486506,126.952803,2호선 역세권 넓은 1.5룸 가성비 최고 넓은집 찾는다면 여기요,"['2년이내', '화장실한개', '소형평수']",3,4,n,2년이내,화장실한개,소형평수,
13,원룸,6/8,50,1000.0,23,23.1,동향,24.04.24.,/20240424_181/land_naver_1713952142631HCCmx_JP...,37.480189,126.945238,역6분 넓은방 깔끔컨디션 채광좋음 풀옵션 즉시입주가능 엘베有,"['25년이내', '융자금없는', '역세권']",6,8,n,25년이내,융자금없는,역세권,


- 연식 컬럼 추가

In [40]:
data_filtered = data_filtered.query('tag1.str.contains("년")') #연식 정보가 있는 데이터만 필터링

In [41]:
data_filtered['연식'] = [int(i[0]) for i in data_filtered['tag1'].str.split('년')]
data_filtered

Unnamed: 0,구분,층수(물건층/전체층),월세,보증금,계약면적(m2),전용면적(m2),방향,확인일자,이미지,위도,...,설명,태그,물건층,전체층,비선호층여부,tag1,tag2,tag3,tag4,연식
0,원룸,저/4,40,300.0,19,16.53,남서향,24.04.24.,/20240424_83/land_naver_1713952322825vWEhO_JPE...,37.484710,...,2호선 신대방역 확인매물여성취향저격풀옵션치안걱정NO,"['4년이내', '융자금적은', '역세권']",저,4,n,4년이내,융자금적은,역세권,,4
3,원룸,저/3,32,500.0,19,19.8,남서향,24.04.24.,/20240424_158/land_naver_1713952082115BAm9q_JP...,37.474110,...,2호선 역세권 가성비 좋은 풀옵션,"['25년이내', '역세권', '화장실한개']",저,3,n,25년이내,역세권,화장실한개,,25
11,원룸,저/4,30,500.0,19,19.8,남동향,24.04.24.,/20240425_96/land_naver_1714015501394lNLip_JPE...,37.469927,...,2호선 역세권 가성비 좋은 신축 대형 풀옵션,"['10년이내', '화장실한개', '소형평수']",저,4,n,10년이내,화장실한개,소형평수,,10
12,원룸,3/4,45,3000.0,23,23.14,서향,24.04.24.,/20240424_240/land_naver_1713951961790HmQA3_JP...,37.486506,...,2호선 역세권 넓은 1.5룸 가성비 최고 넓은집 찾는다면 여기요,"['2년이내', '화장실한개', '소형평수']",3,4,n,2년이내,화장실한개,소형평수,,2
13,원룸,6/8,50,1000.0,23,23.1,동향,24.04.24.,/20240424_181/land_naver_1713952142631HCCmx_JP...,37.480189,...,역6분 넓은방 깔끔컨디션 채광좋음 풀옵션 즉시입주가능 엘베有,"['25년이내', '융자금없는', '역세권']",6,8,n,25년이내,융자금없는,역세권,,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773,원룸,저/3,37,300.0,36,33.06,남동향,24.04.22.,/20240422_253/land_naver_1713745347339MhVf4_JP...,37.479770,...,엄청 넓은 신축 원룸 입니다 퀸사이즈 침대가능,"['15년이내', '융자금적은', '역세권']",저,3,n,15년이내,융자금적은,역세권,,15
1775,원룸,3/4,35,300.0,23,19.8,남동향,24.04.22.,/20240422_251/land_naver_1713745673469CcBwe_JP...,37.477821,...,신림역 채광 NO.1역도보7분양창원룸채광최강,"['25년이내', '융자금적은', '역세권']",3,4,n,25년이내,융자금적은,역세권,,25
1776,원룸,고/4,40,300.0,26,23.14,남동향,24.04.22.,/20240422_141/land_naver_17137455375894ggHz_JP...,37.486339,...,넓고 깔끔한 집입니다 초역세권,"['25년이내', '융자금적은', '역세권']",고,4,n,25년이내,융자금적은,역세권,,25
1777,원룸,3/4,35,500.0,26,23.14,남동향,24.04.22.,/20240422_47/land_naver_1713745220011NfDDB_JPE...,37.484921,...,"신축, 이쁜방 전세자금대출가능","['2년이내', '융자금적은', '소형평수']",3,4,n,2년이내,융자금적은,소형평수,,2


- 필요한 컬럼만 남기기

In [42]:
data_filtered.columns

Index(['구분', '층수(물건층/전체층)', '월세', '보증금', '계약면적(m2)', '전용면적(m2)', '방향', '확인일자',
       '이미지', '위도', '경도', '설명', '태그', '물건층', '전체층', '비선호층여부', 'tag1', 'tag2',
       'tag3', 'tag4', '연식'],
      dtype='object')

In [44]:
data_filtered = data_filtered[['구분','월세','보증금','전용면적(m2)','방향','위도','경도','물건층','전체층','연식']]
data_filtered.head()

Unnamed: 0,구분,월세,보증금,전용면적(m2),방향,위도,경도,물건층,전체층,연식
0,원룸,40,300.0,16.53,남서향,37.48471,126.911921,저,4,4
3,원룸,32,500.0,19.8,남서향,37.47411,126.9676,저,3,25
11,원룸,30,500.0,19.8,남동향,37.469927,126.965559,저,4,10
12,원룸,45,3000.0,23.14,서향,37.486506,126.952803,3,4,2
13,원룸,50,1000.0,23.1,동향,37.480189,126.945238,6,8,25


- 위도, 경도를 이용하여 역까지의 거리 재기

In [45]:
coordinate = pd.read_csv(r'Data\서울시 역사마스터 정보.csv')
coordinate = coordinate.query('호선 == "2호선"')
station_list = ['신대방', '신림', '봉천', '서울대입구(관악구청)', '낙성대', '사당']
coordinate.query('역사명 in @station_list')

Unnamed: 0,역사_ID,역사명,호선,위도,경도
727,231,신대방,2호선,37.487462,126.913149
728,230,신림,2호선,37.484201,126.929715
729,229,봉천,2호선,37.482362,126.941892
730,228,서울대입구(관악구청),2호선,37.481247,126.952739
731,227,낙성대,2호선,37.47693,126.963693
732,226,사당,2호선,37.476538,126.981544


In [46]:
from haversine import haversine

haversine((37.487462,126.913149), (37.484201,126.929715), unit = 'm')

1505.9854014218404

In [47]:
def distance(station_name, lat, long):
    station_lat = coordinate.query(f'역사명 == "{station_name}"')['위도'].values[0]
    station_long = coordinate.query(f'역사명 == "{station_name}"')['경도'].values[0]

    distance = haversine((station_lat, station_long), (lat, long), unit = 'm')

    return distance

In [48]:
for s in station_list:
    data_filtered[s] = data_filtered.apply(lambda x: distance(s, x['위도'], x['경도']), axis=1)

In [49]:
data_filtered.head()

Unnamed: 0,구분,월세,보증금,전용면적(m2),방향,위도,경도,물건층,전체층,연식,신대방,신림,봉천,서울대입구(관악구청),낙성대,사당
0,원룸,40,300.0,16.53,남서향,37.48471,126.911921,저,4,4,324.624838,1571.079624,2657.391032,3622.18783,4649.532183,6210.362767
3,원룸,32,500.0,19.8,남서향,37.47411,126.9676,저,3,25,5028.894601,3526.317039,2447.088993,1532.817356,466.044229,1259.773943
11,원룸,30,500.0,19.8,남동향,37.469927,126.965559,저,4,10,5018.988995,3538.912171,2504.738374,1692.413227,795.920254,1590.701783
12,원룸,45,3000.0,23.14,서향,37.486506,126.952803,3,4,2,3500.380635,2053.21346,1067.32872,584.802193,1434.284304,2767.70554
13,원룸,50,1000.0,23.1,동향,37.480189,126.945238,6,8,25,2944.639577,1440.537335,381.516674,672.260292,1668.344585,3229.353565


In [50]:
data_filtered['역까지최소거리'] = data_filtered.apply(lambda x: min([x['신대방'], x['신림'], x['봉천'], x['서울대입구(관악구청)'], x['낙성대'], x['사당']]), axis=1)
data_filtered.head()

Unnamed: 0,구분,월세,보증금,전용면적(m2),방향,위도,경도,물건층,전체층,연식,신대방,신림,봉천,서울대입구(관악구청),낙성대,사당,역까지최소거리
0,원룸,40,300.0,16.53,남서향,37.48471,126.911921,저,4,4,324.624838,1571.079624,2657.391032,3622.18783,4649.532183,6210.362767,324.624838
3,원룸,32,500.0,19.8,남서향,37.47411,126.9676,저,3,25,5028.894601,3526.317039,2447.088993,1532.817356,466.044229,1259.773943,466.044229
11,원룸,30,500.0,19.8,남동향,37.469927,126.965559,저,4,10,5018.988995,3538.912171,2504.738374,1692.413227,795.920254,1590.701783,795.920254
12,원룸,45,3000.0,23.14,서향,37.486506,126.952803,3,4,2,3500.380635,2053.21346,1067.32872,584.802193,1434.284304,2767.70554,584.802193
13,원룸,50,1000.0,23.1,동향,37.480189,126.945238,6,8,25,2944.639577,1440.537335,381.516674,672.260292,1668.344585,3229.353565,381.516674


In [51]:
data_filtered.drop(station_list, axis=1, inplace=True)
data_filtered.head()

Unnamed: 0,구분,월세,보증금,전용면적(m2),방향,위도,경도,물건층,전체층,연식,역까지최소거리
0,원룸,40,300.0,16.53,남서향,37.48471,126.911921,저,4,4,324.624838
3,원룸,32,500.0,19.8,남서향,37.47411,126.9676,저,3,25,466.044229
11,원룸,30,500.0,19.8,남동향,37.469927,126.965559,저,4,10,795.920254
12,원룸,45,3000.0,23.14,서향,37.486506,126.952803,3,4,2,584.802193
13,원룸,50,1000.0,23.1,동향,37.480189,126.945238,6,8,25,381.516674


## **분석**

- 각 항목 분포 확인하기

In [52]:
for x in ['월세','보증금','전용면적(m2)','연식','역까지최소거리']:
    fig = px.box(data_frame = data_filtered, x=x, width=700, height=400)
    fig.show()

- 내가 원하는 조건은?
    - 보증금 3000만원 이하 ✅
    - 월세는 저렴할수록 좋음
    - 지하, 반지하, 꼭대기층은 선호하지 않음 ✅
    - 전용면적이 클수록 좋음
    - 북향은 선호하지 않음 ✅
    - 연식이 오래되지 않을수록 좋음
    - 지하철 역에서 가까울수록 좋음



- 월세, 전용면적, 연식, 지하철역까지의 거리 점수 매기기

In [53]:
data_filtered

Unnamed: 0,구분,월세,보증금,전용면적(m2),방향,위도,경도,물건층,전체층,연식,역까지최소거리
0,원룸,40,300.0,16.53,남서향,37.484710,126.911921,저,4,4,324.624838
3,원룸,32,500.0,19.8,남서향,37.474110,126.967600,저,3,25,466.044229
11,원룸,30,500.0,19.8,남동향,37.469927,126.965559,저,4,10,795.920254
12,원룸,45,3000.0,23.14,서향,37.486506,126.952803,3,4,2,584.802193
13,원룸,50,1000.0,23.1,동향,37.480189,126.945238,6,8,25,381.516674
...,...,...,...,...,...,...,...,...,...,...,...
1773,원룸,37,300.0,33.06,남동향,37.479770,126.928582,저,3,15,502.745864
1775,원룸,35,300.0,19.8,남동향,37.477821,126.935265,3,4,25,772.604019
1776,원룸,40,300.0,23.14,남동향,37.486339,126.915350,고,4,25,230.882089
1777,원룸,35,500.0,23.14,남동향,37.484921,126.919564,3,4,2,632.619512


In [55]:
# Convert columns to numeric type if they contain string representations of numbers
data_filtered['전용면적(m2)'] = data_filtered['전용면적(m2)'].astype(float)
data_filtered['역까지최소거리'] = data_filtered['역까지최소거리'].astype(float)

In [58]:
data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 700 entries, 0 to 1779
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   구분        700 non-null    object  
 1   월세        700 non-null    int64   
 2   보증금       700 non-null    float64 
 3   전용면적(m2)  700 non-null    float64 
 4   방향        700 non-null    object  
 5   위도        700 non-null    float64 
 6   경도        700 non-null    float64 
 7   물건층       700 non-null    object  
 8   전체층       700 non-null    object  
 9   연식        700 non-null    int64   
 10  역까지최소거리   700 non-null    float64 
 11  월세_등급     700 non-null    category
dtypes: category(1), float64(5), int64(2), object(4)
memory usage: 66.5+ KB


In [59]:
data_filtered['월세_등급'] = pd.qcut(data_filtered['월세'], 5, labels=[1,2,3,4,5])
data_filtered['전용면적_등급'] = pd.qcut(data_filtered['전용면적(m2)'], 5, labels=[5,4,3,2,1])
data_filtered['연식_등급'] = pd.qcut(data_filtered['연식'].rank(method='first'), 5, labels=[1,2,3,4,5])
data_filtered['역까지최소거리_등급'] = pd.qcut(data_filtered['역까지최소거리'], 5, labels=[1,2,3,4,5])

In [60]:
data_filtered

Unnamed: 0,구분,월세,보증금,전용면적(m2),방향,위도,경도,물건층,전체층,연식,역까지최소거리,월세_등급,전용면적_등급,연식_등급,역까지최소거리_등급
0,원룸,40,300.0,16.53,남서향,37.484710,126.911921,저,4,4,324.624838,2,5,1,2
3,원룸,32,500.0,19.80,남서향,37.474110,126.967600,저,3,25,466.044229,1,4,3,4
11,원룸,30,500.0,19.80,남동향,37.469927,126.965559,저,4,10,795.920254,1,4,1,5
12,원룸,45,3000.0,23.14,서향,37.486506,126.952803,3,4,2,584.802193,3,2,1,4
13,원룸,50,1000.0,23.10,동향,37.480189,126.945238,6,8,25,381.516674,3,2,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1773,원룸,37,300.0,33.06,남동향,37.479770,126.928582,저,3,15,502.745864,1,1,3,4
1775,원룸,35,300.0,19.80,남동향,37.477821,126.935265,3,4,25,772.604019,1,4,5,5
1776,원룸,40,300.0,23.14,남동향,37.486339,126.915350,고,4,25,230.882089,2,2,5,1
1777,원룸,35,500.0,23.14,남동향,37.484921,126.919564,3,4,2,632.619512,1,2,1,5


In [61]:
data_filtered_final = data_filtered.query('월세_등급 <= 3 and 전용면적_등급 <= 1 and 연식_등급 <= 2 and 역까지최소거리_등급 <= 2')
data_filtered_final

Unnamed: 0,구분,월세,보증금,전용면적(m2),방향,위도,경도,물건층,전체층,연식,역까지최소거리,월세_등급,전용면적_등급,연식_등급,역까지최소거리_등급
0,원룸,40,300.0,16.53,남서향,37.484710,126.911921,저,4,4,324.624838,2,5,1,2
19,원룸,50,500.0,19.80,남향,37.485085,126.941380,고,3,4,306.135966,3,4,1,2
31,원룸,37,300.0,18.18,남동향,37.483274,126.945350,저,4,10,321.535298,1,4,1,2
39,원룸,50,1000.0,19.83,남향,37.478004,126.964249,2,5,2,129.109247,3,3,1,1
87,원룸,44,2000.0,19.83,동향,37.481315,126.942018,2,4,10,116.950922,2,3,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1659,원룸,45,500.0,19.84,남동향,37.486125,126.928411,2,10,10,242.916318,3,3,2,1
1681,원룸,42,500.0,26.46,남동향,37.485385,126.928474,고,10,10,171.240151,2,1,2,1
1690,원룸,40,500.0,21.49,남동향,37.485212,126.928696,중,11,10,143.951168,2,2,2,1
1733,원룸,40,500.0,19.80,동향,37.479568,126.962835,저,3,10,302.946145,2,4,2,2


- 최종 매물 리스트 시각화

In [63]:
f = folium.Figure(width=700, height=500)
m = folium.Map(location=[37.486313, 126.935378], zoom_start=14).add_to(f)

for idx in data_filtered_final.index:
    lat = data_filtered_final.loc[idx, '위도']
    long = data_filtered_final.loc[idx, '경도']
    num = data_filtered_final.loc[idx, '구분']

    folium.Marker([lat, long]
                  , popup=f"<a href=https://m.land.naver.com/article/info/{num}>링크</a>"
                  ).add_to(m)
m