# SKN 19기 mini-project 5팀(팀명: 여권어디있지) M/L

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path

# Silence every library warning so cell outputs stay readable.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')


# pandas display option: render floats with two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

# Korean font setup so Hangul text in plot titles/labels renders correctly.
import matplotlib.font_manager as fm
import matplotlib

# Candidate font files, tried in order. The first entry is the original
# Windows path, so behaviour on that machine is unchanged. The other two are
# typical macOS / Linux locations (assumed - adjust for your environment).
_KOREAN_FONT_CANDIDATES = (
    'C:\\Windows\\Fonts\\gulim.ttc',
    '/System/Library/Fonts/Supplemental/AppleGothic.ttf',
    '/usr/share/fonts/truetype/nanum/NanumGothic.ttf',
)

# Loading a font file that does not exist raises, which used to abort the
# whole setup cell on non-Windows machines; check for the file first.
font_path = next((p for p in _KOREAN_FONT_CANDIDATES if Path(p).is_file()), None)
if font_path is None:
    font = None
    # print() rather than warnings.warn(): warnings are filtered out above.
    print('Korean font not found; keeping the default matplotlib font.')
else:
    font = fm.FontProperties(fname=font_path).get_name()
    matplotlib.rc('font', family=font)

## 데이터 로드

In [None]:
# Load data
# 6. Data transformation and feature engineering
# 7. Data splitting

In [2]:
# Read the listings CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df_listings = pd.read_csv(filepath_or_buffer='./data/listings.csv')

df_listings.head()

Unnamed: 0,neighbourhood_cleansed,property_type,room_type,accommodates,bathrooms,bedrooms,beds,amenities,price,number_of_reviews,first_review,review_scores_rating
0,Bunkyo Ku,Entire rental unit,Entire home/apt,3,1.0,0.0,2.0,"[""Bidet"", ""Body soap"", ""Bed linens"", ""Hot wate...",100000.0,0,,
1,Bunkyo Ku,Entire rental unit,Entire home/apt,8,1.0,2.0,5.0,"[""Bidet"", ""Body soap"", ""Bed linens"", ""Hot wate...",100000.0,0,,
2,Taito Ku,Entire serviced apartment,Entire home/apt,4,2.0,2.0,2.0,"[""Room-darkening shades"", ""Body soap"", ""Carbon...",14550.0,24,2023-12-04,4.42
3,Kita Ku,Entire home,Entire home/apt,8,1.0,3.0,6.0,"[""Room-darkening shades"", ""Cooking basics"", ""C...",22012.0,15,2024-01-10,4.73
4,Sumida Ku,Entire rental unit,Entire home/apt,3,1.0,1.0,1.0,"[""Clothing storage: closet"", ""Elevator"", ""Hair...",15429.0,17,2023-12-26,5.0


In [90]:
# Label-encode the neighbourhood_cleansed column.

from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping first, then apply it. The encoded values
# overwrite the original column in place.
l_encod = LabelEncoder()
l_encod.fit(df_listings['neighbourhood_cleansed'])

df_listings['neighbourhood_cleansed'] = l_encod.transform(df_listings['neighbourhood_cleansed'])

# Print column dtypes and non-null counts after encoding.
df_listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23007 entries, 0 to 23006
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   neighbourhood_cleansed  23007 non-null  int64  
 1   property_type           23007 non-null  object 
 2   room_type               23007 non-null  object 
 3   accommodates            23007 non-null  int64  
 4   bathrooms               23007 non-null  float64
 5   bedrooms                23007 non-null  float64
 6   beds                    23007 non-null  float64
 7   amenities               23007 non-null  object 
 8   price                   23007 non-null  float64
 9   number_of_reviews       23007 non-null  int64  
 10  first_review            19875 non-null  object 
 11  review_scores_rating    19875 non-null  float64
dtypes: float64(5), int64(3), object(4)
memory usage: 2.1+ MB


`first_review`, `review_scores_rating` 칼럼의 결측치
- `first_review`가 결측인 행과 `review_scores_rating`이 결측인 행이 완벽히 일치함
- `first_review`의 NaN은 '아직 첫 리뷰가 없다', 즉 '리뷰가 없다'는 뜻이므로 `review_scores_rating` = 0 으로 채우는 해석도 가능함. 다만 `review_scores_rating`만 놓고 보면 NaN(평점 없음)을 0점(최저 평점)으로 두는 것은 의미상 애매함.
- 대안으로, 결측치를 별도로 처리해 주는 Boosting 모델을 사용하는 방법이 있음

In [None]:
# first_review and review_scores_rating each have 19875 non-null rows.
# Select review_scores_rating only for the rows where first_review is missing,
# then test whether the rating is missing there as well.
df_listings.loc[df_listings['first_review'].isna(), 'review_scores_rating'].isna()
# Length = 3132, i.e. 19875 + 3132 = 23007 (the total row count).
# NOTE(review): the truncated display cannot show every value; appending
# .all() would confirm the overlap with a single boolean.

0        True
1        True
17       True
26       True
51       True
         ... 
23002    True
23003    True
23004    True
23005    True
23006    True
Name: review_scores_rating, Length: 3132, dtype: bool

내일 할 일
1. Target Encoding 적용해보기
2. property_type이 각각 어느 room_type에 속해 있는지 조회해보기
3. review_scores_rating 결측치(3,132개)는 리뷰가 없는 숙소(number_of_reviews == 0)에 해당하는 것으로 보이므로(확인 필요), 결측치를 어떻게 처리할지 number_of_reviews와 묶어서 고민
4. first_review를 이용해 숙소 운영 일수(days) 칼럼을 새로 생성 (운영 기간은 긴데 리뷰 개수가 적다면 아마 좋은 숙소가 아니고, 숙소 가격도 낮을 것으로 추측)
5. 정규화가 필요없는 앙상블 모델로 학습, 평가, 교차 검증 수행

In [None]:
# Modelling data: neighbourhood, max guests (accommodates), number of bathrooms,
# number of bedrooms, number of beds - plus price, which later cells split off
# as the target.
# NOTE(review): despite the name, df_test is the full modelling frame, not a test split.
df_test = df_listings.loc[:, ['neighbourhood_cleansed', 'accommodates', 'bathrooms', 'bedrooms', 'beds', 'price']]

In [91]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of the modelling frame.
df_test.describe()

Unnamed: 0,neighbourhood_cleansed,accommodates,bathrooms,bedrooms,beds,price
count,23007.0,23007.0,23007.0,23007.0,23007.0,23007.0
mean,36.33,4.44,1.15,1.39,2.9,17758.09
std,12.62,2.89,0.49,0.94,2.19,12886.22
min,0.0,1.0,0.0,0.0,0.0,1700.0
25%,33.0,2.0,1.0,1.0,1.0,9429.0
50%,42.0,4.0,1.0,1.0,2.0,13912.0
75%,44.0,6.0,1.0,2.0,4.0,21572.0
max,48.0,16.0,10.0,10.0,25.0,100286.0


In [None]:
# 1. SVR model training
#     - split the data into train and test sets
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

# Target is the price column; every remaining column is a feature.
y = df_test['price']
X = df_test.drop(columns='price')

# No test_size is passed, so scikit-learn's default split ratio applies.
# random_state=0 keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

(0.3644884255883608, 0.4057858515469851)

In [None]:
# 1. SVR model training
#     - kernel fixed to 'rbf'; compare scores as C changes
# NOTE(review): X is passed to SVR without any feature scaling in this cell -
# TODO confirm whether that is intentional.
C = [100, 500, 1000, 2000, 5000, 10000]
for c in C:
    # fit() returns the estimator itself, so construction and fitting can be chained.
    svr = SVR(C=c, kernel='rbf').fit(X_train, y_train)
    # Prints the train score followed by the test score for this C.
    print(f'c={c}일 때 {svr.score(X_train, y_train)}, {svr.score(X_test, y_test)}')

c=100일 때 0.2726759831817944, 0.3106691237637621
c=500일 때 0.3349708204738413, 0.37642067271530955
c=1000일 때 0.34560704632642847, 0.386970295781344
c=2000일 때 0.3526052756153806, 0.39374469599906414
c=5000일 때 0.3592384793514437, 0.40033923708473496
c=10000일 때 0.3644884255883608, 0.4057858515469851


In [None]:
# 1. SVR model training
#     - 3-fold cross-validation; the saved output shows fold scores of about 0.33-0.39
from sklearn.model_selection import cross_validate

# kernel is left at the SVR default here; only C is set.
model = SVR(C=5000)
scores = cross_validate(estimator=model, X=X, y=y, cv=3)
print("교차검증 결과")
# Display the dict of fit times, score times and per-fold test scores.
scores

교차검증 결과


{'fit_time': array([8.90196466, 9.29569077, 7.36409187]),
 'score_time': array([8.67032242, 7.80012059, 6.69727826]),
 'test_score': array([0.33025018, 0.38199083, 0.39114454])}

In [None]:
# 2. Decision tree model training
#     - only max_depth is varied between runs; 8 looked good
from sklearn.tree import DecisionTreeRegressor

# random_state is pinned because otherwise the split rules the tree learns can
# differ between runs, which changes the test score as well.
dt_reg = DecisionTreeRegressor(max_depth=7, random_state=0).fit(X_train, y_train)

# Prints the train score followed by the test score.
print(*(dt_reg.score(features, target)
        for features, target in ((X_train, y_train), (X_test, y_test))))

# max_depth = 7일 때 0.4832279846116381 0.48772168188054876
# max_depth = 8일 때 0.5220768679288156 0.4999746838148147
# max_depth = 9일 때 0.5624031778984009 0.5013450545432857

0.4509605627700506 0.46419652597562167


In [76]:
# 2. Decision tree model training
#     - 10-fold cross-validation at max_depth=8
#     - fold scores in the saved output (recorded before the seed was pinned)
#       range from about 0.35 to 0.54
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor  # imported here so the cell does not rely on another cell

# random_state=0 matches the single-split decision tree cell, so the
# cross-validation scores are reproducible from run to run.
model = DecisionTreeRegressor(max_depth=8, random_state=0)
scores = cross_validate(model, X, y, cv=10)
print("교차검증 결과")
# Display the dict of fit times, score times and per-fold test scores.
scores

교차검증 결과


{'fit_time': array([0.01798034, 0.02629256, 0.01608849, 0.01993823, 0.02257538,
        0.02000308, 0.01655722, 0.02294946, 0.01927042, 0.0212822 ]),
 'score_time': array([0.        , 0.00308061, 0.        , 0.00835633, 0.        ,
        0.01178837, 0.        , 0.        , 0.        , 0.        ]),
 'test_score': array([0.3776484 , 0.46045821, 0.35061282, 0.48870587, 0.421269  ,
        0.43728563, 0.45249218, 0.54414064, 0.45000246, 0.44455194])}

In [81]:
# 3. Random forest model evaluation
from sklearn.ensemble import RandomForestRegressor

# 50 trees; the seed is fixed so the forest is reproducible.
rf_reg = RandomForestRegressor(random_state=0, n_estimators=50)
rf_reg.fit(X_train, y_train)

# Cell output: (train score, test score).
tuple(rf_reg.score(features, target)
      for features, target in ((X_train, y_train), (X_test, y_test)))

(0.6999185583812993, 0.5164239735703761)