## 에어비엔비 가격?
- 평가: R-Squared, MAE, MSE, RMSE, RMSLE, MAPE
- target : price(가격)
- csv파일 생성 : 수험번호.csv (예시 아래 참조)
~~~
id,price
34323697,238
29927138,183
120362,234
~~~



## 데이터 불러오기

In [118]:
import pandas as pd
train = pd.read_csv('ab_nyc/train.csv')
test = pd.read_csv('ab_nyc/test.csv')

##EDA

In [119]:
print(train.shape, test.shape)

(39116, 16) (9779, 15)


In [120]:
display(train.head(2)) # 타겟은 price
test.head(2)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,35742633,Luxury new 1 bed apartment in East Williamsburg,265866685,Mindy,Brooklyn,Bushwick,40.69796,-73.92915,Entire home/apt,30,0,,,2,179,198
1,15840089,Kid- (and Adult-) Friendly Uptown 2-Bedroom,99602138,Yolanda,Manhattan,Washington Heights,40.8349,-73.94829,Entire home/apt,3,36,2019-01-01,1.13,1,5,150


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,34323697,Sonder | The Biltmore | Bright 1BR + Sofa Bed,219517861,Sonder (NYC),Manhattan,Theater District,40.75965,-73.98652,Entire home/apt,29,0,,,327,338
1,29927138,A great space in NYC,158461160,Sophia,Brooklyn,Bedford-Stuyvesant,40.68062,-73.94418,Entire home/apt,30,1,2019-05-31,0.75,6,284


In [121]:
train.describe()

Unnamed: 0,id,host_id,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price
count,39116.0,39116.0,39116.0,39116.0,39116.0,39116.0,31063.0,39116.0,39116.0,39116.0
mean,18966690.0,67370700.0,40.729054,-73.952084,7.022395,23.268509,1.372676,7.206182,112.718478,153.057547
std,10967430.0,78424250.0,0.054451,0.046106,19.547706,44.62541,1.682309,33.100783,131.479798,248.758522
min,2539.0,2438.0,40.49979,-74.24442,1.0,0.0,0.01,1.0,0.0,0.0
25%,9435824.0,7829861.0,40.69025,-73.98306,1.0,1.0,0.19,1.0,0.0,69.0
50%,19619060.0,30509660.0,40.72306,-73.95568,3.0,5.0,0.71,1.0,45.0,105.0
75%,29073270.0,107434400.0,40.76313,-73.936017,5.0,23.0,2.01,2.0,225.0,175.0
max,36487240.0,274321300.0,40.91306,-73.71299,999.0,607.0,58.5,327.0,365.0,10000.0


In [122]:
train.describe(include='O')

Unnamed: 0,name,host_name,neighbourhood_group,neighbourhood,room_type,last_review
count,39102,39100,39116,39116,39116,31063
unique,38447,9943,5,221,3,1720
top,Hillside Hotel,David,Manhattan,Williamsburg,Entire home/apt,2019-06-23
freq,14,337,17336,3136,20349,1123


In [123]:
target = train.pop("price")

In [124]:
print(train.shape, test.shape)

(39116, 15) (9779, 15)


## 데이터 전처리 & 피처엔지니어링

In [125]:
cols = ['name', 'host_name', 'last_review', 'host_id']
print(train.shape)
train = train.drop(cols, axis=1)
test = test.drop(cols, axis=1)
print(train.shape)

(39116, 15)
(39116, 11)


In [126]:
train['reviews_per_month'] = train['reviews_per_month'].fillna(0)
test['reviews_per_month'] = test['reviews_per_month'].fillna(0)
train.isnull().sum()

id                                0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [127]:
cols =['neighbourhood_group', 'neighbourhood', 'room_type']
from sklearn.preprocessing import LabelEncoder

for col in cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])

train[cols]


Unnamed: 0,neighbourhood_group,neighbourhood,room_type
0,1,28,0
1,2,206,0
2,1,28,0
3,1,214,0
4,1,13,0
...,...,...,...
39111,1,214,1
39112,2,94,1
39113,1,19,0
39114,1,41,0


## 검증 데이터 분리

In [128]:
from sklearn.model_selection import train_test_split
X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=42)

In [129]:
print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape)

(31292, 11) (7824, 11) (31292,) (7824,)


## 모델 & 평가

In [131]:
# 평가
import numpy as np
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
def rmse(y_test, y_pred): #RMSE
    return np.sqrt(mean_squared_error(y_test, y_pred))

def rmsle(y_test, y_pred): #RMSLE
    return np.sqrt(np.mean(np.power(np.log1p(y_test) - np.log1p(y_pred), 2)))

def mape(y_test, y_pred): #MAPE
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

In [139]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_tr, y_tr)
pred = rf.predict(X_val)
print("r2: ,", r2_score(y_val, pred))
print("mae: ,", mean_absolute_error(y_val, pred))
print("mse: ,", mean_squared_error(y_val, pred))
print("rmse: ,", rmse(y_val, pred))

r2: , 0.09703562377377373
mae: , 67.84189544989775
mse: , 46533.83580379602
rmse: , 215.71702715315735


In [138]:
#랜덤포레스트
from xgboost import XGBRegressor

model = XGBRegressor(random_state=42)
model.fit(X_tr,y_tr)
pred=model.predict(X_val)
print("r2: ",r2_score(y_val, pred))
print("mae: ",mean_absolute_error(y_val, pred))
print("mse: ",mean_squared_error(y_val, pred))
print("rmse: ",rmse(y_val, pred))
print("rmsle: ",rmsle(y_val, pred))
print("mape: ",mape(y_val, pred))

r2:  0.14264227657792217
mae:  71.26478516941049
mse:  44183.51883778398
rmse:  210.1987603145746
rmsle:  0.5478067646587614
mape:  52.3556030052965


  return np.sqrt(np.mean(np.power(np.log1p(y_test) - np.log1p(y_pred), 2)))


In [137]:
from lightgbm import LGBMRegressor

model = LGBMRegressor(random_state=42, verbose=-1)
model.fit(X_tr,y_tr)
pred=model.predict(X_val)
print("r2: ",r2_score(y_val, pred))
print("mae: ",mean_absolute_error(y_val, pred))
print("mse: ",mean_squared_error(y_val, pred))
print("rmse: ",rmse(y_val, pred))
print("rmsle: ",rmsle(y_val, pred))
print("mape: ",mape(y_val, pred))

r2:  0.24113512494658418
mae:  66.19307774173389
mse:  39107.73716299594
rmse:  197.75676262266214
rmsle:  0.49253615710769194
mape:  46.078352316219544


  return np.sqrt(np.mean(np.power(np.log1p(y_test) - np.log1p(y_pred), 2)))


In [140]:
pred=model.predict(test)

array([352.93826,  94.99342, 174.93753, ..., 139.05786, 163.84447,
       180.3265 ], dtype=float32)

## 예측 및 csv 제출

In [143]:
submit = pd.DataFrame({
    'pred': pred
})

In [145]:
submit.to_csv('연습문제2번제출.csv', index=False)

##정리

In [147]:
y_test = pd.read_csv("ab_nyc/y_test.csv")
pred = pd.read_csv('연습문제2번제출.csv')
print(r2_score(y_test, pred))

-0.013293263165693014
