In [75]:
import numpy as np
# Print arrays in full instead of summarising long ones with "...".
# NumPy rejects NaN as a threshold (ValueError); sys.maxsize is the
# documented value for "never summarise".
import sys
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd
from pandas import DataFrame
from catboost import Pool, CatBoostClassifier, CatBoost

### Get the features

In [82]:
# This trains with only reviewed apartments, which achieved the highest accuracy
REQUIRED_COLUMNS = ['zipcode', 'groupedPrice', 'availability']
FLAG_COLUMNS = ['seaViews', 'swimmingPool', 'balcony', 'host_is_superhost']
# Text substitutions applied, in this order, to turn groupedPrice into a class id.
PRICE_GROUP_CODES = (('lowCost', '0'), ('medium', '1'), ('premium', '2'))


def load_reviewed_split(csv_path):
    """Read one CSV split and separate it into features and numeric labels.

    Rows missing any of REQUIRED_COLUMNS are dropped, every column in
    FLAG_COLUMNS is cast to int, and groupedPrice is recoded through
    PRICE_GROUP_CODES.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.

    Returns
    -------
    tuple
        (data_df, features_df, labels): the cleaned frame, the same frame
        without the groupedPrice column, and the recoded groupedPrice column.
        Labels that are still non-numeric after the substitutions become NaN
        (errors='coerce').
    """
    data_df = pd.read_csv(csv_path).dropna(subset=REQUIRED_COLUMNS)
    for flag in FLAG_COLUMNS:
        # Cast directly: taking `.real` first is a no-op on real-valued columns.
        data_df[flag] = data_df[flag].astype(int)
    features_df = data_df.drop(['groupedPrice'], axis=1)

    labels = data_df['groupedPrice'].astype(str)
    for group_name, code in PRICE_GROUP_CODES:
        labels = labels.str.replace(group_name, code)
    return data_df, features_df, pd.to_numeric(labels, errors='coerce')


# Disabled experiment: additionally dropping ['beds', 'host_is_superhost',
# 'swimmingPool', 'balcony', 'seaViews', 'review_scores_rating', 'descLength']
# from both splits before training.
train_data_df, train_features_df, train_labels_df = load_reviewed_split('../data/reviewed_train.csv')
test_data_df, test_features_df, test_labels_df = load_reviewed_split('../data/reviewed_test.csv')

# Positions into column_names of the features CatBoost should treat as categorical.
# NOTE(review): with the column order printed by this cell, index 8 is
# 'security_deposit' while 'host_is_superhost' (5) and 'seaViews' (9) stay
# numeric -- confirm this selection is intended.
cat_features = [0, 1, 6, 7, 8]
column_names = list(train_features_df)
print(column_names)

# Keyword arguments are required here: the 4th positional parameter of Pool is
# not feature_names, so passing column_names positionally binds it to the
# wrong option.
p_train = Pool(data=train_features_df,
               label=train_labels_df,
               cat_features=cat_features,
               feature_names=column_names)
p_test = Pool(data=test_features_df,
              label=test_labels_df,
              cat_features=cat_features,
              feature_names=column_names)

['zipcode', 'property_type', 'bathrooms', 'bedrooms', 'beds', 'host_is_superhost', 'swimmingPool', 'balcony', 'security_deposit', 'seaViews', 'availability', 'review_scores_rating', 'descLength']


### Train the model, test it and print accuracy

In [108]:
# Small multiclass model: 4 iterations, depth 3, learning rate 0.8,
# with per-iteration progress printed.
model = CatBoostClassifier(
    loss_function='MultiClass',
    iterations=4,
    depth=3,
    learning_rate=0.8,
    verbose=True,
)
# fit() stays the last expression of the cell so its return value is echoed.
model.fit(p_train)

0:	learn: -0.9781344	total: 32.4ms	remaining: 97.2ms
1:	learn: -0.9433369	total: 70.4ms	remaining: 70.4ms
2:	learn: -0.9245916	total: 93.6ms	remaining: 31.2ms
3:	learn: -0.9080777	total: 118ms	remaining: 0us


<catboost.core._CatBoostBase at 0x107e8fc90>

In [109]:
# Score the test pool three ways: raw formula values, per-class
# probabilities, and the predicted class itself.
preds_raw = model.predict(p_test, prediction_type='RawFormulaVal')
preds_proba = model.predict_proba(p_test)
preds_class = model.predict(p_test)

In [110]:
# Line the predicted classes up against the true test labels.
# ravel() accepts both an (n, 1) column of predictions and a flat (n,) array,
# whereas a nested comprehension fails on the flat form.
pred_flat_list = np.asarray(preds_class).ravel().tolist()
compare = {'label': test_labels_df, 'pred': pred_flat_list}
compare_df = pd.DataFrame(data=compare)
# 1 where the prediction matches the label, 0 otherwise.
right_guesses = (compare_df['label'] == compare_df['pred']).astype(int)
compare_df['guess'] = right_guesses

# The mean of the 0/1 'guess' column is the hit rate. Computing it in floating
# point avoids the integer division that truncated the percentage on Python 2.
# (The variable keeps its original spelling in case other cells read it.)
accurracy = 100.0 * compare_df['guess'].mean()
print('%.2f%%' % accurracy)

# print compare_df['guess'].sum()
# print model.score(train_features_df, train_labels_df)

51%


In [86]:
# how is misclassification distributed?
def _count_predictions(preds):
    """Tally one true class's predictions as [pred == 0, pred == 1, anything else]."""
    as_zero = int((preds == 0.0).sum())
    as_one = int((preds == 1.0).sum())
    # Everything that is neither 0 nor 1 lands in the last bucket.
    return [as_zero, as_one, len(preds) - as_zero - as_one]


# One row per true label (0, 1, 2); rows with any other label are not counted.
arr0, arr1, arr2 = [
    _count_predictions(compare_df.loc[compare_df['label'] == true_class, 'pred'])
    for true_class in (0.0, 1.0, 2.0)
]
print(arr0)
print(arr1)
print(arr2)


[278, 100, 61]
[205, 120, 132]
[36, 50, 183]


In [49]:
# Preview the first ten cleaned test rows (parenthesised print runs on Python 2 and 3).
print(test_data_df.iloc[0:10])

   zipcode property_type  bathrooms  bedrooms  beds  host_is_superhost  \
0     7610         House        2.0       4.0   7.0                  0   
1     7600         House        2.0       4.0   8.0                  0   
2     7002     Apartment        1.0       1.0   1.0                  0   
3     7011     Apartment        2.0       3.0   6.0                  0   
4     7600     Apartment        1.0       1.0   2.0                  1   
5     7001     Apartment        1.0       1.0   2.0                  1   
6     7003          Loft        2.0       1.0   1.0                  1   
7     7015     Apartment        1.0       2.0   3.0                  0   
8     7012     Apartment        1.5       1.0   3.0                  0   
9     7003     Apartment        2.0       3.0   6.0                  0   

   swimmingPool  balcony  security_deposit  seaViews  availability  \
0             0        0               0.0         0          84.0   
1             0        0             500.0   