In [1]:
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('/home/matheus/Documentos/house-recommendation/data/train.csv', delimiter = ';')

In [3]:
df.head()

Unnamed: 0,Id,area,quartos,garagem,banheiros,bairro,preco,y
0,1718,600,4,3,4,Morada Da Colina,700000,0
1,528,61,2,1,1,Vida Nova,170000,0
2,1649,58,2,2,2,Pampulha,226000,0
3,1719,50,2,1,2,Novo Mundo,185000,1
4,42,78,2,2,3,Tabajaras,330000,0


In [4]:
cons = pd.DataFrame({'column': df.columns,
                             'missing_perc': (df.isna().sum() / df.shape[0]) * 100,
                             'dtype': df.dtypes})

numeric_features = list(cons[(cons['dtype'] == 'int64') | (cons['dtype'] == 'float') | (cons['dtype'] == 'bool')]['column'])
numeric_features.remove('y')
numeric_features.remove('Id')
categoric_features = list(cons[(cons['dtype'] == 'object')]['column'])

In [5]:
numeric_features

['area', 'quartos', 'garagem', 'banheiros', 'preco']

In [6]:
categoric_features

['bairro']

In [7]:
std_scaler = StandardScaler()

In [8]:
y = df['y']
scaler = std_scaler
catb = ce.CatBoostEncoder(cols = categoric_features)
df[numeric_features] = scaler.fit_transform(df[numeric_features])
df[categoric_features] = catb.fit_transform(df[categoric_features], y=y)

In [9]:
numeric_features

['area', 'quartos', 'garagem', 'banheiros', 'preco']

In [10]:
model = CatBoostClassifier()
X = df.drop(['Id','y'], axis = 1)
y = df['y'].values
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)

In [11]:
print('Mean ROC AUC: %.3f' % scores.mean())

Mean ROC AUC: 0.958


In [12]:
steps = [('over', SMOTE()), ('model', CatBoostClassifier())]
pipeline = Pipeline(steps=steps)

In [13]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % scores.mean())

Mean ROC AUC: 0.958


In [14]:
pipeline.fit(X, y)

Learning rate set to 0.013821
0:	learn: 0.6777888	total: 50.2ms	remaining: 50.1s
1:	learn: 0.6607885	total: 54.9ms	remaining: 27.4s
2:	learn: 0.6466970	total: 60.8ms	remaining: 20.2s
3:	learn: 0.6310669	total: 63ms	remaining: 15.7s
4:	learn: 0.6152246	total: 65.3ms	remaining: 13s
5:	learn: 0.5996438	total: 67.6ms	remaining: 11.2s
6:	learn: 0.5850754	total: 69.9ms	remaining: 9.92s
7:	learn: 0.5733910	total: 72.3ms	remaining: 8.96s
8:	learn: 0.5583969	total: 75.8ms	remaining: 8.35s
9:	learn: 0.5457893	total: 78ms	remaining: 7.72s
10:	learn: 0.5355348	total: 81.5ms	remaining: 7.33s
11:	learn: 0.5249218	total: 85.5ms	remaining: 7.04s
12:	learn: 0.5126989	total: 90.4ms	remaining: 6.87s
13:	learn: 0.5007962	total: 94.8ms	remaining: 6.68s
14:	learn: 0.4914266	total: 97.4ms	remaining: 6.39s
15:	learn: 0.4805032	total: 99.8ms	remaining: 6.14s
16:	learn: 0.4707844	total: 102ms	remaining: 5.9s
17:	learn: 0.4629851	total: 105ms	remaining: 5.72s
18:	learn: 0.4554173	total: 109ms	remaining: 5.62s
19

Pipeline(memory=None,
         steps=[('over',
                 SMOTE(k_neighbors=5, n_jobs=None, random_state=None,
                       sampling_strategy='auto')),
                ('model',
                 <catboost.core.CatBoostClassifier object at 0x7f6a68bcf5b0>)],
         verbose=False)

In [15]:
modelo = pipeline['model']

In [16]:
modelo

<catboost.core.CatBoostClassifier at 0x7f6a68bcf5b0>

In [17]:
df_test = pd.read_csv('/home/matheus/Documentos/house-recommendation/data/test.csv', delimiter = ';')

In [18]:
df_test.drop(['Id', 'y'], axis = 1, inplace = True)

In [19]:
df_test.head(1)

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco
0,240,3,4,2,Planalto,370000


In [20]:
df_test.dtypes

area          int64
quartos       int64
garagem       int64
banheiros     int64
bairro       object
preco         int64
dtype: object

In [21]:
numeric = df_test.select_dtypes(include='int64').columns.tolist()
cat = df_test.select_dtypes(exclude='int64').columns.tolist()

In [22]:
df_test.head()

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco
0,240,3,4,2,Planalto,370000
1,68,2,1,2,Dona Zulmira,160000
2,56,2,1,1,Santa Monica,165000
3,44,2,1,1,Shopping Park,120000
4,102,3,2,2,Santa Monica,455000


In [23]:
df_test[numeric] = scaler.transform(df_test[numeric])

In [24]:
df_test[cat] = catb.transform(df_test[cat])

In [25]:
df_test

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco
0,0.458221,0.299322,1.700116,-0.105917,0.514236,-0.070601
1,-0.531542,-0.837344,-0.768790,-0.105917,0.056944,-0.696467
2,-0.600595,-0.837344,-0.768790,-1.047397,0.373792,-0.681565
3,-0.669648,-0.837344,-0.768790,-1.047397,0.005511,-0.815679
4,-0.335891,0.299322,0.054179,-0.105917,0.373792,0.182726
...,...,...,...,...,...,...
588,-0.600595,-0.837344,-0.768790,-0.105917,0.198611,-0.578744
589,1.148752,0.299322,-0.768790,0.835564,0.361806,-0.160010
590,-0.635121,-0.837344,-0.768790,-1.047397,0.166987,-0.770975
591,-0.531542,-0.837344,0.054179,-0.105917,0.030811,-0.398435


In [26]:
y_pred_test = modelo.predict(df_test)

In [27]:
df_test = pd.read_csv('/home/matheus/Documentos/house-recommendation/data/test.csv', delimiter = ';')

In [28]:
y_true = df_test['y']
roc_auc_score(y_true, y_pred_test)

0.8981947697111632

In [29]:
df_test['preds'] = y_pred_test

In [30]:
df_test

Unnamed: 0,Id,area,quartos,garagem,banheiros,bairro,preco,y,preds
0,1496,240,3,4,2,Planalto,370000,0,0
1,267,68,2,1,2,Dona Zulmira,160000,0,0
2,1672,56,2,1,1,Santa Monica,165000,1,1
3,546,44,2,1,1,Shopping Park,120000,0,0
4,1539,102,3,2,2,Santa Monica,455000,0,0
...,...,...,...,...,...,...,...,...,...
588,1693,56,2,1,2,Tibery,199500,0,1
589,25,360,3,1,3,Daniel Fonseca,340000,0,0
590,521,50,2,1,1,Jardim Brasilia,135000,0,0
591,1417,68,2,2,2,Brasil,260000,0,0


In [80]:
entrada_dados = {
    'Id' : 132,
    'area' : 56,
    'quartos' : 2,
    'garagem' : 1,
    'banheiros' : 1,
    'bairro' : 'PLanalto',
    'preco' : 165000
}

In [81]:
features_names = df_test.drop(['Id', 'y'], axis = 1).columns.tolist()

In [82]:
df_2 = pd.DataFrame(index=[0], columns=features_names)
df_2 = df_2.fillna(value=0)

for i in entrada_dados.items():
  df_2[i[0]] = i[1]

In [83]:
df_2[cat] = catb.transform(df_2[cat])
df_2[numeric] = scaler.transform(df_2[numeric])

In [84]:
df_2

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco,preds,Id
0,-0.600595,-0.837344,-0.76879,-1.047397,0.170833,-0.681565,0,132


In [85]:
y_pred_test = modelo.predict(df_2.drop('Id', axis = 1))

In [86]:
y_pred_test

array([1])