In [126]:
import pandas as pd
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

In [90]:
df = pd.read_csv('/home/matheus/Documentos/house-recommendation/data/train.csv', delimiter = ';')

In [91]:
df.head()

Unnamed: 0,Id,area,quartos,garagem,banheiros,bairro,preco,y
0,1718,600,4,3,4,Morada Da Colina,700000,0
1,528,61,2,1,1,Vida Nova,170000,0
2,1649,58,2,2,2,Pampulha,226000,0
3,1719,50,2,1,2,Novo Mundo,185000,1
4,42,78,2,2,3,Tabajaras,330000,0


In [92]:
cons = pd.DataFrame({'column': df.columns,
                             'missing_perc': (df.isna().sum() / df.shape[0]) * 100,
                             'dtype': df.dtypes})

numeric_features = list(cons[(cons['dtype'] == 'int64') | (cons['dtype'] == 'float') | (cons['dtype'] == 'bool')]['column'])
numeric_features.remove('y')
numeric_features.remove('Id')
categoric_features = list(cons[(cons['dtype'] == 'object')]['column'])

In [93]:
numeric_features

['area', 'quartos', 'garagem', 'banheiros', 'preco']

In [94]:
categoric_features

['bairro']

In [95]:
std_scaler = StandardScaler()

In [96]:
scaler = std_scaler
catb = ce.CatBoostEncoder(cols = categoric_features)
df[numeric_features] = scaler.fit_transform(df[numeric_features])
df[categoric_features] = catb.fit_transform(df[categoric_features], y=y)

In [97]:
numeric_features

['area', 'quartos', 'garagem', 'banheiros', 'preco']

In [118]:
model = LGBMClassifier()
X = df.drop(['Id','y'], axis = 1)
y = df['y'].values
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)
scores = cross_val_score(model, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)

In [119]:
print('Mean ROC AUC: %.3f' % scores.mean())

Mean ROC AUC: 0.951


In [120]:
steps = [('over', SMOTE()), ('model', LGBMClassifier())]
pipeline = Pipeline(steps=steps)

In [121]:
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
scores = cross_val_score(pipeline, X, y, scoring='roc_auc', cv=cv, n_jobs=-1)
print('Mean ROC AUC: %.3f' % scores.mean())

Mean ROC AUC: 0.950


In [122]:
pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('over',
                 SMOTE(k_neighbors=5, n_jobs=None, random_state=None,
                       sampling_strategy='auto')),
                ('model',
                 LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                colsample_bytree=1.0, importance_type='split',
                                learning_rate=0.1, max_depth=-1,
                                min_child_samples=20, min_child_weight=0.001,
                                min_split_gain=0.0, n_estimators=100, n_jobs=-1,
                                num_leaves=31, objective=None,
                                random_state=None, reg_alpha=0.0,
                                reg_lambda=0.0, silent=True, subsample=1.0,
                                subsample_for_bin=200000, subsample_freq=0))],
         verbose=False)

In [123]:
modelo = pipeline

In [128]:
df_test = pd.read_csv('/home/matheus/Documentos/house-recommendation/data/test.csv', delimiter = ';')

In [103]:
df_test.drop(['Id', 'y'], axis = 1, inplace = True)

In [104]:
df_test.head(1)

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco
0,240,3,4,2,Planalto,370000


In [105]:
df_test.dtypes

area          int64
quartos       int64
garagem       int64
banheiros     int64
bairro       object
preco         int64
dtype: object

In [106]:
numeric = df_test.select_dtypes(include='int64').columns.tolist()
cat = df_test.select_dtypes(exclude='int64').columns.tolist()

In [107]:
df_test.head()

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco
0,240,3,4,2,Planalto,370000
1,68,2,1,2,Dona Zulmira,160000
2,56,2,1,1,Santa Monica,165000
3,44,2,1,1,Shopping Park,120000
4,102,3,2,2,Santa Monica,455000


In [108]:
df_test[numeric] = scaler.transform(df_test[numeric])

In [110]:
df_test[cat] = catb.transform(df_test[cat])

In [111]:
df_test

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco
0,0.458221,0.299322,1.700116,-0.105917,0.514236,-0.070601
1,-0.531542,-0.837344,-0.768790,-0.105917,0.056944,-0.696467
2,-0.600595,-0.837344,-0.768790,-1.047397,0.373792,-0.681565
3,-0.669648,-0.837344,-0.768790,-1.047397,0.005511,-0.815679
4,-0.335891,0.299322,0.054179,-0.105917,0.373792,0.182726
...,...,...,...,...,...,...
588,-0.600595,-0.837344,-0.768790,-0.105917,0.198611,-0.578744
589,1.148752,0.299322,-0.768790,0.835564,0.361806,-0.160010
590,-0.635121,-0.837344,-0.768790,-1.047397,0.166987,-0.770975
591,-0.531542,-0.837344,0.054179,-0.105917,0.030811,-0.398435


In [125]:
y_pred_test = modelo.predict(df_test)

In [129]:
y_true = df_test['y']
roc_auc_score(y_true, y_pred_test)

0.8733606557377049

In [130]:
df_test

Unnamed: 0,Id,area,quartos,garagem,banheiros,bairro,preco,y
0,1496,240,3,4,2,Planalto,370000,0
1,267,68,2,1,2,Dona Zulmira,160000,0
2,1672,56,2,1,1,Santa Monica,165000,1
3,546,44,2,1,1,Shopping Park,120000,0
4,1539,102,3,2,2,Santa Monica,455000,0
...,...,...,...,...,...,...,...,...
588,1693,56,2,1,2,Tibery,199500,0
589,25,360,3,1,3,Daniel Fonseca,340000,0
590,521,50,2,1,1,Jardim Brasilia,135000,0
591,1417,68,2,2,2,Brasil,260000,0


In [147]:
entrada_dados = {
    'Id' : 132,
    'area' : 125,
    'quartos' : 3,
    'garagem' : 2,
    'banheiros' : 3,
    'bairro' : 'Aparecida',
    'preco' : 1800000
}

In [148]:
features_names = df_test.drop(['Id', 'y'], axis = 1).columns.tolist()

In [149]:
df_2 = pd.DataFrame(index=[0], columns=features_names)
df_2 = df_2.fillna(value=0)

for i in entrada_dados.items():
  df_2[i[0]] = i[1]

In [150]:
df_2[cat] = catb.transform(df_2[cat])
df_2[numeric] = scaler.transform(df_2[numeric])

In [151]:
df_2

Unnamed: 0,area,quartos,garagem,banheiros,bairro,preco,Id
0,-0.203539,0.299322,0.054179,0.835564,0.077238,4.191248,132


In [152]:
y_pred_test = modelo.predict(df_2.drop('Id', axis = 1))

In [153]:
y_pred_test

array([0])