![house Logo](https://www.imovelweb.com.br/noticias/wp-content/uploads/2013/08/venda2.jpg)

# House Sales in King County, USA


Este conjunto de dados contém preços de imóveis para o Condado de King, que inclui Seattle. Inclui casas vendidas entre maio de 2014 e maio de 2015.

https://www.kaggle.com/harlfoxem/housesalesprediction



In [None]:
import matplotlib.pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
import seaborn as sns
import warnings
import mlflow
import mlflow.sklearn


# Silence every warning so library notices do not clutter the cell outputs.
warnings.filterwarnings('ignore')
# Initialise plotly's offline notebook mode so py.iplot can render in the notebook.
py.init_notebook_mode(connected=True)
# Use seaborn's "white" style for subsequent plots.
sns.set(style="white")
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline 

## Dados

In [None]:
import pandas as pd

In [None]:
# Load the raw sales data: 'date' is parsed as datetime and 'id' becomes the index
df = pd.read_csv(
    'data/raw.csv',
    parse_dates=['date'],
    index_col=['id'],
)
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

In [None]:
# Weekly sales volume: bucket the rows by week of 'date' and count the prices
df_plot = (
    df.set_index('date')
      .resample('W')
      .count()['price']
      .reset_index()
)

# Single line trace: week on the x axis, number of sales on the y axis
trace1 = go.Scatter(x=df_plot['date'], y=df_plot['price'])
data = [trace1]
layout = go.Layout(
    title='Total de casas vendidas por semana',
    xaxis={'title': 'Data'},
    yaxis={'title': 'Total'},
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

## Problema


**Devemos criar um modelo para prever o preço do imóvel a partir de alguns dados. Esse modelo será utilizado em um portal de classificados de imóveis.**

![house Logo](https://i.imgur.com/Wf7316S.png)



## Preparação dos Dados

* Engenharia de Features
* Transformações 
* Limpeza dos dados

In [None]:
# Preview the first rows before any feature preparation
df.head()

In [None]:
# (rows, columns) of the dataset
df.shape

In [None]:
# Serialise two random rows as 'split'-oriented JSON
# NOTE(review): presumably an example request payload for the served model — confirm
df.sample(2).to_json(orient='split')

In [None]:
#df['bedrooms_per_sqft_living'] = df.bedrooms/df.sqft_living

In [None]:
#....

In [None]:
# Write ten random feature-only rows (no date, no target, no index) to use
# as input when testing predictions
(
    df.drop(columns=["date", "price"])
      .sample(10)
      .to_csv('data/predict_test.csv', index=False)
)

## Model Training

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import numpy as np

# Evaluation
def eval_metrics(actual, pred):
    """Return ``(rmse, mae, r2)`` for predictions ``pred`` against ``actual``.

    RMSE is the square root of ``mean_squared_error``; MAE and R2 come from
    ``mean_absolute_error`` and ``r2_score`` respectively.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

### Split Train/Test

In [None]:
# Store the dataset with the run artefacts (the 'id' index is not written)
df.to_csv('artefacts/dataset.csv', index=False)

# Features: every column except the sale date and the target
X = df.drop(columns=["date", "price"])

# Target: the sale price
y = df['price']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# First two rows of the training features
X_train.head(2)

In [None]:
# First two values of the training target (price)
y_train.head(2)

### Build Model

In [None]:
from sklearn.linear_model import LinearRegression

# Define the model.
# 'normalize' is deliberately not passed: scikit-learn removed that argument
# from LinearRegression in 1.2 (passing it raises TypeError), and it was
# ignored anyway whenever fit_intercept=False, so the fit is unchanged on
# older versions.
params = {'fit_intercept': False, 'n_jobs': -1}

model = LinearRegression(**params)

# Train on the training split
model.fit(X_train, y_train)

### Evaluation

In [None]:
# Score the fitted model on the held-out test split
pred_y = model.predict(X_test)
rmse, mae, r2 = eval_metrics(y_test, pred_y)

# Report the three metrics, one per line
print("RMSE:", rmse)
print("MAE:", mae)
print("R2:", r2)

In [None]:
# Regression joint plot of actual (x) versus predicted (y) prices on the test split
g = sns.jointplot(x=y_test, y=pred_y, kind="reg", color="m", height=7)
# Save the figure with the run artefacts; the other evaluation cells write to this same path
g.savefig('artefacts/img_res_0.png')

In [None]:
# # tracking

# # log in MLflow
# with mlflow.start_run():
    
#     # Salva a pasta de artefatos
#     mlflow.log_artifacts("artefacts/")
    
#     # Salva parametros
#     for k, v in params.items():
#         mlflow.log_param(k, v)
    
#     # Salva métricas
#     mlflow.log_metric("RMSE", rmse)
#     mlflow.log_metric("MAE",  mae)
#     mlflow.log_metric("R2",   r2)
    
#     # salva model
#     mlflow.sklearn.log_model(model, "model")

### Build Model 2

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters: 10 trees, depth capped at 10, n_jobs=-1
params = dict(n_estimators=10, max_depth=10, n_jobs=-1)

# Build the (still unfitted) random forest from those settings
model = RandomForestRegressor(**params)

In [None]:
# Fit the current model on the training split
model.fit(X_train, y_train)

### Evaluation

In [None]:
# Score the fitted model on the held-out test split
pred_y = model.predict(X_test)
rmse, mae, r2 = eval_metrics(y_test, pred_y)

# Report the three metrics, one per line
for label, value in zip(("RMSE", "MAE", "R2"), (rmse, mae, r2)):
    print("%s: %s" % (label, value))

In [None]:
# Regression joint plot of actual (x) versus predicted (y) prices on the test split
g = sns.jointplot(x=y_test, y=pred_y, kind="reg", color="m", height=7)
# Save the figure with the run artefacts; the other evaluation cells write to this same path
g.savefig('artefacts/img_res_0.png')

In [None]:
# # tracking
# with mlflow.start_run():
    
#     mlflow.log_artifacts("artefacts/")
    
#     for k, v in params.items():
#         mlflow.log_param(k, v)
        
#     mlflow.log_metric("RMSE", rmse)
#     mlflow.log_metric("MAE",  mae)
#     mlflow.log_metric("R2",   r2)
    
#     mlflow.sklearn.log_model(model, "model")

### Pipeline de ML

![image.png](https://i.imgur.com/Scx9nFd.png)

### Save Model

In [None]:
# joblib is imported as a standalone package: the vendored copy under
# sklearn.externals was removed from scikit-learn in 0.23.
import joblib

In [None]:
# Serialise the current model to disk with joblib
joblib.dump(model, './artefacts/model.pkl')

In [None]:
# Reload the model from the file written above and display it
model = joblib.load('./artefacts/model.pkl')
model

In [None]:
# Draw one random row from the test features to use as a prediction example
sample = X_test.sample(1)
sample

In [None]:
# Predict the price for the sampled row with the reloaded model
model.predict(sample)

## MLflow

![mlflow](https://databricks.com/wp-content/uploads/2018/06/mlflow.png)

https://mlflow.org/

### Teste Simples do Tracking

In [None]:
import mlflow
#mlflow.set_tracking_uri("http://34.73.179.244/")

# Smoke-test the tracking API: one run with the artefact folder, a dummy
# parameter and three zero-valued metrics
with mlflow.start_run():
    mlflow.log_artifacts("artefacts/")
    mlflow.log_param("param1", 0)
    for metric_name in ("RMSE", "MAE", "R2"):
        mlflow.log_metric(metric_name, 0)

## Keras

### Build Model - Keras

In [None]:
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input

In [None]:
# Model: one 100-unit hidden layer with ReLU, then a single linear output unit
model = Sequential()
model.add(Dense(100, input_shape=(X_train.shape[1],)))
model.add(Activation('relu'))
model.add(Dense(1, activation='linear'))

# Mean squared error loss, optimised with RMSprop
model.compile(loss='mse', optimizer='rmsprop')
model.summary()

In [None]:
# Train for 10 epochs, passing the test split as validation data;
# the returned history is kept in `hist` for plotting
# NOTE(review): batch_size=254 is unusual — possibly meant to be 256; confirm
hist = model.fit(X_train, y_train, 
          validation_data=(X_test, y_test), 
          batch_size=254, epochs=10)

In [None]:
def plot_hist(hist):
    """Plot training and validation loss per epoch and return the figure.

    Parameters
    ----------
    hist
        Object whose ``history`` mapping holds ``'loss'`` and ``'val_loss'``
        sequences (here, the value returned by ``model.fit``).

    Returns
    -------
    The matplotlib figure holding the loss curves.
    """
    fig, ax = plt.subplots()  # create figure & 1 axis

    # Draw on the axes created above rather than through pyplot's implicit
    # "current axes", so the returned figure is always the one plotted on.
    ax.plot(hist.history['loss'])
    ax.plot(hist.history['val_loss'])
    ax.set_title('model loss')
    ax.set_ylabel('loss')
    ax.set_xlabel('epoch')
    ax.legend(['train', 'test'], loc='upper left')
    return fig

# Plot the loss curves and save the figure with the run artefacts
fig = plot_hist(hist)
fig.savefig('artefacts/train_hist.png')

### Evaluation

In [None]:
# Evaluate on the test split; the model was compiled with loss='mse' and no
# extra metrics, so `score` is presumably the test MSE — confirm
score = model.evaluate(X_test, y_test)

In [None]:
# Flatten the network's predictions to 1-D, then score them on the test split
pred_y = np.ravel(model.predict(X_test))
rmse, mae, r2 = eval_metrics(y_test, pred_y)

# Report the three metrics, one per line
print("RMSE:", rmse)
print("MAE:", mae)
print("R2:", r2)

In [None]:
# Regression joint plot of actual (x) versus predicted (y) prices on the test split
g = sns.jointplot(x=y_test, y=pred_y, kind="reg", color="m", height=7)
# Save the figure with the run artefacts; the other evaluation cells write to this same path
g.savefig('artefacts/img_res_0.png')

In [None]:
import mlflow.keras

# Log the Keras run in MLflow
with mlflow.start_run():

    # Upload everything saved under artefacts/
    mlflow.log_artifacts("artefacts/")

    # Record the hidden-layer widths read from the model itself, so the logged
    # parameter cannot drift from the architecture that was actually trained.
    hidden_units = [
        layer.units
        for layer in model.layers[:-1]  # skip the output layer
        if hasattr(layer, "units")      # Activation layers carry no width
    ]
    mlflow.log_param('layers', hidden_units)

    # Test-split metrics computed in the evaluation cell
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE",  mae)
    mlflow.log_metric("R2",   r2)

    # Store the trained network under the run's "model" artifact path
    mlflow.keras.log_model(model, "model")