# Pre Processing

In [1]:
import pandas as pd

In [3]:
# Load the raw housing dataset from the repository's data folder (path is relative to this notebook).
df = pd.read_csv('../data/raw/casas.csv')

In [4]:
# Preview the first rows to check the columns: tamanho, ano, garagem and the target preco.
df.head()

Unnamed: 0,tamanho,ano,garagem,preco
0,159.0,2003,2,208500
1,117.0,1976,2,181500
2,166.0,2001,2,223500
3,160.0,1915,3,140000
4,204.0,2000,3,250000


In [5]:
# Split the frame into the target (preco) and the feature matrix (everything else).
# The target is copied so later changes to it cannot touch the original frame.
y = df['preco'].copy()
X = df.drop(columns='preco')

In [6]:
# Confirm the target column is no longer part of the feature matrix.
X.head()

Unnamed: 0,tamanho,ano,garagem
0,159.0,2003,2
1,117.0,1976,2
2,166.0,2001,2
3,160.0,1915,3
4,204.0,2000,3


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# (rows, columns) of the training features.
X_train.shape

(1022, 3)

In [9]:
# (rows, columns) of the held-out test features.
X_test.shape

(438, 3)

# Set up mlflow experiment

In [None]:
!pip3 install mlflow -quiet

In [15]:
import mlflow

In [16]:
# Make 'house-prices-eda' the active experiment; MLflow creates it when it does not exist yet.
mlflow.set_experiment('house-prices-eda')

INFO: 'house-prices-eda' does not exist. Creating a new experiment


In [17]:
# Open a run manually (start/stop style). It stays active until mlflow.end_run() is called,
# so the logging calls in the following cells attach to this run.
mlflow.start_run()

<ActiveRun: >

# Experiments
## Linear Regression (Start-Stop)

In [18]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least squares with default settings, fitted on the training split.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [19]:
# Store the fitted model as an artifact of the active run under the artifact path 'lr'.
mlflow.sklearn.log_model(lr,'lr')

In [20]:
# Predict prices for the held-out test features.
lr_predicted = lr.predict(X_test)

In [21]:
# Sanity check: one prediction per test row.
len(lr_predicted)

438

In [22]:
# Inspect the feature values of the first test row.
X_test.iloc[0]

tamanho      99.0
ano        1963.0
garagem       1.0
Name: 892, dtype: float64

In [23]:
# Display the held-out target values (pandas truncates the long Series in the output).
y_test

892     154500
1105    325000
413     115000
522     159000
1036    315500
         ...  
331     139000
323     126175
650     205950
439     110000
798     485000
Name: preco, Length: 438, dtype: int64

In [24]:
from sklearn.metrics import mean_squared_error, r2_score

In [25]:
import math

# Evaluate the linear model on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=lr_predicted)
r2 = r2_score(y_true=y_test, y_pred=lr_predicted)
rmse = math.sqrt(mse)  # same units as the target, easier to interpret than MSE

# Record each metric on the active MLflow run, in the order mse, rmse, r2.
for metric_name, metric_value in (('mse', mse), ('rmse', rmse), ('r2', r2)):
    mlflow.log_metric(metric_name, metric_value)

In [26]:
# Show the mean squared error currently bound to `mse`.
mse

2078666917.9289901

In [27]:
# Show the root mean squared error currently bound to `rmse`.
rmse

45592.39978251847

In [28]:
# Show the R^2 score currently bound to `r2`.
r2

0.7021153642898049

In [29]:
# Close the manually started run so later logging does not attach to it.
mlflow.end_run()

## XGBOOST (Using With)

In [30]:
!pip3 install xgboost --quiet

In [32]:
from xgboost import XGBRFRegressor, XGBRegressor

In [45]:
# Defensive cleanup: end any run that is still active before a new one is opened below.
mlflow.end_run()

In [46]:
# Hyperparameters for the gradient-boosted model; they are logged to MLflow below
# so the run can be reproduced.
xgb_params = {
    'learning_rate': 0.2,
    'n_estimators': 50,
    'random_state': 42,
}

# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() is needed inside it.
with mlflow.start_run():
    mlflow.log_params(xgb_params)  # log params

    # Train on the training split and store the fitted model as a run artifact.
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_train, y_train)
    mlflow.xgboost.log_model(xgb, 'xgboost')  # log model

    # Evaluate on the held-out test split.
    xgb_predicted = xgb.predict(X_test)
    mse = mean_squared_error(y_test, xgb_predicted)
    rmse = math.sqrt(mse)
    r2 = r2_score(y_test, xgb_predicted)

    # log metrics
    mlflow.log_metric('mse', mse)
    mlflow.log_metric('rmse', rmse)
    mlflow.log_metric('r2', r2)

# Retrieving Results using the command line

In [43]:
# Look up the experiment's metadata (ID, artifact location, lifecycle stage) by its name.
mlflow.get_experiment_by_name('house-prices-eda')

<Experiment: artifact_location='file:///Users/msasso/Documents/personal_reps/mlops_complete_moc_project/notebooks/mlruns/1', experiment_id='1', lifecycle_stage='active', name='house-prices-eda', tags={}>

In [44]:
# Resolve the experiment ID from its name instead of hard-coding '1':
# the numeric ID depends on the local tracking store and differs between machines.
experiment = mlflow.get_experiment_by_name('house-prices-eda')

# List the run metadata (ID, status, start/end time, artifact URI) for that experiment.
# NOTE(review): list_run_infos belongs to the MLflow 1.x API; newer releases expose
# mlflow.search_runs instead — confirm against the installed version.
mlflow.list_run_infos(experiment.experiment_id)

[<RunInfo: artifact_uri='file:///Users/msasso/Documents/personal_reps/mlops_complete_moc_project/notebooks/mlruns/1/8f0eaddf2da3447da24f566dd64e860a/artifacts', end_time=None, experiment_id='1', lifecycle_stage='active', run_id='8f0eaddf2da3447da24f566dd64e860a', run_uuid='8f0eaddf2da3447da24f566dd64e860a', start_time=1633462215995, status='RUNNING', user_id='msasso'>,
 <RunInfo: artifact_uri='file:///Users/msasso/Documents/personal_reps/mlops_complete_moc_project/notebooks/mlruns/1/2502489329494a56990b660eee62ea8a/artifacts', end_time=1633462214008, experiment_id='1', lifecycle_stage='active', run_id='2502489329494a56990b660eee62ea8a', run_uuid='2502489329494a56990b660eee62ea8a', start_time=1633462202739, status='FINISHED', user_id='msasso'>,
 <RunInfo: artifact_uri='file:///Users/msasso/Documents/personal_reps/mlops_complete_moc_project/notebooks/mlruns/1/be7b6beeba534cf381c3e1e743768c42/artifacts', end_time=1633462200368, experiment_id='1', lifecycle_stage='active', run_id='be7b6bee

In [36]:
# Fetch the metrics, params and tags of a single run by its ID.
# NOTE(review): this run ID is hard-coded and only exists in the author's local mlruns store;
# replace it with an ID taken from the run listing when re-running the notebook.
mlflow.get_run('a5375040ebce4772aec625dff723cc87')

<Run: data=<RunData: metrics={'mse': 2078666917.9289901, 'r2': 0.7021153642898049, 'rmse': 45592.39978251847}, params={}, tags={'mlflow.log-model.history': '[{"run_id": "a5375040ebce4772aec625dff723cc87", '
                             '"artifact_path": "lr", "utc_time_created": '
                             '"2021-10-05 19:20:03.890566", "flavors": '
                             '{"python_function": {"model_path": "model.pkl", '
                             '"loader_module": "mlflow.sklearn", '
                             '"python_version": "3.8.8", "env": "conda.yaml"}, '
                             '"sklearn": {"pickled_model": "model.pkl", '
                             '"sklearn_version": "0.24.1", '
                             '"serialization_format": "cloudpickle"}}}]',
 'mlflow.source.name': '/Users/msasso/anaconda3/lib/python3.8/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.user': 'msasso'}>, info=<RunInfo: artifact_uri='file:///Users/msass