In [47]:
import h2o
import random
import mlflow
import mlflow.h2o
from h2o.estimators.random_forest import H2ORandomForestEstimator
from sklearn.model_selection import train_test_split

In [4]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,15 hours 6 mins
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,"7 days, 2 hours and 48 minutes"
H2O cluster name:,H2O_from_python_Laptop2_u8hpbm
H2O cluster total nodes:,1
H2O cluster free memory:,1.561 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [25]:
import numpy as np
import pandas as pd

df_raw = pd.read_csv('train.csv')
df_raw

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [43]:
def data_process(df_raw,remove_outlier = False,remove_hard_to_fit = False,linear_model = False):

	# Make a copy so the original dataframe will not be altered.
    df_processed = df_raw.copy()

	# Feature Transformation - take the logarithm of the features to meet normality assumptions.
    df_processed.SalePrice = np.log(df_processed.SalePrice)
    df_processed.GrLivArea = np.log(df_processed.GrLivArea)
    
	# Remove outliers.
    outlier_list = [524, 1299]
    df_processed = df_processed.drop(outlier_list)

    ## Missing values
    
    # 259 LotFrontage  - replace missing value with 0 
    df_processed.LotFrontage = df_processed.LotFrontage.fillna(0)

    # 1369 Alley - replace with None
    df_processed.Alley = df_processed.Alley.fillna('None')

    # 8 MasVnrType and MasVnrArea - replace MasVnrType with None and MasVnrArea with 0
    df_processed.MasVnrType = df_processed.MasVnrType.fillna('None')
    df_processed.MasVnrArea = df_processed.MasVnrArea.fillna(0)

    # 37 basement: BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2- replace with Nb
    df_processed.BsmtQual = df_processed.BsmtQual.fillna('Nb')
    df_processed.BsmtCond = df_processed.BsmtCond.fillna('Nb')
    df_processed.BsmtExposure = df_processed.BsmtExposure.fillna('Nb')
    df_processed.BsmtFinType1 = df_processed.BsmtFinType1.fillna('Nb')
    df_processed.BsmtFinType2 = df_processed.BsmtFinType2.fillna('Nb')

    # 690 FireplaceQu - replace with Nf
    df_processed.FireplaceQu = df_processed.FireplaceQu.fillna('Nf')

    # 81 Garage: GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond - replace with Ng and year with 0 
    df_processed.GarageType = df_processed.GarageType.fillna('Ng')
    df_processed.GarageFinish = df_processed.GarageFinish.fillna('Ng')
    df_processed.GarageQual = df_processed.GarageQual.fillna('Ng')
    df_processed.GarageCond = df_processed.GarageCond.fillna('Ng')
    df_processed.GarageYrBlt = df_processed.GarageYrBlt.fillna(0)

    # 1453 PoolQC - replace with Np
    df_processed.PoolQC = df_processed.PoolQC.fillna('Np')

    # 1179 Fence - replace with Nf
    df_processed.Fence = df_processed.Fence.fillna('Nf')

    # 1406 MiscFeature - replace with None    
    df_processed.MiscFeature = df_processed.MiscFeature.fillna('None')

    # 1 Electrical
    df_processed = df_processed[pd.notnull(df_processed.Electrical)]

    ## Combine columns and drop multicollinear columns 
    
    # combine bathroom quanlitity 
    df_processed['BsmtBath'] = df_processed.BsmtFullBath + df_processed.BsmtHalfBath * 0.5
    df_processed['Bath'] = df_processed.FullBath + df_processed.HalfBath * 0.5
    df_processed = df_processed.drop(['BsmtFullBath', 'BsmtHalfBath','FullBath','HalfBath'], axis=1)

    # drop TotalBsmtSF - multicollinearaty
    df_processed = df_processed.drop(['TotalBsmtSF'], axis=1)

    # drop GrLivArea - multicollinearaty
    df_processed = df_processed.drop(['GrLivArea'], axis=1)

    # drop GarageArea - higher correlation than GarageACars, results are better as well
    df_processed = df_processed.drop(['GarageArea'], axis=1) 
    
    
    # drop Id
    df_processed = df_processed.drop(['Id'], axis=1)

	# Categorical Features Processsing

	# MSSubClass processing - MSSubClass 20-90 contains only duplicate information with HouseStyle and YearBuilt.
    df_processed['MSSubClass'] = df_processed['MSSubClass'].replace(['20','30','40','45','50','60','70','75','80','85'], '0')

    # Convert numerical to categorical. 
    df_processed[['MSSubClass','OverallQual','OverallCond']] = df_processed[['MSSubClass','OverallQual','OverallCond']].astype(str)



    return df_processed

In [45]:
data_processed=data_process(df_raw)
data_processed.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,BsmtBath,Bath
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,Nf,,0,2,2008,WD,Normal,12.247694,1.0,2.5
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,Nf,,0,5,2007,WD,Normal,12.109011,0.5,2.0
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,Nf,,0,9,2008,WD,Normal,12.317167,1.0,2.5
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,Nf,,0,2,2006,WD,Abnorml,11.849398,1.0,1.0
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,Nf,,0,12,2008,WD,Normal,12.429216,1.0,2.5


In [73]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
def model_random_forecast_sklearn(data_processed):
    
    
    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data_processed)

    # The predicted column is "SalePrice" .
    train_x = train.drop(["SalePrice"], axis=1)
    test_x = test.drop(["SalePrice"], axis=1)
    train_y = train[["SalePrice"]]
    test_y = test[["SalePrice"]]
 
 #The number of jobs to run in parallel for both fit and predict.
    rfr = RandomForestRegressor(n_jobs=1, random_state=0)
    param_grid = {'n_estimators': [1000], 
                  'max_features': [10,15,20,25], 
                  'max_depth':[20,20,25,25,]}
    
    model = GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=1, cv=10)
    
    model.fit(train_x, train_y)
    print('Random forecast regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_ 




In [75]:
model_random_forecast_sklearn(data_processed)



ValueError: could not convert string to float: 'FV'

In [70]:
# sklearn.metrics.SCORERS.keys())
from sklearn import metrics
sklearn.metrics.SCORERS


NameError: name 'sklearn' is not defined

In [None]:
def train(N_alpha, N_rho):
    import os
    import warnings
    import sys

    import pandas as pd
    import numpy as np
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import ElasticNet

    import mlflow
    import mlflow.sklearn
    
    import logging
    logging.basicConfig(level=logging.WARN)
    logger = logging.getLogger(__name__)
    
    from data_processing import data_process

    def eval_metrics(actual, pred):
        rmse = np.sqrt(mean_squared_error(actual, pred))
        mae = mean_absolute_error(actual, pred)
        r2 = r2_score(actual, pred)
        return rmse, mae, r2


    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Read the file
    try:
        df_raw = pd.read_csv('train.csv',index_col=0)
    except Exception as e:
        logger.exception(
            "Unable to download training & test CSV, check your internet connection. Error: %s", e)
        
    # Data processing.
    df_processed = data_process(df_raw)
    
    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(df_processed)

    # The predicted column is "SalePrice" .
    train_x = train.drop(["SalePrice"], axis=1)
    test_x = test.drop(["SalePrice"], axis=1)
    train_y = train[["SalePrice"]]
    test_y = test[["SalePrice"]]

    # Set default values if no N_alpha is provided
    if int(N_alpha) is None:
        N_alpha = 50
    else:
        N_alpha = int(N_alpha)

    # Set default values if no N_rho is provided
    if int(N_rho) is None:
        N_rho = 11
    else:
        N_rho = int(N_rho)
    
    alphaRange = np.logspace(-3, -2, N_alpha)
    rhoRange   = np.linspace(0,1, N_rho) # we avoid very small rho by starting at 0.1
    scores     = np.zeros((N_rho, N_alpha))
      
    
    # Execute ElasticNet
    for alphaIdx, alpha in enumerate(alphaRange):
        for rhoIdx, rho in enumerate(rhoRange):
            with mlflow.start_run():
                lr = ElasticNet(alpha=alpha, l1_ratio=rho, normalize=False)
                lr.fit(train_x, train_y)
                scores[rhoIdx, alphaIdx] = lr.score(train_x, train_y)
        
        # Training Model Performances Evaluate Metrics
#         predicted_qualities = lr.predict(test_x)
#         (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)        

        # Evaluate Metrics
                predicted_qualities = lr.predict(test_x)
                (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        # Print out metrics
                print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, rho))
                print("  RMSE: %s" % rmse)
                #print("  MAE: %s" % mae)
                print("  R2: %s" % r2)

        # Log parameter, metrics, and model to MLflow
                mlflow.log_param("alpha", alpha)
                mlflow.log_param("l1_ratio", rho)
                mlflow.log_metric("rmse", rmse)
                mlflow.log_metric("r2", r2)
                #mlflow.log_metric("mae", mae)

                mlflow.sklearn.log_model(lr, "model")


# In[2]:


train(50,11)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,BsmtBath,Bath
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,Nf,,0,2,2008,WD,Normal,12.247694,1.0,2.5
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,Nf,,0,5,2007,WD,Normal,12.109011,0.5,2.0
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,Nf,,0,9,2008,WD,Normal,12.317167,1.0,2.5
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,Nf,,0,2,2006,WD,Abnorml,11.849398,1.0,1.0
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,Nf,,0,12,2008,WD,Normal,12.429216,1.0,2.5


In [36]:
def train_randomeForrest_sklearn (processed_data):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-36-f3d419a783bf>, line 6)

In [None]:

def model_random_forecast(data_processed):
    
    X_train = Xtrain
    y_train = ytrain
    rfr = RandomForestRegressor(n_jobs=1, random_state=0)
    param_grid = {'n_estimators': [1000]}
    # 'n_estimators': [1000], 'max_features': [10,15,20,25], 'max_depth':[20,20,25,25,]}
    model = GridSearchCV(estimator=rfr, param_grid=param_grid, n_jobs=1, cv=10, scoring=RMSE)
    model.fit(X_train, y_train)
    print('Random forecast regression...')
    print('Best Params:')
    print(model.best_params_)
    print('Best CV Score:')
    print(-model.best_score_)

    y_pred = model.predict(Xtest)
    return y_pred, -model.best_score_ 