In [1]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


file_path = '../input/train.csv'

home_data = pd.read_csv(file_path)

In [2]:
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [3]:
home_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
home_data.describe()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
count,1460.0,1460.0,1201.0,1460.0,1460.0,1460.0,1460.0,1460.0,1452.0,1460.0,...,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0,1460.0
mean,730.5,56.89726,70.049958,10516.828082,6.099315,5.575342,1971.267808,1984.865753,103.685262,443.639726,...,94.244521,46.660274,21.95411,3.409589,15.060959,2.758904,43.489041,6.321918,2007.815753,180921.19589
std,421.610009,42.300571,24.284752,9981.264932,1.382997,1.112799,30.202904,20.645407,181.066207,456.098091,...,125.338794,66.256028,61.119149,29.317331,55.757415,40.177307,496.123024,2.703626,1.328095,79442.502883
min,1.0,20.0,21.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,34900.0
25%,365.75,20.0,59.0,7553.5,5.0,5.0,1954.0,1967.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,129975.0
50%,730.5,50.0,69.0,9478.5,6.0,5.0,1973.0,1994.0,0.0,383.5,...,0.0,25.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,163000.0
75%,1095.25,70.0,80.0,11601.5,7.0,6.0,2000.0,2004.0,166.0,712.25,...,168.0,68.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,214000.0
max,1460.0,190.0,313.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,...,857.0,547.0,552.0,508.0,480.0,738.0,15500.0,12.0,2010.0,755000.0


In [5]:
def colToInt(data, orCol, newCol):
    ne_vals = list(set(data[orCol]))
    ne_col = []
    for n in data[orCol]:
        ne_col.append(ne_vals.index(n))    
    data[newCol] = ne_col

In [6]:
y = home_data.SalePrice

In [7]:
nei_salet_home_data = home_data.copy()
colToInt(nei_salet_home_data, 'Neighborhood', 'Neighborhood_val')
colToInt(nei_salet_home_data, 'SaleType', 'SaleType_val')

home_features = ['LotArea', 
                 'OverallCond', 
                 'OverallQual', 
                 'MSSubClass', 
                 'YearBuilt', 
                 'FullBath', 
                 'TotRmsAbvGrd', 
                 'YearRemodAdd', 
                 'GarageArea', 
                 'YrSold', 
                 'Neighborhood_val',
                 'SaleType_val']

X = nei_salet_home_data[home_features]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
rf_model = RandomForestRegressor(random_state=1, n_estimators=100, max_depth=15, max_leaf_nodes=200)
rf_model.fit(train_X, train_y)

rf_val_predictions = rf_model.predict(val_X)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {:,}".format(rf_val_mae))
print("Avg value: {:,}".format(val_y.mean()))

Validation MAE for Random Forest Model: 19,595.88669203081
Avg value: 176,725.51232876713


In [8]:
# Prediction with only integer cols
home_predictors = home_data.drop(['SalePrice'], axis=1) 

home_numeric_predictors = home_predictors.select_dtypes(exclude=['object'])
used_features = home_numeric_predictors.drop('Id', axis=1).keys()

X = home_data[list(used_features)]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

cols_with_missing = [col for col in train_X.columns 
                                 if train_X[col].isnull().any()]

reduced_X_train = train_X.drop(cols_with_missing, axis=1)
reduced_X_test  = val_X.drop(cols_with_missing, axis=1)

rf_model = RandomForestRegressor(random_state=1, n_estimators=100)
rf_model.fit(reduced_X_train, train_y)

rf_val_predictions = rf_model.predict(reduced_X_test)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {:,}".format(rf_val_mae))
print("Avg value: {:,}".format(val_y.mean()))

Validation MAE for Random Forest Model: 16,926.217502283107
Avg value: 176,725.51232876713


In [9]:
# Prediction with colToInt for all possible cols

home_object_predictors = home_predictors.select_dtypes(include=['object'])
obj_cols_with_missing = [col for col in train_X.columns 
                                 if train_X[col].notna().all()]

for o in obj_cols_with_missing:
    colToInt(home_data, o, o + "_val")

home_predictors = home_data.drop(['SalePrice'], axis=1)

home_numeric_predictors = home_predictors.select_dtypes(exclude=['object'])
used_features = home_numeric_predictors.drop('Id', axis=1).keys()

X = home_data[list(used_features)]

train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

cols_with_missing = [col for col in train_X.columns 
                                 if train_X[col].isnull().any()]

reduced_X_train = train_X.drop(cols_with_missing, axis=1)
reduced_X_test  = val_X.drop(cols_with_missing, axis=1)

#print(reduced_X_test.columns)
rf_model = RandomForestRegressor(random_state=1, n_estimators=100)
rf_model.fit(reduced_X_train, train_y)

rf_val_predictions = rf_model.predict(reduced_X_test)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {:,}".format(rf_val_mae))
print("Avg value: {:,}".format(val_y.mean()))

Validation MAE for Random Forest Model: 16,891.653812785386
Avg value: 176,725.51232876713


In [10]:
home_data.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities',
       ...
       'GarageArea_val', 'WoodDeckSF_val', 'OpenPorchSF_val',
       'EnclosedPorch_val', '3SsnPorch_val', 'ScreenPorch_val', 'PoolArea_val',
       'MiscVal_val', 'MoSold_val', 'YrSold_val'],
      dtype='object', length=114)

In [11]:
dyn_home_data = home_data.copy()
home_object_predictors = home_predictors.select_dtypes(include=['object'])
obj_cols_with_missing = [col for col in train_X.columns 
                                 if train_X[col].notna().all()]

for o in obj_cols_with_missing:
    colToInt(dyn_home_data, o, o + "_val")

home_predictors = dyn_home_data.drop(['SalePrice'], axis=1)

home_numeric_predictors = home_predictors.select_dtypes(exclude=['object'])
used_features = home_numeric_predictors.drop('Id', axis=1).keys()

In [12]:
from sklearn.impute import SimpleImputer

#adding  class col

dyn_home_data = home_data.copy()
home_object_predictors = home_predictors.select_dtypes(include=['object'])
obj_cols_with_missing = [col for col in train_X.columns 
                                 if train_X[col].notna().all()]

for o in obj_cols_with_missing:
    colToInt(dyn_home_data, o, o + "_val")

home_predictors = dyn_home_data.drop(['SalePrice'], axis=1)

home_numeric_predictors = home_predictors.select_dtypes(exclude=['object'])
used_features = home_numeric_predictors.drop('Id', axis=1).keys()

def lookForFeatures(featureList, data):
    X = data[featureList]

    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

    my_imputer = SimpleImputer()
    imputed_X_train = my_imputer.fit_transform(train_X)
    imputed_X_test = my_imputer.transform(val_X)    

    #print(reduced_X_test.columns)
    rf_model = RandomForestRegressor(random_state=1, n_estimators=100)
    rf_model.fit(imputed_X_train, train_y)

    rf_val_predictions = rf_model.predict(imputed_X_test)

    # Calculate the mean absolute error of your Random Forest model on the validation data
    rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)
    return featureList, rf_val_mae, val_y.mean() 

import random
possibile_features = list(used_features)

def genSample(flist): 
    return random.sample(flist, random.randint(1, len(flist) - 1))

def initialGeneration(size, data, possibile_features,  seed_features=None):
    outs=[]
    
    if (seed_features):        
        flist, val_mae, mean_y = lookForFeatures(seed_features, data)     
        outs.append([flist, val_mae, mean_y])            
        
    for i in range(size):
        sample = genSample(possibile_features)
        flist, val_mae, mean_y = lookForFeatures(sample, data)     
        outs.append([flist, val_mae, mean_y])
    outs.sort(key=lambda x: x[1])
    return outs

def printList(outs):
    for o in outs:
        print(o[1])
        print(o[0])
        print(" --- ")
        
def mergeFeatures (flist1, flist2):
    c_list = flist1 + flist2
    return list(set(c_list))
    
def newGeneration(prev_gen, top):
    fittests = prev_gen[:top]
    
    n_gen = prev_gen[:1]
    for f in fittests:
        for ff in fittests:
            merged = mergeFeatures(f[0], ff[0])
            if (merged != f[0] and merged != ff[0]):
                flist, val_mae, mean_y = lookForFeatures(merged, dyn_home_data)     
                n_gen.append([flist, val_mae, mean_y])        
            
            flist_p, val_mae_p, mean_y_p = lookForFeatures(genSample(merged), dyn_home_data)     
            n_gen.append([flist_p, val_mae_p, mean_y_p])        
            
    n_gen.sort(key=lambda x: x[1])
    return n_gen



In [50]:
file_path = '../input/train.csv'
home_data = pd.read_csv(file_path)

test_path = '../input/test.csv'
test_data = pd.read_csv(test_path)

dyn_home_data = home_data.copy()
dyn_home_data = dyn_home_data.drop(['SalePrice'], axis=1)

dyn_test_data = test_data.copy()

home_object_predictors = home_predictors.select_dtypes(include=['object'])
obj_cols_with_missing = [col for col in dyn_home_data.columns 
                                 if dyn_home_data[col].notna().all() and dyn_test_data[col].notna().all()]

for o in obj_cols_with_missing:
    colToInt(dyn_home_data, o, o + "_val")
    
for o in obj_cols_with_missing:
    colToInt(dyn_test_data, o, o + "_val")
        

numeric_predictors = dyn_home_data.select_dtypes(exclude=['object'])
used_features = numeric_predictors.drop('Id', axis=1).keys()

current_generation = initialGeneration(3, dyn_home_data, list(used_features), seed_features=['GarageYrBlt', 'BldgType_val', 'BsmtUnfSF', '3SsnPorch_val', '2ndFlrSF', 'Neighborhood_val', 'GrLivArea_val', 'LandSlope_val', 'LotConfig_val', 'LotShape_val', 'SaleCondition_val', 'Heating_val', 'GarageCars', 'BedroomAbvGr_val', 'PavedDrive_val', '2ndFlrSF_val', 'Id_val', 'OverallCond', 'OverallQual_val', 'MSSubClass', 'MSSubClass_val', 'YearBuilt_val', 'EnclosedPorch', 'OpenPorchSF', 'LotArea', 'MiscVal', 'LandContour_val', 'BsmtFinSF2', 'TotalBsmtSF', 'BsmtFullBath', 'FullBath', 'YearRemodAdd_val', 'ScreenPorch_val', 'ScreenPorch', 'GarageArea', 'EnclosedPorch_val', 'YrSold_val'])

#print("Running {:,} generations".format(len(range(20))))

#for i in range(20): 
#    print('new generation')
#    print(i)
#    print("best value: {:,}".format(current_generation[0][1]))
#    current_generation = newGeneration(current_generation, 3)

#winners = current_generation[:1]

#fts = winners[0][0]
    
#printList(winners)



# best value: 16,748.41892237443
# 16748.41892237443
# ['BsmtFullBath', 'LotArea', 'GarageYrBlt', 'OverallCond_val', 'FullBath', 'BsmtFinSF2', 'MiscVal', 'GarageCars', 'EnclosedPorch_val', 'Neighborhood_val', '2ndFlrSF', 'OverallQual_val', 'ScreenPorch', 'GrLivArea_val', 'MSSubClass', 'TotalBsmtSF', 'Fireplaces', 'BedroomAbvGr_val', 'ScreenPorch_val', 'YrSold_val', 'OpenPorchSF', 'BsmtUnfSF', 'OverallQual', 'YearRemodAdd_val', 'EnclosedPorch', 'LandContour_val', 'LandSlope_val']

In [45]:
train_X = dyn_home_data[fts]
test_X = dyn_test_data[fts]

my_imputer = SimpleImputer()
imputed_X_train = my_imputer.fit_transform(train_X)
imputed_X_test = my_imputer.fit_transform(test_X)


full_model = RandomForestRegressor(random_state=1, n_estimators=100)
full_model.fit(imputed_X_train, y)

test_preds = full_model.predict(imputed_X_test)

output = pd.DataFrame({'Id': test_data.Id,
                       'SalePrice': test_preds})

output.to_csv('submission.csv', index=False)


Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive