# *House Prices First Try Copetitions*

## First Submission:

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

data = pd.read_csv('../house/train.csv')
data.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = data.SalePrice
X = data.drop(['SalePrice'], axis=1).select_dtypes(exclude=['object'])
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.25)

my_imputer = Imputer()
train_X = my_imputer.fit_transform(train_X)
test_X = my_imputer.transform(test_X)

my_model = XGBRegressor(n_estimators=400, learning_rate=0.05, random_state=101)
my_model.fit(train_X, train_y, early_stopping_rounds=10, 
             eval_set=[(test_X, test_y)], verbose=False)

# make predictions
predictions = my_model.predict(test_X)

print("Mean Absolute Error : " + str(mean_absolute_error(predictions, test_y)))

final = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=400,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=101,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

X_scaled = my_imputer.transform(X)

final.fit(X_scaled, y)

# make predictions
predictions = my_model.predict(X_scaled)
print("Mean Absolute Error : " + str(mean_absolute_error(predictions, y)))

test = pd.read_csv('../house/test.csv')

X_test = test.select_dtypes(exclude=['object'])
X_test = my_imputer.transform(X_test)

# make predictions
predictions = my_model.predict(X_test)

my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
# you could use any filename. We choose submission here
my_submission.to_csv('SalePrice_1_submission.csv', index=False)

Mean Absolute Error : 14920.21747645548
Mean Absolute Error : 12056.9087302012


## Second Submission:

In [98]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

data = pd.read_csv('../house/train.csv')
y = data.SalePrice

def features_seletc(X):
    return X.drop(['SalePrice', 'Id'], axis=1).select_dtypes(exclude=['object'])

X = features_seletc(data)

'''
my_model = Pipeline([
    ('input', Imputer()),
    ('scl', StandardScaler()),
    ('clf', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, 
       n_jobs=1, nthread=None, objective='reg:linear', random_state=101,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])


scores = (-1 *cross_val_score(estimator = my_model, X = X , y = y, scoring='neg_mean_absolute_error', cv=6))
print(scores)
print(scores.mean())

# make predictions
predictions = cross_val_predict(estimator = my_model, X = X , y = y, cv=6)
print(metrics.mean_absolute_error(y_pred = predictions , y_true = y))
'''

"\nmy_model = Pipeline([\n    ('input', Imputer()),\n    ('scl', StandardScaler()),\n    ('clf', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,\n       max_depth=3, min_child_weight=1, missing=None, \n       n_jobs=1, nthread=None, objective='reg:linear', random_state=101,\n       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n       silent=True, subsample=1))])\n\n\nscores = (-1 *cross_val_score(estimator = my_model, X = X , y = y, scoring='neg_mean_absolute_error', cv=6))\nprint(scores)\nprint(scores.mean())\n\n# make predictions\npredictions = cross_val_predict(estimator = my_model, X = X , y = y, cv=6)\nprint(metrics.mean_absolute_error(y_pred = predictions , y_true = y))\n"

In [122]:
my_model = Pipeline([
    ('input', Imputer()),
    ('scl', StandardScaler()),
    ('clf', XGBRegressor(base_score=0.5, colsample_bylevel=1,
       colsample_bytree=1, booster='gbtree', gamma=0, max_delta_step=0,
       min_child_weight=1, missing=None, max_depth = 3,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=101,
       reg_alpha=0, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

n_est = [575, 650, 600, 550]
#max_depth = [2, 3, 6]
learning_rate = [0.05, 0.1]
reg_lambda = [1, 0.1]
#booster = ['gbtree', 'gblinear', 'dart']
clf = GridSearchCV(estimator = my_model, param_grid = dict(clf__n_estimators= n_est, 
                                                           #clf__booster = booster,
                                                           clf__learning_rate = learning_rate,
                                                           clf__reg_lambda = reg_lambda
                                                           #clf__max_depth = max_depth
                                                          ), 
                   scoring='neg_mean_absolute_error', cv=6, verbose=1, n_jobs=3)

In [123]:
clf.fit(X, y)  

Fitting 6 folds for each of 16 candidates, totalling 96 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   20.3s
[Parallel(n_jobs=3)]: Done  96 out of  96 | elapsed:   42.2s finished


GridSearchCV(cv=6, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('input', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)), ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_de...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=3,
       param_grid={'clf__n_estimators': [575, 650, 600, 550], 'clf__learning_rate': [0.05, 0.1], 'clf__reg_lambda': [1, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=1)

In [124]:
clf.best_score_

-15913.275113709333

In [125]:
test = pd.read_csv('../house/test.csv')
X_t = test.drop(['Id'], axis=1).select_dtypes(exclude=['object'])

predictions = clf.predict(X = X_t)

my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
# you could use any filename. We choose submission here
my_submission.to_csv('SalePrice_2_submission.csv', index=False)

In [126]:
clf.best_estimator_.get_params()

{'memory': None,
 'steps': [('input',
   Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0)),
  ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('clf', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
          max_depth=3, min_child_weight=1, missing=None, n_estimators=550,
          n_jobs=1, nthread=None, objective='reg:linear', random_state=101,
          reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
          silent=True, subsample=1))],
 'input': Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0),
 'scl': StandardScaler(copy=True, with_mean=True, with_std=True),
 'clf': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=550,
        n_jobs=1, nthr

In [2]:
my_model.get_params().keys()

dict_keys(['base_score', 'booster', 'colsample_bylevel', 'colsample_bytree', 'gamma', 'learning_rate', 'max_delta_step', 'max_depth', 'min_child_weight', 'missing', 'n_estimators', 'n_jobs', 'nthread', 'objective', 'random_state', 'reg_alpha', 'reg_lambda', 'scale_pos_weight', 'seed', 'silent', 'subsample'])

## Third Submission: 
Training the best model with all data

In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

data = pd.read_csv('../house/train.csv')
y = data.SalePrice

def features_seletc(X):
    return X.drop(['SalePrice', 'Id'], axis=1).select_dtypes(exclude=['object'])

X = features_seletc(data)

my_model = Pipeline([
    ('input', Imputer()),
    ('scl', StandardScaler()),
    ('clf', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=550,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=101,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1))])

my_model.fit(X, y)
predicted_home_prices = my_model.predict(X)
print("MEA training all data:",mean_absolute_error(y, predicted_home_prices))

test = pd.read_csv('../house/test.csv')
X_t = test.drop(['Id'], axis=1).select_dtypes(exclude=['object'])

predictions = my_model.predict(X = X_t)

my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': predictions})
# you could use any filename. We choose submission here
my_submission.to_csv('SalePrice_3_submission.csv', index=False)

MEA training all data: 6049.851522367295


-------------

## Load packages and datasets

In [140]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import metrics
from sklearn.model_selection import cross_val_predict

data = pd.read_csv('../house/train.csv')
y = data.SalePrice
#X = data.drop(['SalePrice', 'Id'], axis=1).select_dtypes(exclude=['object'])
#del data

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
Id               1460 non-null int64
MSSubClass       1460 non-null int64
MSZoning         1460 non-null object
LotFrontage      1201 non-null float64
LotArea          1460 non-null int64
Street           1460 non-null object
Alley            91 non-null object
LotShape         1460 non-null object
LandContour      1460 non-null object
Utilities        1460 non-null object
LotConfig        1460 non-null object
LandSlope        1460 non-null object
Neighborhood     1460 non-null object
Condition1       1460 non-null object
Condition2       1460 non-null object
BldgType         1460 non-null object
HouseStyle       1460 non-null object
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
YearRemodAdd     1460 non-null int64
RoofStyle        1460 non-null object
RoofMatl         1460 non-null object
Exterior1st      1460 non-n

The total training observations are 1,460 and have 79 features ( 3 float64, 33 int64, 43 object ) with 19 nulls, wheres:
 - 14 has few null, so are good candidates for imputer strategies:
<ul><b>See below all Ordinal that doesn't have NA. We can infer thar nan are NA!</b>
    <li>GarageFinish___1379  object. Interior finish of the garage. All nan has GarageType equal a NA, so we can infer NA too</li>
    <li>GarageQual_____1379  object: Garage quality. </li>
    <li>GarageCond_____1379  object: Garage condition. </li>
    <li>BsmtExposure___1422  object: Refers to walkout or garden level walls.</li>
    <li>BsmtFinType2___1422  object: Rating of basement finished area (if multiple types). Only not infer NA to Id 333, becouse has BsmtFinSF2 and BsmtFinSF1, for this consedere GLQ, becouse is what we found in BsmtFinType1!</li>
    <li>BsmtQual_______1423  object: Evaluates the height of the basement Doesn't have PO and NA.</li>
    <li>BsmtCond_______1423  object: Ealuates the general condition of the basement. Doesn't have Ex and NA.</li>
    <li>BsmtFinType1___1423  object: Rating of basement finished area (if multiple types). </li>
</ul>    
<ul><b>Numeric and Categorical datas with diferents inputer strategies</b>
    <li>LotFrontage____1201  float64: is the linear feet of street connected to property. Some property realy don't have directle Lot Frontage, so we can consider 0 to nan.</li>
    <li>GarageType_____1379  object: Doesn't have NA. We can infer that nan are NA</li>
    <li>GarageYrBlt____1379  float64. All nan has GarageType equal a NA, so we can infer to 0</li>
    <li>MasVnrType_____1452  object: is the masonry veneer type, hasn't CBlock in the training data! CBlock is the nan?</li>
    <li>MasVnrArea_____1452  float64: Masonry veneer area in square feet, null when MasVnrType is null. We can infer 0</li>
    <li>Electrical_____1459  object. Only one, can apply the most commun.</li>
</ul>
<p>
 - 5 has more than 47% of nulls, maybe candidates to exclude or substitute for nulls presence flags
<ul><b>See below all Ordinal that doesn't have NA. We can infer thar nan are NA!</b>
    <li>PoolQC        7  object: Pool quality. Does´t have TA and NA explicit in the data</li>
    <li>Fence       281  object: Fence quality.</li>
    <li>FireplaceQu 770  object: Fireplace quality.</li>
</ul>
<ul><b> Categorical datas that does´t have NA explicit in the data, but has nan. Infer NA to nan records</b>
    <li>MiscFeature 54  object: Miscellaneous feature not covered in other categories. Does´t have Elev and NA explicit in the data</li>
    <li>Alley       91  object: is the type of alley access to property.</li>
</ul>
Some numeric data are ordinal or categorical already translate to codes.<p>

## Check the data quality of training data compare to the description

In [59]:
# Identifies the type of dwelling involved in the sale.
MSSubClass = {}
MSSubClass[20] = '1-STORY 1946 & NEWER ALL STYLES'
MSSubClass[30] = '1-STORY 1945 & OLDER'
MSSubClass[40] = '1-STORY W/FINISHED ATTIC ALL AGES'
MSSubClass[45] = '37257 STORY - UNFINISHED ALL AGES'
MSSubClass[50] = '37257 STORY FINISHED ALL AGES'
MSSubClass[60] = '2-STORY 1946 & NEWER'
MSSubClass[70] = '2-STORY 1945 & OLDER'
MSSubClass[75] = '37258 STORY ALL AGES'
MSSubClass[80] = 'SPLIT OR MULTI-LEVEL'
MSSubClass[85] = 'SPLIT FOYER'
MSSubClass[90] = 'DUPLEX - ALL STYLES AND AGES'
MSSubClass[120] = '1-STORY PUD (Planned Unit Development) - 1946 & NEWER'
MSSubClass[150] = '37257 STORY PUD - ALL AGES'
MSSubClass[160] = '2-STORY PUD - 1946 & NEWER'
MSSubClass[180] = 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER'
MSSubClass[190] = '2 FAMILY CONVERSION - ALL STYLES AND AGES'

a = np.sort(data.MSSubClass.unique())
print("MSSubClass has",data.MSSubClass.count(),"observations. See below the MSSubClass not found in the training data:")
for i in  MSSubClass.keys():
 if [i] not in a: print(i, ":", MSSubClass[i])
# Note: Test dataset has the 150

MSSubClass has 1460 observations. See below the MSSubClass not found in the training data:
150 : 37257 STORY PUD - ALL AGES


In [125]:
# Identifies the general zoning classification of the sale.
MSZoning = {}
MSZoning['A']='Agriculture'
MSZoning['C']='Commercial'
MSZoning['FV']='Floating Village Residential'
MSZoning['I']='Industrial'
MSZoning['RH']='Residential High Density'
MSZoning['RL']='Residential Low Density'
MSZoning['RP']='Residential Low Density Park' 
MSZoning['RM']='Residential Medium Density'
MSZoning['C (all)']='Commercial' # Exist in the training data and we don't know if is a substitute of 'C'
a = np.sort(data.MSZoning.unique())
print("MSZoning has",data.MSZoning.count(),"observations. See below the MSZoning not found in the training data:")
for i in  MSZoning.keys():
 if [i] not in a: print(i, ":", MSZoning[i])

MSZoning has 1460 observations. See below the MSZoning not found in the training data:
A : Agriculture
C : Commercial
I : Industrial
RP : Residential Low Density Park


### Attention: General shape is ordinal!

In [235]:
# General shape of property
LotShape = {}
LotShape['Reg'] = 'Regular'
LotShape['IR1'] = 'Slightly irregular'
LotShape['IR2'] = 'Moderately Irregular'
LotShape['IR3'] = 'Irregular'

a = np.sort(data.LotShape.unique())
print("LotShape has",data.LotShape.count(),"observations and all class are found in the training data!")
for i in  LotShape.keys():
 if [i] not in a: print(i, ":", LotShape[i])

LotShape has 1460 observations and all class are found in the training data!


In [76]:
# Utilities: Type of utilities available
Utilities = {}
Utilities['AllPub'] = 'All public Utilities (E,G,W,& S)'
Utilities['NoSewr'] = 'Electricity, Gas, and Water (Septic Tank)'
Utilities['NoSeWa'] = 'Electricity and Gas Only'
Utilities['ELO'] = 'Electricity only'

a = np.sort(data.Utilities.unique())
print('Utilities has',data.Utilities.count(),"observations. See below the Utilities not found in the training data:")
for i in  Utilities.keys():
   if [i] not in a: print(i, ':', Utilities[i])

Utilities has 1460 observations. See below the Utilities not found in the training data:
NoSewr : Electricity, Gas, and Water (Septic Tank)
ELO : Electricity only


In [123]:
# LotConfig: Lot configuration
LotConfig = {}
LotConfig['Inside'] = 'Inside lot'
LotConfig['Corner'] = 'Corner lot'
LotConfig['CulDSac'] = 'Cul-de-sac'
LotConfig['FR2'] = 'Frontage on 2 sides of property'
LotConfig['FR3'] = 'Frontage on 3 sides of property'

a = np.sort(data.LotConfig.unique())
print('LotConfig has',data.LotConfig.count(),'observations and all class are found in the training data!')
for i in  LotConfig.keys():
   if [i] not in a: print(i, ':', LotConfig[i])

LotConfig has 1460 observations and all class are found in the training data!


### Attention: LandSlope is ordinal!

In [79]:
# LandSlope: Slope of property
LandSlope = {}
LandSlope['Gtl'] = 'Gentle slope'
LandSlope['Mod'] = 'Moderate Slope'
LandSlope['Sev'] = 'Severe Slope'

a = np.sort(data.LandSlope.unique())
print('LandSlope has',data.LandSlope.count(),'observations and all class are found in the training data!')
for i in  LandSlope.keys():
   if [i] not in a: print(i, ':', LandSlope[i])

LandSlope has 1460 observations and all class are found in the training data!


In [91]:
# Neighborhood: Physical locations within Ames city limits
Neighborhood = {}
Neighborhood['Blmngtn'] = 'Bloomington Heights'
Neighborhood['Blueste'] = 'Bluestem'
Neighborhood['BrDale'] = 'Briardale'
Neighborhood['BrkSide'] = 'Brookside'
Neighborhood['ClearCr'] = 'Clear Creek'
Neighborhood['CollgCr'] = 'College Creek'
Neighborhood['Crawfor'] = 'Crawford'
Neighborhood['Edwards'] = 'Edwards'
Neighborhood['Gilbert'] = 'Gilbert'
Neighborhood['IDOTRR'] = 'Iowa DOT and Rail Road'
Neighborhood['MeadowV'] = 'Meadow Village'
Neighborhood['Mitchel'] = 'Mitchell'
Neighborhood['NAmes'] = 'North Ames'
Neighborhood['NoRidge'] = 'Northridge'
Neighborhood['NPkVill'] = 'Northpark Villa'
Neighborhood['NridgHt'] = 'Northridge Heights'
Neighborhood['NWAmes'] = 'Northwest Ames'
Neighborhood['OldTown'] = 'Old Town'
Neighborhood['SWISU'] = 'South & West of Iowa State University'
Neighborhood['Sawyer'] = 'Sawyer'
Neighborhood['SawyerW'] = 'Sawyer West'
Neighborhood['Somerst'] = 'Somerset'
Neighborhood['StoneBr'] = 'Stone Brook'
Neighborhood['Timber'] = 'Timberland'
Neighborhood['Veenker'] = 'Veenker'

a = np.sort(data.Neighborhood.unique())
print('Neighborhood has',data.Neighborhood.count(),'observations and all class are found in the training data!')
for i in  Neighborhood.keys():
   if [i] not in a: print(i, ':', Neighborhood[i])
#  here we found NAmes as Names in the explanation

Neighborhood has 1460 observations and all class are found in the training data!


In [92]:
# Condition1: Proximity to various conditions
Condition1 = {}
Condition1['Artery'] = 'Adjacent to arterial street'
Condition1['Feedr'] = 'Adjacent to feeder street'
Condition1['Norm'] = 'Normal'
Condition1['RRNn'] = 'Within 200 of North-South Railroad'
Condition1['RRAn'] = 'Adjacent to North-South Railroad'
Condition1['PosN'] = 'Near positive off-site feature--park, greenbelt, etc.'
Condition1['PosA'] = 'Adjacent to postive off-site feature'
Condition1['RRNe'] = 'Within 200 of East-West Railroad'
Condition1['RRAe'] = 'Adjacent to East-West Railroad'

a = np.sort(data.Condition1.unique())
print('Condition1 has',data.Condition1.count(),'observations and all class are found in the training data!')
for i in  Condition1.keys():
   if [i] not in a: print(i, ':', Condition1[i])

Condition1 has 1460 observations and all class are found in the training data!


In [115]:
a = np.sort(data.Condition2.unique())
print('Condition2 has',data.Condition2.count(),". See below the Condition2 not found in the training data:")
for i in  Condition1.keys():
   if [i] not in a: print(i, ':', Condition1[i])

Condition2 has 1460 . See below the Condition2 not found in the training data:
RRNe : Within 200 of East-West Railroad


### Some problems of speling disacord from explanation file sugest first make a upper case os all string data before code!

In [105]:
# BldgType: Type of dwelling
BldgType = {}
BldgType['1Fam'] = 'Single-family Detached'
BldgType['2fmCon'] = 'Two-family Conversion; originally built as one-family dwelling'
BldgType['Duplex'] = 'Duplex'
BldgType['TwnhsE'] = 'Townhouse End Unit'
BldgType['Twnhs'] = 'Townhouse Inside Unit'

a = np.sort(data.BldgType.unique())
print('BldgType has',data.BldgType.count(),'observations and all class are found in the training data with some corrections!')

for i in  BldgType.keys():
   if [i] not in a: print(i, ':', BldgType[i])
# Some problems of speling disacord from explanation file: Twnhs is not TwnshI, Duplex instead Duplx and 2fmCon instead 2FmCon

BldgType has 1460 observations and all class are found in the training data with some corrections!


In [114]:
# HouseStyle: Style of dwelling
HouseStyle = {}
HouseStyle['1Story'] = 'One story'
HouseStyle['1.5Fin'] = 'One and one-half story: 2nd level finished'
HouseStyle['1.5Unf'] = 'One and one-half story: 2nd level unfinished'
HouseStyle['2Story'] = 'Two story'
HouseStyle['2.5Fin'] = 'Two and one-half story: 2nd level finished'
HouseStyle['2.5Unf'] = 'Two and one-half story: 2nd level unfinished'
HouseStyle['SFoyer'] = 'Split Foyer'
HouseStyle['SLvl'] = 'Split Level'

a = np.sort(data.HouseStyle.unique())
print('HouseStyle has',data.HouseStyle.count(),'observations and all class are found in the training data!')
for i in  HouseStyle.keys():
   if [i] not in a: print(i, ':', HouseStyle[i])

HouseStyle has 1460 observations and all class are found in the training data!


### Attention: OverallQual and OverallCond are ordinal data!

In [120]:
# OverallQual: Rates the overall material and finish of the house
OverallQual = {}
OverallQual[10] = 'Very Excellent'
OverallQual[9] = 'Excellent'
OverallQual[8] = 'Very Good'
OverallQual[7] = 'Good'
OverallQual[6] = 'Above Average'
OverallQual[5] = 'Average'
OverallQual[4] = 'Below Average'
OverallQual[3] = 'Fair'
OverallQual[2] = 'Poor'
OverallQual[1] = 'Very Poor'

a = np.sort(data.OverallQual.unique())
print('OverallQual has',data.OverallQual.count(),'observations and all class are found in the training data!')
for i in  OverallQual.keys():
   if [i] not in a: print(i, ':', OverallQual[i])

# OverallCond: Rates the overall condition of the house
a = np.sort(data.OverallCond.unique())
print('OverallQual has',data.OverallCond.count(),". See below the OverallCond not found in the training data:")
for i in  OverallQual.keys():
   if [i] not in a: print(i, ':', OverallQual[i])

OverallQual has 1460 observations and all class are found in the training data!
OverallQual has 1460 . See below the OverallCond not found in the training data:
10 : Very Excellent


In [237]:
# Exterior1st: Exterior covering on house
Exterior1st = {}
Exterior1st['AsbShng'] = 'Asbestos Shingles'
Exterior1st['AsphShn'] = 'Asphalt Shingles'
Exterior1st['BrkComm'] = 'Brick Common'
Exterior1st['BrkFace'] = 'Brick Face'
Exterior1st['CBlock'] = 'Cinder Block'
Exterior1st['CemntBd'] = 'Cement Board'
Exterior1st['HdBoard'] = 'Hard Board'
Exterior1st['ImStucc'] = 'Imitation Stucco'
Exterior1st['MetalSd'] = 'Metal Siding'
Exterior1st['Other'] = 'Other'
Exterior1st['Plywood'] = 'Plywood'
Exterior1st['PreCast'] = 'PreCast'
Exterior1st['Stone'] = 'Stone'
Exterior1st['Stucco'] = 'Stucco'
Exterior1st['VinylSd'] = 'Vinyl Siding'
Exterior1st['Wd Sdng'] = 'Wood Siding'
Exterior1st['WdShing'] = 'Wood Shingles'

a = np.sort(data.Exterior1st.unique())
print('Exterior1st has',data.Exterior1st.count(),'observations. See below the Exterior1st not found in the training data:')
for i in  Exterior1st.keys():
   if [i] not in a: print(i, ':', Exterior1st[i])
print()
a = np.sort(data.Exterior2nd.unique())
print('Exterior2nd has',data.Exterior2nd.count(),'observations. See below the Exterior2nd not found in the training data:')
for i in  Exterior1st.keys():
   if [i] not in a: print(i, ':', Exterior1st[i])

Exterior1st has 1460 observations. See below the Exterior1st not found in the training data:
Other : Other
PreCast : PreCast

Exterior2nd has 1460 observations. See below the Exterior2nd not found in the training data:
BrkComm : Brick Common
CemntBd : Cement Board
PreCast : PreCast
WdShing : Wood Shingles


### Attention: ExterQual, ExterCond, KitchenQual, FireplaceQu, GarageCond and HeatingQC are ordinals

In [222]:
# ExterQual: Evaluates the quality of the material on the exterior 
ExterQual = {}
ExterQual['Ex'] = 'Excellent'
ExterQual['Gd'] = 'Good'
ExterQual['TA'] = 'Average/Typical'
ExterQual['Fa'] = 'Fair'
ExterQual['Po'] = 'Poor'

a = np.sort(data.ExterQual.unique())
print('ExterQual has',data.ExterQual.count(),'observations. See below the ExterQual not found in the training data:')
for i in  ExterQual.keys():
   if [i] not in a: print(i, ':', ExterQual[i])
print()

# ExterCond: Evaluates the present condition of the material on the exterior
a = np.sort(data.ExterCond.unique())
print('ExterQual has',data.ExterCond.count(),'observations.')
for i in  ExterQual.keys():
   if [i] not in a: print(i, ':', ExterQual[i])

print()
#HeatingQC: Heating quality and condition
a = np.sort(data.HeatingQC.unique())
print('HeatingQC has',data.HeatingQC.count(),'observations.')
for i in  ExterQual.keys():
   if [i] not in a: print(i, ':', ExterQual[i])
    
print()
# KitchenQual: Kitchen quality
a = np.sort(data.KitchenQual.unique())
print('KitchenQual has',data.KitchenQual.count(),'observations. See below the ExterQual not found in the training data:')
for i in  ExterQual.keys():
   if [i] not in a: print(i, ':', ExterQual[i])
    
print()
# FireplaceQu: Fireplace quality
a = (data.FireplaceQu.unique())
print('FireplaceQu has',data.FireplaceQu.count(),'observations!')
for i in  ExterQual.keys():
   if [i] not in a: print(i, ':', ExterQual[i])
    
print()
# GarageCond: Garage Conditionals
a = (data.GarageCond.unique())
print('GarageCond has',data.GarageCond.count(),'observations!')
for i in  ExterQual.keys():
   if [i] not in a: print(i, ':', ExterQual[i])

ExterQual has 1460 observations. See below the ExterQual not found in the training data:
Po : Poor

ExterQual has 1460 observations.

HeatingQC has 1460 observations.

KitchenQual has 1460 observations. See below the ExterQual not found in the training data:
Po : Poor

FireplaceQu has 770 observations!

GarageCond has 1379 observations!


### PavedDrive: Paved driveway is Ordinal!

In [221]:
PavedDrive = {}
PavedDrive['Y'] = 'Paved'
PavedDrive['P'] = 'Partial Pavement'
PavedDrive['N'] = 'Dirt/Gravel'

a = np.sort(data.PavedDrive.unique())
print('PavedDrive has',data.PavedDrive.count(),'observations.')
for i in  PavedDrive.keys():
   if [i] not in a: print(i, ':', PavedDrive[i])

PavedDrive has 1460 observations.


In [198]:
# SaleType: Type of sale
SaleType = {}
SaleType['WD'] = 'Warranty Deed - Conventional'
SaleType['CWD'] = 'Warranty Deed - Cash'
SaleType['VWD'] = 'Warranty Deed - VA Loan'
SaleType['New'] = 'Home just constructed and sold'
SaleType['COD'] = 'Court Officer Deed/Estate'
SaleType['Con'] = 'Contract 0,15 Down payment regular terms'
SaleType['ConLw'] = 'Contract Low Down payment and low interest'
SaleType['ConLI'] = 'Contract Low Interest'
SaleType['ConLD'] = 'Contract Low Down'
SaleType['Oth'] = 'Other'

a = np.sort(data.SaleType.unique())
print('SaleType has',data.SaleType.count(),'observations. See below the SaleType not found in the training data:')
for i in  SaleType.keys():
   if [i] not in a: print(i, ':', SaleType[i])


SaleType has 1460 observations. See below the SaleType not found in the training data:
VWD : Warranty Deed - VA Loan
