# Classifying housing as overpriced - finding the optimal algorithm
This notebook determines which alternative model may be the best fit for the data.

In [None]:
import pandas as pd


# Load the iteration-6 housing classification data (path is relative to the working directory).
data = pd.read_csv('housing-iter-6/housing-classification-iter-6.csv')
# Display the column index as the cell output.
data.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'Expensive',
       'MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir',
       'Foundation', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'KitchenQual', 'FireplaceQu',
       'MSSubClass', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
       'FullBath', 'HalfBath', 'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt',
       'GarageArea', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal',
       'MoSold', 'YrSold', 'Id', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrTyp

## Data Description

LotFrontage: Linear feet of street connected to property

LotArea: Lot size in square feet

TotalBsmtSF: Total square feet of basement area

BedroomAbvGr: Bedrooms above grade (does NOT include basement bedrooms)

Fireplaces: Number of fireplaces

PoolArea: Pool area in square feet

GarageCars: Size of garage in car capacity

WoodDeckSF: Wood deck area in square feet

ScreenPorch: Screen porch area in square feet

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM	Residential Medium Density

Condition1: Proximity to various conditions
	
       Artery	Adjacent to arterial street
       Feedr	Adjacent to feeder street	
       Norm	Normal	
       RRNn	Within 200' of North-South Railroad
       RRAn	Adjacent to North-South Railroad
       PosN	Near positive off-site feature--park, greenbelt, etc.
       PosA	Adjacent to positive off-site feature
       RRNe	Within 200' of East-West Railroad
       RRAe	Adjacent to East-West Railroad

Heating: Type of heating
		
       Floor	Floor Furnace
       GasA	Gas forced warm air furnace
       GasW	Gas hot water or steam heat
       Grav	Gravity furnace	
       OthW	Hot water or steam heat other than gas
       Wall	Wall furnace

Street: Type of road access to property

       Grvl	Gravel	
       Pave	Paved

CentralAir: Central air conditioning

       N	No
       Y	Yes

Foundation: Type of foundation
		
       BrkTil	Brick & Tile
       CBlock	Cinder Block
       PConc	Poured Concrete	
       Slab	Slab
       Stone	Stone
       Wood	Wood

ExterQual: Evaluates the quality of the material on the exterior 
		
       Ex	Excellent
       Gd	Good
       TA	Average/Typical
       Fa	Fair
       Po	Poor
		
ExterCond: Evaluates the present condition of the material on the exterior
		
       Ex	Excellent
       Gd	Good
       TA	Average/Typical
       Fa	Fair
       Po	Poor

BsmtQual: Evaluates the height of the basement

       Ex	Excellent (100+ inches)	
       Gd	Good (90-99 inches)
       TA	Typical (80-89 inches)
       Fa	Fair (70-79 inches)
       Po	Poor (<70 inches)
       NA	No Basement
		
BsmtCond: Evaluates the general condition of the basement

       Ex	Excellent
       Gd	Good
       TA	Typical - slight dampness allowed
       Fa	Fair - dampness or some cracking or settling
       Po	Poor - Severe cracking, settling, or wetness
       NA	No Basement
	
BsmtExposure: Refers to walkout or garden level walls

       Gd	Good Exposure
       Av	Average Exposure (split levels or foyers typically score average or above)	
       Mn	Minimum Exposure
       No	No Exposure
       NA	No Basement
	
BsmtFinType1: Rating of basement finished area

       GLQ	Good Living Quarters
       ALQ	Average Living Quarters
       BLQ	Below Average Living Quarters	
       Rec	Average Rec Room
       LwQ	Low Quality
       Unf	Unfinished
       NA	No Basement

KitchenQual: Kitchen quality

       Ex	Excellent
       Gd	Good
       TA	Typical/Average
       Fa	Fair
       Po	Poor

FireplaceQu: Fireplace quality

       Ex	Excellent - Exceptional Masonry Fireplace
       Gd	Good - Masonry Fireplace in main level
       TA	Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
       Fa	Fair - Prefabricated Fireplace in basement
       Po	Poor - Ben Franklin Stove
       NA	No Fireplace

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

OverallQual: Rates the overall material and finish of the house

       10 Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average
       5	Average
       4	Below Average
       3	Fair
       2	Poor
       1	Very Poor
	
OverallCond: Rates the overall condition of the house

       10 Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average	
       5	Average
       4	Below Average	
       3	Fair
       2	Poor
       1	Very Poor
		
YearBuilt: Original construction date

YearRemodAdd: Remodel date (same as construction date if no remodeling or additions)

MasVnrArea: Masonry veneer area in square feet

BsmtFinSF1: Type 1 finished square feet

BsmtFinSF2: Type 2 finished square feet

BsmtUnfSF: Unfinished square feet of basement area

1stFlrSF: First Floor square feet
 
2ndFlrSF: Second floor square feet

LowQualFinSF: Low quality finished square feet (all floors)

GrLivArea: Above grade (ground) living area square feet

1stFlrSF: First Floor square feet
 
2ndFlrSF: Second floor square feet

LowQualFinSF: Low quality finished square feet (all floors)

GrLivArea: Above grade (ground) living area square feet

BsmtFullBath: Basement full bathrooms

BsmtHalfBath: Basement half bathrooms

FullBath: Full bathrooms above grade

HalfBath: Half baths above grade

KitchenAbvGr: Kitchens above grade

TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)

GarageYrBlt: Year garage was built

GarageArea: Size of garage in square feet

OpenPorchSF: Open porch area in square feet

EnclosedPorch: Enclosed porch area in square feet

3SsnPorch: Three season porch area in square feet

MiscVal: $Value of miscellaneous feature

MoSold: Month Sold (MM)

YrSold: Year Sold (YYYY)

### Thoughts on the features and how ordinal they are
```
MSZoning: Identifies the general zoning classification of the sale.
		
       A, C, I are separate
       FV, RP, RH, RM, RL 	could be a nice ordinal encoding

Condition1: Proximity to various conditions
	prob not an ordinal

Heating: Type of heating
		
       prob not an ordinal

Street: Type of road access to property

       Binary --> not an ordinal

CentralAir: Central air conditioning

       Binary --> not an ordinal

Foundation: Type of foundation
		
       prob not an ordinal

ExterQual: Evaluates the quality of the material on the exterior 
		
       Ex, Gd, TA, Fa, Po --> nice ordinal
		
ExterCond: Evaluates the present condition of the material on the exterior
		
       Ex, Gd, TA, Fa, Po --> nice ordinal

BsmtQual: Evaluates the height of the basement

       Ex, Gd, TA, Fa, Po, NA --> nice ordinal
		
BsmtCond: Evaluates the general condition of the basement

       Ex, Gd, TA, Fa, Po, NA --> nice ordinal
	
BsmtExposure: Refers to walkout or garden level walls

       Gd, Av, Mn, No, NA --> nice ordinal
	
BsmtFinType1: Rating of basement finished area

       GLQ, ALQ, BLQ, Rec, LwQ, Unf, NA --> nice ordinal

KitchenQual: Kitchen quality

       Ex, Gd, TA, Fa, Po --> nice ordinal

FireplaceQu: Fireplace quality

       Ex, Gd, TA, Fa, Po, NA --> nice ordinal
```

In [None]:
# Separate the target column from the features.
# NOTE: pop() removes 'Expensive' from `data` in place, so re-running this cell
# without first reloading the CSV raises a KeyError.
y = data.pop('Expensive')
print(y.tolist())

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 

In [None]:
# X is a second name for the same DataFrame object as `data` (no copy is made).
# The 'Expensive' target was already popped off; the 'Id' column is still present.
X = data
X.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,8450,65.0,856,3,0,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,9600,80.0,1262,3,1,0,2,298,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,11250,68.0,920,3,1,0,2,0,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,9550,60.0,756,3,1,0,3,0,0,RL,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,14260,84.0,1145,4,1,0,3,192,0,RL,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


## Basic exploration
See version 0 and 1 of this Notebook

## Split into training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=31416,
)

# 'Id' is popped out of both feature frames so it is not used as a feature;
# the values are kept in ids_train / ids_test (and the extra alias `ids`).
ids_test = X_test.pop('Id')
ids = X_train.pop('Id')
ids_train = ids

## Baseline Model
Our intuition based model returns an accuracy score of (train, test) = (0.85, 0.87)

## Create a Data Pipeline

### Setup pipelines for numerical data

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Names of all numeric columns remaining in the training features.
numerical_features = X_train.select_dtypes(include=["number"]).columns.tolist()

# Numeric columns only get their missing values replaced by the column mean.
numerical_pipe = Pipeline(steps=[
    ("numerical_imputer", SimpleImputer(strategy="mean")),
])


### Setup pipeline for categorical data
1. Impute N/A with a constant string
2. One_hot encode the categorical columns (drop first)

#### Split ordinal and non ordinal categorical columns

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import set_config

# All non-numeric columns. Only the column order matters here: the encoders
# below address columns by position because the imputer that runs before them
# in the categorical pipeline outputs a plain array without column names.
X_cat = X.select_dtypes(exclude="number").copy()


def _column_positions(column_names):
    """Return the integer positions of ``column_names`` within ``X_cat.columns``.

    ``Index.get_indexer`` reports a missing label as -1, which a
    ColumnTransformer would silently interpret as "the last column".
    Raise instead so a typo or a dropped column is noticed immediately.

    Raises:
        KeyError: if any of ``column_names`` is not a column of ``X_cat``.
    """
    positions = X_cat.columns.get_indexer(column_names)
    missing = [name for name, pos in zip(column_names, positions) if pos < 0]
    if missing:
        raise KeyError(f"Categorical columns not found in the data: {missing}")
    return positions


one_hot_features = _column_positions(['MSZoning', 'Condition1', 'Heating', 'Street', 'CentralAir', 'Foundation'])
ordinal_quality_features = _column_positions(['ExterQual', 'ExterCond'])
ordinal_quality_na_features = _column_positions(['BsmtQual', 'BsmtCond', 'KitchenQual', 'FireplaceQu'])
basement_exposure_features = _column_positions(['BsmtExposure'])
basement_finished_quality_features = _column_positions(['BsmtFinType1'])

# Category orderings, listed from best to worst; 'NA' (feature absent) comes last.
# OrdinalEncoder expects one category list per encoded column.
quality_grade_cat = ["Ex", "Gd", "TA", "Fa", "Po"]
quality_grade_categories = [quality_grade_cat for _ in ordinal_quality_features]
quality_grade_na_cat = quality_grade_cat + ['NA']
quality_grade_na_categories = [quality_grade_na_cat for _ in ordinal_quality_na_features]
basement_exposure_categories = ["Gd", "Av", "Mn", "No", "NA"]
basement_finished_quality_categories = ["GLQ", "ALQ", "BLQ", "Rec", "LwQ", "Unf", "NA"]

# Ordinal-encode the graded columns and one-hot encode the nominal ones.
# Columns not listed in any transformer are dropped (ColumnTransformer default).
categorical_encoder = ColumnTransformer(
    transformers=[
        ("ordinal_quality_features", OrdinalEncoder(categories=quality_grade_categories), ordinal_quality_features),
        ("ordinal_quality_na_features", OrdinalEncoder(categories=quality_grade_na_categories), ordinal_quality_na_features),
        ("basement_exposure_features", OrdinalEncoder(categories=[basement_exposure_categories]), basement_exposure_features),
        ("basement_finished_wquality_features", OrdinalEncoder(categories=[basement_finished_quality_categories]), basement_finished_quality_features),
        ("one_hot_features", OneHotEncoder(drop="first", handle_unknown='ignore'), one_hot_features),
    ]
)

In [None]:
# Count how many rows share each row-wise pattern of missing (True) / present (False)
# values across the categorical columns.
X_cat.isna().value_counts()

MSZoning  Condition1  Heating  Street  CentralAir  Foundation  ExterQual  ExterCond  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  KitchenQual  FireplaceQu  Alley  LotShape  LandContour  Utilities  LotConfig  LandSlope  Neighborhood  Condition2  BldgType  HouseStyle  RoofStyle  RoofMatl  Exterior1st  Exterior2nd  MasVnrType  BsmtFinType2  HeatingQC  Electrical  Functional  GarageType  GarageFinish  GarageQual  GarageCond  PavedDrive  PoolQC  Fence  MiscFeature  SaleType  SaleCondition
False     False       False    False   False       False       False      False      False     False     False         False         False        False        True   False     False        False      False      False      False         False       False     False       False      False     False        False        False       False         False      False       False       False       False         False       False       False       True    True   True         False     False            588
         

In [None]:
# Show the column positions resolved for the first three encoder groups, one per line.
print(
    one_hot_features,
    ordinal_quality_features,
    ordinal_quality_na_features,
    sep="\n",
)


[0 1 2 3 4 5]
[6 7]
[ 8  9 12 13]


In [None]:
# Show the category orderings handed to the ordinal encoders, one per line.
print(
    quality_grade_cat,
    quality_grade_na_cat,
    quality_grade_categories,
    quality_grade_na_categories,
    sep="\n",
)


['Ex', 'Gd', 'TA', 'Fa', 'Po']
['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']
[['Ex', 'Gd', 'TA', 'Fa', 'Po'], ['Ex', 'Gd', 'TA', 'Fa', 'Po']]
[['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA'], ['Ex', 'Gd', 'TA', 'Fa', 'Po', 'NA']]


In [None]:


# Fill missing categories with the literal string "NA" (the same label used in
# the ordinal category lists), then pass the imputed columns on to the encoders.
categorical_pipe = Pipeline(steps=[
    ("categorical_imputer", SimpleImputer(strategy="constant", fill_value="NA")),
    ("categorical_encode", categorical_encoder),
])

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Send the numeric columns through the numeric pipeline and every non-numeric
# column through the categorical pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical_preprocessing", numerical_pipe, numerical_features),
        ("categorical_preprocessing", categorical_pipe, X_cat.columns),
    ],
)

def knn_pipe(preprocessor, scaler):
    """Assemble the KNN pipeline: preprocess -> scale -> KNeighborsClassifier.

    The step names ('preprocessor', 'scale', 'knn_classifier') are the
    prefixes used when addressing hyper-parameters in a parameter grid.
    """
    steps = [
        ("preprocessor", preprocessor),
        ('scale', scaler),
        ("knn_classifier", KNeighborsClassifier()),
    ]
    return Pipeline(steps)



In [None]:
from sklearn.model_selection import GridSearchCV

def knn_search(preprocessor, scaler, scaler_grid):
    """Build an unfitted grid search over the KNN pipeline.

    Args:
        preprocessor: transformer used as the first pipeline step.
        scaler: transformer used as the 'scale' step.
        scaler_grid: dict of extra grid entries (e.g. 'scale__...' keys) that
            is merged with the KNN grid.

    Returns:
        A GridSearchCV using 5-fold cross-validation scored on accuracy.
    """
    knn_grid = {
        'knn_classifier__n_neighbors': range(1, 15),
        'knn_classifier__weights': ['uniform', 'distance'],
    }
    return GridSearchCV(
        estimator=knn_pipe(preprocessor, scaler),
        param_grid=knn_grid | scaler_grid,
        cv=5,                 # K in K-fold cross-validation
        scoring='accuracy',
        verbose=1,            # report progress while fitting
        n_jobs=4,             # fit on 4 cores in parallel
        error_score='raise',  # fail loudly instead of scoring NaN
    )

In [None]:
# Let the grid search decide whether centring and/or unit-variance scaling helps.
scaler_grid = {
    'scale__with_mean': [True, False],
    'scale__with_std': [True, False],
}

search_knn = knn_search(preprocessor, StandardScaler(), scaler_grid)
search_knn.fit(X_train, y_train)

Fitting 5 folds for each of 112 candidates, totalling 560 fits


GridSearchCV(cv=5, error_score='raise',
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('numerical_preprocessing',
                                                                         Pipeline(steps=[('numerical_imputer',
                                                                                          SimpleImputer())]),
                                                                         ['LotArea',
                                                                          'LotFrontage',
                                                                          'TotalBsmtSF',
                                                                          'BedroomAbvGr',
                                                                          'Fireplaces',
                                                                          'PoolArea',
                                                        

What are the best parameters, and what is their mean cross-validation score?

In [None]:
# Best hyper-parameter combination found by the search and its cross-validated accuracy.
search_knn.best_params_, search_knn.best_score_


({'knn_classifier__n_neighbors': 7,
  'knn_classifier__weights': 'distance',
  'scale__with_mean': True,
  'scale__with_std': True},
 0.9383514911411905)

### Apply to the full training data

In [None]:
from sklearn.metrics import accuracy_score

def calc_accuracy(search):
    """Return ``(train_accuracy, test_accuracy)`` for a fitted estimator.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``; ``search`` only needs a ``predict`` method.
    """
    train_accuracy = accuracy_score(y_train, search.predict(X_train))
    test_accuracy = accuracy_score(y_test, search.predict(X_test))
    return train_accuracy, test_accuracy


# Score the fitted KNN search and show (train accuracy, test accuracy, absolute gap).
train_accuracy, test_accuracy = calc_accuracy(search_knn)
train_accuracy, test_accuracy, abs(train_accuracy - test_accuracy)



(1.0, 0.9417808219178082, 0.05821917808219179)

For comparison, this is the best model I found using only the numerical features:

```
({'dtree_classifier__criterion': 'entropy',
  'dtree_classifier__max_depth': 7,
  'dtree_classifier__min_samples_leaf': 5,
  'dtree_classifier__min_samples_split': 28},
(0.9332191780821918, 0.934931506849315, 0.001712328767123239)
```
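The comparison is mixed. Training accuracy rose to 1.0 and test accuracy improved slightly (0.942 vs. 0.935), but the gap between the two grew from 0.002 to 0.058, which suggests the KNN model is more overfitted than the decision tree. Note that a training accuracy of 1.0 is expected with `weights='distance'`, because every training point is its own nearest neighbour at distance zero; the cross-validation score (0.938) is therefore the more meaningful figure.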

### Logistic regression

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

def log_reg_pipe(preprocessor, scaler, max_iter=1000):
    """Assemble the logistic-regression pipeline: preprocess -> scale -> classifier.

    Args:
        preprocessor: transformer used as the first pipeline step.
        scaler: transformer used as the 'scale' step.
        max_iter: iteration cap for the solver. scikit-learn's default of 100
            is often too low for the 'sag'/'saga' solvers and for unscaled
            features, which ends in ConvergenceWarnings and a poorly fitted
            model, so a higher cap is used by default.

    Returns:
        An unfitted Pipeline whose final step is named 'log_reg_classifier'.
    """
    log_regression = LogisticRegression(max_iter=max_iter)

    return Pipeline([("preprocessor", preprocessor), ('scale', scaler), ("log_reg_classifier", log_regression)])



def log_reg_search(preprocessor, scaler, scaler_grid):
    """Build an unfitted grid search over the logistic-regression pipeline.

    Args:
        preprocessor: transformer used as the first pipeline step.
        scaler: transformer used as the 'scale' step.
        scaler_grid: dict of extra grid entries (e.g. 'scale__...' keys) that
            is merged with the solver grid.

    Returns:
        A GridSearchCV using 5-fold cross-validation scored on accuracy.
    """
    solver_grid = {
        'log_reg_classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    }
    return GridSearchCV(
        estimator=log_reg_pipe(preprocessor, scaler),
        param_grid=solver_grid | scaler_grid,
        cv=5,                 # K in K-fold cross-validation
        scoring='accuracy',
        verbose=1,            # report progress while fitting
        n_jobs=4,             # fit on 4 cores in parallel
        error_score='raise',  # fail loudly instead of scoring NaN
    )

# Let the grid search decide whether centring and/or unit-variance scaling helps.
scaler_grid = {
    'scale__with_mean': [True, False],
    'scale__with_std': [True, False],
}

# The search builder defined in this notebook is `log_reg_search`; the previous
# call to `lin_reg_search` referred to a name that does not exist (NameError).
# The result variable keeps its existing name although the model is a
# logistic regression.
search_lin_reg = log_reg_search(preprocessor, StandardScaler(), scaler_grid)
search_lin_reg.fit(X_train, y_train)

# Show (train accuracy, test accuracy, absolute gap) for the logistic regression.
train_accuracy, test_accuracy = calc_accuracy(search_lin_reg)
train_accuracy, test_accuracy, abs(train_accuracy - test_accuracy)

SyntaxError: closing parenthesis ')' does not match opening parenthesis '[' (228618108.py, line 14)

In [None]:
import numpy as np 

# Competition features; path is relative to the working directory.
X_competition = pd.read_csv('test-housing-classification.csv')

# Move the ids out of the feature frame and use them as the first column of the submission.
result_competition = pd.DataFrame(X_competition.pop('Id'))

# Predict with the fitted KNN grid search. The name `search` used here before
# is not defined anywhere in this notebook, so the cell failed on a fresh kernel.
y_competition_pred = search_knn.predict(X_competition)

result_competition['Expensive'] = y_competition_pred



In [None]:
# Write the predictions to CSV without the DataFrame index column.
result_competition.to_csv('competition_submision.csv', index=False)
# Display the submission table as the cell output.
result_competition


Unnamed: 0,Id,Expensive
0,1461,0
1,1462,0
2,1463,0
3,1464,0
4,1465,0
...,...,...
1454,2915,0
1455,2916,0
1456,2917,0
1457,2918,0
