<a href="https://colab.research.google.com/github/marcelorubino84/MLHousingPricesProject/blob/main/Housing_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Importing all the necessary libraries


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt


# 2. Importing the data

In [None]:
# Mount Google Drive at /content/gdrive so the CSV files stored under MyDrive
# can be read by the cells that follow.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Load the housing data used for training from the mounted Drive.
train_csv_path = '/content/gdrive/MyDrive/housing-classification-iter6.csv'
df_houses = pd.read_csv(train_csv_path)

# 3. Exploring the data

In [None]:
# Remove the column limit so wide DataFrames are displayed in full
pd.options.display.max_columns = None

In [None]:
# Count the missing (NaN) values in every column
df_houses.isna().sum()

LotArea             0
LotFrontage       259
TotalBsmtSF         0
BedroomAbvGr        0
Fireplaces          0
                 ... 
PoolQC           1453
Fence            1179
MiscFeature      1406
SaleType            0
SaleCondition       0
Length: 81, dtype: int64

In [None]:
# Count the rows that are exact duplicates of an earlier row
df_houses.duplicated().sum()

0

In [None]:
# Show each column's dtype and non-null count
df_houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   Expensive      1460 non-null   int64  
 10  MSZoning       1460 non-null   object 
 11  Condition1     1460 non-null   object 
 12  Heating        1460 non-null   object 
 13  Street         1460 non-null   object 
 14  CentralAir     1460 non-null   object 
 15  Foundation     1460 non-null   object 
 16  ExterQual      1460 non-null   object 
 17  ExterCond      1460 non-null   object 
 18  BsmtQual

In [None]:
# Since there is a huge amount of missing data in the columns below, they are
# removed instead of being imputed.
# errors="ignore" keeps this cell re-runnable: df_houses is reassigned here, so
# a second run would otherwise raise KeyError because the columns are already gone.
sparse_columns = ["MiscFeature", 'PoolQC', 'Fence', 'Alley', 'FireplaceQu']
df_houses = df_houses.drop(columns=sparse_columns, errors="ignore")

# 3. Defining the feature vectors and the target column for the model

In [None]:
# Separate the target: pop() removes 'Expensive' from df_houses in place and
# returns it as a Series.
# Because df_houses is mutated, re-running this cell without reloading the data
# raises KeyError.
y = df_houses.pop('Expensive')

In [None]:
# Everything left in df_houses is used as the feature matrix.
# X is a second name for the same DataFrame object, not a copy.
# NOTE(review): the feature list includes 'Id', which looks like a row
# identifier rather than a property of the house — confirm it should be used.
X = df_houses

In [None]:
# List the feature names that go into the model
X.columns

Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSZoning',
       'Condition1', 'Heating', 'Street', 'CentralAir', 'Foundation',
       'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'KitchenQual', 'MSSubClass', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'GarageYrBlt', 'GarageArea',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'MiscVal', 'MoSold',
       'YrSold', 'Id', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition2', 'BldgType', 'HouseStyle',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'BsmtFinType2', 'HeatingQC', 'Ele

In [None]:
# Inspect the target Series
y

0       0
1       0
2       0
3       0
4       0
       ..
1455    0
1456    0
1457    1
1458    0
1459    0
Name: Expensive, Length: 1460, dtype: int64

# 4. Splitting the data into train and test

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=123)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Preview the first rows of the training features
X_train.head()

Unnamed: 0,LotArea,LotFrontage,TotalBsmtSF,BedroomAbvGr,Fireplaces,PoolArea,GarageCars,WoodDeckSF,ScreenPorch,MSZoning,Condition1,Heating,Street,CentralAir,Foundation,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,KitchenQual,MSSubClass,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,KitchenAbvGr,TotRmsAbvGrd,GarageYrBlt,GarageArea,OpenPorchSF,EnclosedPorch,3SsnPorch,MiscVal,MoSold,YrSold,Id,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition2,BldgType,HouseStyle,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,BsmtFinType2,HeatingQC,Electrical,Functional,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,SaleType,SaleCondition
318,9900,90.0,1347,4,1,0,3,340,0,RL,Norm,GasA,Pave,Y,PConc,Gd,TA,Gd,TA,Gd,GLQ,Gd,60,7,5,1993,1993,256.0,987,0,360,1372,1274,0,2646,1,0,2,1,1,9,1993.0,656,60,144,0,0,4,2009,319,Reg,Low,AllPub,Inside,Mod,NoRidge,Norm,1Fam,2Story,Gable,CompShg,HdBoard,HdBoard,BrkFace,Unf,Ex,SBrkr,Typ,Attchd,RFn,TA,TA,Y,WD,Normal
580,14585,,1144,3,2,0,2,216,0,RL,Norm,GasA,Pave,Y,CBlock,TA,TA,TA,TA,No,BLQ,Gd,20,6,6,1960,1987,85.0,594,219,331,1429,0,0,1429,0,1,1,0,1,7,1960.0,572,110,0,0,0,6,2007,581,IR1,Lvl,AllPub,CulDSac,Gtl,NAmes,Norm,1Fam,1Story,Gable,CompShg,Wd Sdng,Wd Sdng,BrkFace,Rec,Ex,SBrkr,Typ,Attchd,Unf,TA,TA,Y,WD,Normal
961,12227,,1330,4,1,0,2,550,0,RL,PosN,GasA,Pave,Y,CBlock,TA,Gd,Gd,Gd,No,ALQ,TA,60,6,7,1977,1995,424.0,896,0,434,1542,1330,0,2872,1,0,2,1,1,11,1977.0,619,282,0,0,0,7,2008,962,IR1,Lvl,AllPub,Corner,Gtl,NWAmes,Norm,1Fam,2Story,Gable,CompShg,HdBoard,HdBoard,BrkFace,Unf,TA,SBrkr,Typ,Attchd,Fin,TA,TA,Y,WD,Normal
78,10778,72.0,1768,4,0,0,0,0,0,RL,Norm,GasA,Pave,N,CBlock,TA,TA,TA,TA,No,Unf,TA,90,4,5,1968,1968,0.0,0,0,1768,1768,0,0,1768,0,0,2,0,2,8,,0,0,0,0,0,4,2010,79,Reg,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Duplex,1Story,Hip,CompShg,HdBoard,HdBoard,,Unf,TA,SBrkr,Typ,,,,,Y,WD,Normal
5,14115,85.0,796,1,0,0,2,40,0,RL,Norm,GasA,Pave,Y,Wood,TA,TA,Gd,TA,No,GLQ,TA,50,5,5,1993,1995,0.0,732,0,64,796,566,0,1362,1,0,1,1,1,5,1993.0,480,30,0,320,700,10,2009,6,IR1,Lvl,AllPub,Inside,Gtl,Mitchel,Norm,1Fam,1.5Fin,Gable,CompShg,VinylSd,VinylSd,,Unf,Ex,SBrkr,Typ,Attchd,Unf,TA,TA,Y,WD,Normal


In [None]:
# Preview the first training labels (same row index as X_train)
y_train.head()

318    1
580    0
961    1
78     0
5      0
Name: Expensive, dtype: int64

# 5. Creating the numeric & categorical pipelines

In [None]:
# Split the feature names by dtype so each group gets its own preprocessing.
# Only the column labels are needed, so the selected sub-frames are not copied.
X_cat_columns = X.select_dtypes(exclude="number").columns
X_num_columns = X.select_dtypes(include="number").columns

# Numeric columns: fill missing values with the column mean.
numeric_pipe = make_pipeline(
    SimpleImputer(strategy="mean"))

# Categorical columns: replace missing values with the placeholder "N_A",
# then one-hot encode. handle_unknown='ignore' stops the encoder from raising
# on categories that were not present when it was fitted.
categoric_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="N_A"),
    OneHotEncoder(handle_unknown='ignore'))

In [None]:
# Send each column group through its own pipeline. The names "num_pipe" and
# "cat_pipe" are what the grid-search parameter keys refer to.
column_steps = [
    ("num_pipe", numeric_pipe, X_num_columns),
    ("cat_pipe", categoric_pipe, X_cat_columns),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# 6. Creating the full_pipeline (preprocessor + Random Forest)


In [None]:
# Full pipeline: preprocessing followed by a Random Forest classifier.
# A fixed random_state makes the forest, and therefore the search result,
# repeatable between runs.
full_pipeline = make_pipeline(preprocessor,
                              RandomForestClassifier(random_state=123))

# Hyperparameter grid. Keys are built from make_pipeline's auto-generated step
# names (lower-cased class names) joined with double underscores.
param_grid = {
    "columntransformer__num_pipe__simpleimputer__strategy": ["mean", "median"],
    "randomforestclassifier__n_estimators": [80, 90, 100, 115, 120, 130],
    "randomforestclassifier__min_samples_split": range(2, 10),
    "randomforestclassifier__min_samples_leaf": range(1, 10),
    # 'auto' is not listed: for classifiers it meant the same as 'sqrt', and
    # recent scikit-learn releases no longer accept it.
    "randomforestclassifier__max_features": ['log2', 'sqrt'],
    # Real booleans are required. The strings 'True' and 'False' are both
    # truthy, so bootstrapping was never actually switched off (and newer
    # scikit-learn releases reject string values for this parameter).
    "randomforestclassifier__bootstrap": [True, False],
}

# Exhaustive search over the grid with 3-fold cross-validation, using all
# available CPU cores and printing a one-line progress summary.
search = GridSearchCV(full_pipeline,
                      param_grid,
                      cv=3,
                      verbose=1,
                      n_jobs=-1)

In [None]:
# Run the grid search on the training split only; every parameter combination
# is evaluated with 3-fold cross-validation.
search.fit(X_train, y_train)

Fitting 3 folds for each of 5184 candidates, totalling 15552 fits


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(transformers=[('num_pipe',
                                                                         Pipeline(steps=[('simpleimputer',
                                                                                          SimpleImputer())]),
                                                                         Index(['LotArea', 'LotFrontage', 'TotalBsmtSF', 'BedroomAbvGr', 'Fireplaces',
       'PoolArea', 'GarageCars', 'WoodDeckSF', 'ScreenPorch', 'MSSubClass',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAd...
             param_grid={'columntransformer__num_pipe__simpleimputer__strategy': ['mean',
                                                                                  'median'],
                         'randomforestclassifier__bootstrap': ['True', 'False'],
                         'randomforestclassifier__max_feature

In [None]:
# Hyperparameter combination that achieved the best cross-validated score
search.best_params_

{'columntransformer__num_pipe__simpleimputer__strategy': 'median',
 'randomforestclassifier__bootstrap': 'False',
 'randomforestclassifier__max_features': 'sqrt',
 'randomforestclassifier__min_samples_leaf': 4,
 'randomforestclassifier__min_samples_split': 7,
 'randomforestclassifier__n_estimators': 90}

In [None]:
# Mean cross-validated accuracy of the best combination
# (computed on the training folds, not on the held-out test split)
search.best_score_

0.9520664425548744

In [None]:
# Accuracy of the best model found by the search on the held-out test split.
# accuracy_score is declared as (y_true, y_pred); keywords keep the two roles
# explicit instead of passing the predictions in the y_true position.
accuracy_score(y_true=y_test, y_pred=search.predict(X_test))

0.9554794520547946

# 7. Checking the accuracy on the test set

In [None]:
# Predict the labels of the held-out test split with the best model found by the search
y_predict_test = search.predict(X_test)

In [None]:
# Share of test rows whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=y_predict_test)

0.9554794520547946

In [None]:
# Accuracy of the best model found by the search on the held-out test split.
# accuracy_score is declared as (y_true, y_pred); keywords keep the two roles
# explicit instead of passing the predictions in the y_true position.
accuracy_score(y_true=y_test, y_pred=search.predict(X_test))

0.9554794520547946

# 8. Using the 'test.csv' file to make the prediction

In [None]:
# Load the unlabeled data that predictions will be made for.
test_csv_path = '/content/gdrive/MyDrive/test.csv'
df_test = pd.read_csv(test_csv_path)

In [None]:
# Inspect the columns, dtypes and non-null counts of the raw prediction data
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     1459 non-null   int64  
 1   LotArea        1459 non-null   int64  
 2   LotFrontage    1232 non-null   float64
 3   TotalBsmtSF    1458 non-null   float64
 4   BedroomAbvGr   1459 non-null   int64  
 5   Fireplaces     1459 non-null   int64  
 6   PoolArea       1459 non-null   int64  
 7   GarageCars     1458 non-null   float64
 8   WoodDeckSF     1459 non-null   int64  
 9   ScreenPorch    1459 non-null   int64  
 10  MSZoning       1455 non-null   object 
 11  Condition1     1459 non-null   object 
 12  Heating        1459 non-null   object 
 13  Street         1459 non-null   object 
 14  CentralAir     1459 non-null   object 
 15  Foundation     1459 non-null   object 
 16  ExterQual      1459 non-null   object 
 17  ExterCond      1459 non-null   object 
 18  BsmtQual

In [None]:
# Independent copy of the training features (df_houses no longer contains
# 'Expensive' at this point).
# NOTE(review): no later cell visible here reads df_predict_copy — confirm
# whether it is still needed.
df_predict_copy = df_houses.copy()

In [None]:
# Column overview of the training features, for comparison with df_test
df_predict_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1460 non-null   int64  
 1   LotFrontage    1201 non-null   float64
 2   TotalBsmtSF    1460 non-null   int64  
 3   BedroomAbvGr   1460 non-null   int64  
 4   Fireplaces     1460 non-null   int64  
 5   PoolArea       1460 non-null   int64  
 6   GarageCars     1460 non-null   int64  
 7   WoodDeckSF     1460 non-null   int64  
 8   ScreenPorch    1460 non-null   int64  
 9   MSZoning       1460 non-null   object 
 10  Condition1     1460 non-null   object 
 11  Heating        1460 non-null   object 
 12  Street         1460 non-null   object 
 13  CentralAir     1460 non-null   object 
 14  Foundation     1460 non-null   object 
 15  ExterQual      1460 non-null   object 
 16  ExterCond      1460 non-null   object 
 17  BsmtQual       1423 non-null   object 
 18  BsmtCond

In [None]:
# Remove the same sparse columns that were removed from the training data.
# errors="ignore" keeps this cell re-runnable: df_test is reassigned here, so a
# second run would otherwise raise KeyError because the columns are already gone.
sparse_columns = ["MiscFeature", 'PoolQC', 'Fence', 'Alley', 'FireplaceQu']
df_test = df_test.drop(columns=sparse_columns, errors="ignore")

In [None]:
# The training frame has no 'Unnamed: 0' column, so it is removed here
# (presumably a row index that was saved into the CSV — verify).
# errors="ignore" keeps this cell re-runnable: df_test is reassigned here, so a
# second run would otherwise raise KeyError because the column is already gone.
df_test = df_test.drop(columns=['Unnamed: 0'], errors="ignore")

In [None]:
# Re-check the columns after the drops; they should now line up with the training features
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 75 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   LotArea        1459 non-null   int64  
 1   LotFrontage    1232 non-null   float64
 2   TotalBsmtSF    1458 non-null   float64
 3   BedroomAbvGr   1459 non-null   int64  
 4   Fireplaces     1459 non-null   int64  
 5   PoolArea       1459 non-null   int64  
 6   GarageCars     1458 non-null   float64
 7   WoodDeckSF     1459 non-null   int64  
 8   ScreenPorch    1459 non-null   int64  
 9   MSZoning       1455 non-null   object 
 10  Condition1     1459 non-null   object 
 11  Heating        1459 non-null   object 
 12  Street         1459 non-null   object 
 13  CentralAir     1459 non-null   object 
 14  Foundation     1459 non-null   object 
 15  ExterQual      1459 non-null   object 
 16  ExterCond      1459 non-null   object 
 17  BsmtQual       1415 non-null   object 
 18  BsmtCond

In [None]:
# Predict the 'Expensive' label for every row of the prediction data,
# using the fitted grid search (preprocessing + Random Forest)
y_prediction = search.predict(df_test)

In [None]:
# Inspect the predicted labels
y_prediction

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Wrap the predicted labels in a one-column DataFrame named 'Expensive'
prediction_df = pd.DataFrame({'Expensive': y_prediction})

In [None]:
# Inspect the prediction table
prediction_df

Unnamed: 0,Expensive
0,0
1,0
2,0
3,0
4,0
...,...
1454,0
1455,0
1456,0
1457,0


In [None]:
# Save the predictions to RandomForest.csv in the current working directory.
# to_csv writes the row index as an unnamed first column by default.
# NOTE(review): confirm the expected submission layout (e.g. whether an 'Id'
# column is required instead of the positional index).
prediction_df.to_csv('RandomForest.csv')