# <font color='#9575cd'>1. Importing Packages and Data</font> 

In [31]:
import os
import gc
import warnings

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

warnings.filterwarnings('ignore')
%matplotlib inline
sns.set_style('whitegrid')

In [2]:
# Directory that holds the competition CSV files. Set the HOUSE_PRICE_DATA_DIR
# environment variable to point somewhere else; the original local path is
# kept as the default so existing setups are unaffected.
folderPath = os.environ.get('HOUSE_PRICE_DATA_DIR', 'D:/Competitions/HousePriceCompetition')

In [3]:
# File name of the training CSV; joined with folderPath when the data is read.
fileName = 'train.csv'

In [95]:
# Read the training CSV, using its 'Id' column as the row index.
train_csv_path = os.path.join(folderPath, fileName)
train = pd.read_csv(train_csv_path, index_col='Id')

In [83]:
# Read the test CSV from the same folder, also indexed by 'Id'.
test_csv_path = os.path.join(folderPath, 'test.csv')
test = pd.read_csv(test_csv_path, index_col='Id')

In [21]:
# Number of rows in the training frame.
len(train)

1460

In [84]:
# Discard rows that have no target value.
train = train.dropna(subset=['SalePrice'])

# <font color='#9575cd'>2. Data Exploration and Manipulation</font>

### <font color="#00b8d4">2.1 Missing value treatment</font>

In [None]:
# Keep only the non-object columns. Note that this rebinds `train`.
train = train.select_dtypes(exclude=['object'])

In [None]:
# Keep only the non-object columns. Note that this rebinds `test`.
test = test.select_dtypes(exclude=['object'])

In [15]:
# Count missing values per column of the training frame. `X_train` is only
# created by the train/validation split further down, so referencing it at
# this point fails on a fresh kernel; `train` is what is available here.
missing_val_count_by_column = train.isnull().sum()

In [26]:
# Show only the columns that contain at least one missing value.
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

LotFrontage    259
MasVnrArea       8
GarageYrBlt     81
dtype: int64


In [29]:
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, X_valid : feature tables used for fitting and for evaluation.
    y_train, y_valid : targets matching X_train and X_valid.
    n_estimators : number of trees in the forest (default 100, the value
        that was previously hard-coded).
    random_state : seed passed to the forest (default 0, the value that was
        previously hard-coded).

    Returns
    -------
    The mean absolute error of the predictions for X_valid against y_valid.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [34]:
# Separate the predictors from the target column.
X = train.drop(columns=['SalePrice'])
y = train['SalePrice']

In [35]:
# 80/20 train/validation split with a fixed seed.
train_valid_split = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_valid_split

###### Drop Columns with missing values

In [37]:
# Names of the training columns that contain at least one missing value.
col_with_missing = X_train.columns[X_train.isnull().any()].tolist()

In [38]:
# Approach: drop every column that has missing values in the training split.
reduced_X_train = X_train.drop(columns=col_with_missing)
reduced_X_valid = X_valid.drop(columns=col_with_missing)

In [39]:
# Validation MAE after dropping the columns with missing values.
mae_drop_missing = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(mae_drop_missing)

17837.82570776256


###### Simple Imputer

In [54]:
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and
# removed in 0.22; `sklearn.impute.SimpleImputer` is its replacement. Import
# it under the old name so the imputation cells keep working unchanged, and
# fall back to the legacy class on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:  # scikit-learn older than 0.20
    from sklearn.preprocessing import Imputer

In [59]:
# Imputation with the imputer's default strategy: fit on the training split
# only, then apply the fitted imputer to both splits. The imputer returns
# plain arrays, so index and columns are passed explicitly to keep the frames
# aligned with y_train / y_valid and to preserve the feature names.
my_imputer = Imputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               index=X_train.index, columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               index=X_valid.index, columns=X_valid.columns)

In [60]:
# Put the original feature names back on the imputed frames.
imputed_X_train.columns, imputed_X_valid.columns = X_train.columns, X_valid.columns

In [61]:
# Validation MAE for the imputed frames.
mae_imputed = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
print(mae_imputed)

18062.894611872147


In [79]:
# Median imputation: fit on the training split only, then apply the fitted
# imputer to both splits. The imputer returns plain arrays, so index and
# columns are passed explicitly to keep the frames aligned with
# y_train / y_valid and to preserve the feature names.
my_imputer = Imputer(strategy='median')
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train),
                               index=X_train.index, columns=X_train.columns)
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid),
                               index=X_valid.index, columns=X_valid.columns)

In [80]:
# Put the original feature names back on the imputed frames.
imputed_X_train.columns, imputed_X_valid.columns = X_train.columns, X_valid.columns

In [81]:
# Validation MAE for the imputed frames.
mae_imputed = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
print(mae_imputed)

17791.59899543379


###### An Extension of Imputation

In [62]:
# Work on copies so the original splits stay untouched.
X_train_plus, X_valid_plus = X_train.copy(), X_valid.copy()

In [63]:
# For each column with missing values, add a boolean '<col>_missing' column
# marking the rows where the value was absent.
for col in col_with_missing:
    flag = col + '_missing'
    X_train_plus[flag] = X_train_plus[col].isna()
    X_valid_plus[flag] = X_valid_plus[col].isna()

In [64]:
# Impute the frames that carry the extra '_missing' indicator columns: fit on
# the training split only, then transform both. The imputer returns plain
# arrays, so index and columns are passed explicitly; without them the
# resulting frames lose their feature names and row labels.
my_imputer = Imputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus),
                                    index=X_train_plus.index, columns=X_train_plus.columns)
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus),
                                    index=X_valid_plus.index, columns=X_valid_plus.columns)

In [65]:
# Validation MAE for imputation plus missing-value indicator columns.
mae_imputed_plus = score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid)
print(mae_imputed_plus)

18148.417180365297


###### Other Imputations

In [66]:
# Work on copies so the original splits stay untouched.
X_train_zero, X_valid_zero = X_train.copy(), X_valid.copy()

In [67]:
# Replace missing values with 0 in every affected column. The filled column
# is assigned back to the frame: calling fillna(..., inplace=True) on a
# column selection is chained assignment and is not guaranteed to modify the
# parent frame (it does not under pandas copy-on-write).
for col in col_with_missing:
    X_train_zero[col] = X_train_zero[col].fillna(0)
    X_valid_zero[col] = X_valid_zero[col].fillna(0)

In [68]:
# Validation MAE for zero-filled missing values.
mae_zero_fill = score_dataset(X_train_zero, X_valid_zero, y_train, y_valid)
print(mae_zero_fill)

18017.665970319635


In [69]:
# The zero-filled copies are no longer needed.
del X_train_zero, X_valid_zero

In [75]:
# Work on copies so the original splits stay untouched.
X_train_median, X_valid_median = X_train.copy(), X_valid.copy()

In [76]:
# Fill missing values with the training-split median of each affected column
# (the same median is applied to the validation split). The filled column is
# assigned back to the frame: calling fillna(..., inplace=True) on a column
# selection is chained assignment and is not guaranteed to modify the parent
# frame (it does not under pandas copy-on-write).
# NOTE(review): the score recorded for this approach equals the recorded
# mean-imputation score, so that output may be stale - re-run to confirm.
for col in col_with_missing:
    med = X_train[col].median()
    X_train_median[col] = X_train_median[col].fillna(med)
    X_valid_median[col] = X_valid_median[col].fillna(med)

In [77]:
# Validation MAE for median-filled missing values.
mae_median_fill = score_dataset(X_train_median, X_valid_median, y_train, y_valid)
print(mae_median_fill)

18062.894611872147


In [78]:
# The median-filled copies are no longer needed.
del X_train_median, X_valid_median

### <font color="#00b8d4">2.2 Outlier Detection and treatment</font>

### <font color="#00b8d4">2.3 Oversampling the minority class</font>

### <font color="#00b8d4">2.4 Data type assignment</font>

### <font color="#00b8d4">2.5 Label Exploration</font>

# <font color='#9575cd'>3. Feature Engineering</font>

### <font color="#006064">3.1. Removing Redundant Columns</font>

# <font color='#9575cd'>4. Machine Learning Modelling</font>

### <font color="#006064">4.1. Data Pre-processing</font>

##### <font color="#0097a7">4.1.1 Imputation</font>

##### <font color="#0097a7">4.1.2 Labelling of Categorical variables</font>

In [98]:
# The missing-value section rebinds `train` and `test` to frames without
# object columns, so on a top-to-bottom run there would be no categorical
# variables left to encode here. Re-read the raw CSVs for this section
# (dropping rows without a target, as was done after the initial load).
train_raw = pd.read_csv(os.path.join(folderPath, fileName), index_col='Id')
train_raw = train_raw.dropna(subset=['SalePrice'])
test_raw = pd.read_csv(os.path.join(folderPath, 'test.csv'), index_col='Id')

X = train_raw.copy()
X_test = test_raw.copy()
y = train_raw.SalePrice

In [99]:
# Remove the target column from the predictors.
X = X.drop(columns=['SalePrice'])

In [100]:
# Drop every column that has a missing value in X, from both X and X_test.
col_with_missing = X.columns[X.isnull().any()].tolist()
X = X.drop(columns=col_with_missing)
X_test = X_test.drop(columns=col_with_missing)

In [101]:
# 80/20 train/validation split with a fixed seed.
train_valid_split = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
X_train, X_valid, y_train, y_valid = train_valid_split

In [113]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=100, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, X_valid : feature tables used for fitting and for evaluation.
    y_train, y_valid : targets matching X_train and X_valid.
    n_estimators : number of trees in the forest (default 100, the value
        that was previously hard-coded).
    random_state : seed passed to the forest (default 0, the value that was
        previously hard-coded).

    Returns
    -------
    The mean absolute error of the predictions for X_valid against y_valid.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

###### Dropping categorical variables

In [108]:
# Labels of the columns whose dtype is 'object'.
is_object = (X_train.dtypes == 'object').values
object_cols = X_train.columns[is_object]

In [109]:
# Approach 1: keep only the non-object columns.
drop_X_train = X_train.select_dtypes(exclude=['object'])
drop_X_valid = X_valid.select_dtypes(exclude=['object'])

In [114]:
# Report the validation MAE with the categorical columns removed.
mae_drop_categorical = score_dataset(drop_X_train, drop_X_valid, y_train, y_valid)
print("MAE from Approach 1 (Drop categorical variables):")
print(mae_drop_categorical)

MAE from Approach 1 (Drop categorical variables):
17837.82570776256


###### Label Encoding

In [115]:
from sklearn.preprocessing import LabelEncoder

In [117]:
# Work on copies so the original splits stay untouched.
label_X_train, label_X_valid = X_train.copy(), X_valid.copy()

In [119]:
# A categorical column is "good" when the training and validation splits
# contain exactly the same set of values; every other object column is "bad".
good_label_columns = [col for col in object_cols if set(X_train[col]) == set(X_valid[col])]
# Derive the bad columns in column order rather than through a set
# difference, whose iteration order (and therefore the printed list) is not
# stable between runs.
bad_columns = [col for col in object_cols if col not in good_label_columns]
print(good_label_columns)
print(bad_columns)

['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'HouseStyle', 'ExterQual', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleCondition']
['Utilities', 'HeatingQC', 'Functional', 'Foundation', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Condition2', 'Neighborhood', 'ExterCond', 'Exterior2nd', 'LandSlope', 'SaleType', 'Condition1', 'Heating']


In [121]:
# Remove the categorical columns that will not be label-encoded.
label_X_train = label_X_train.drop(columns=bad_columns)
label_X_valid = label_X_valid.drop(columns=bad_columns)

In [122]:
# Approach 2: integer-encode each retained categorical column. The encoder is
# re-fitted on the training values of every column and then applied to both
# splits of that column.
label_encoder = LabelEncoder()
for col in good_label_columns:
    label_encoder.fit(X_train[col])
    label_X_train[col] = label_encoder.transform(X_train[col])
    label_X_valid[col] = label_encoder.transform(X_valid[col])

mae_label_encoding = score_dataset(label_X_train, label_X_valid, y_train, y_valid)
print("MAE from Approach 2 (Label Encoding):")
print(mae_label_encoding)

MAE from Approach 2 (Label Encoding):
17575.291883561644


##### <font color="#0097a7">4.1.3 One Hot Encoder</font>

##### <font color="#0097a7">4.1.4 Scaling</font>

### <font color="#006064">4.2 Creating Baseline Model </font>

### <font color="#006064">4.3 Model Selection</font>

### <font color="#006064">4.4 Model Optimization</font>

### <font color="#006064">4.5 Implementation</font>

# <font color='#9575cd'>5. Investigating Predictions/Evaluation</font>

# <font color='#9575cd'>6. Submission</font>