<a href="https://colab.research.google.com/github/macorony/Python-ML-60-day-Journey/blob/main/House_Price.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Day9 (2022-01-31)
Machine Learning on house price

In [None]:
import numpy as np
import pandas as pd
from scipy import sparse # The most important part of scipy is sparse providing sparse matrices.
import matplotlib.pyplot as plt
%matplotlib inline  
# Render figures inline in the notebook rather than displaying only the figure object.
import missingno as msno
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid, StratifiedKFold
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

# Basic Data Exploration of training and test data combined
1. The input variables of the train and test data are concatenated.
2. The target variable of the train data is kept separate during data exploration.

In [None]:
# Read the competition's training and test CSV files into DataFrames.
train_csv = "../input/house-prices-advanced-regression-techniques/train.csv"
test_csv = "../input/house-prices-advanced-regression-techniques/test.csv"
df_train = pd.read_csv(train_csv)
X_test = pd.read_csv(test_csv)

In [None]:
# Split the training frame positionally: every column except the last one is
# an input feature, and the last column is the target.
X_train = df_train.iloc[:, :-1]
y_train = df_train.iloc[:, -1]

# Basic Data Exploration of training and test data combined
1. The input variables of the train and test data are concatenated.
2. The target variable of the train data is kept separate during data exploration.

In [None]:
# Check that the training features and the test data expose the same columns.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
# Index.equals gives a single True/False answer (same labels in the same order).
# An element-wise `==` would instead display one boolean per column and raise
# if the two frames had a different number of columns.
X_train.columns.equals(X_test.columns)

In [None]:
# Characterize the training data: shape, column names and dtypes.
print(f"Training data shape: \n{X_train.shape}")
print(f"\nFeature names: \n{X_train.columns}")
# X_train holds the input features, i.e. the independent variables; the
# dependent variable is the target stored in y_train.
print(f"\nData types of independent variables (features): \n{X_train.dtypes}")
print(f"\nData type of target variable: {y_train.dtype}")

In [None]:
# Preview the first rows of the training features and their summary statistics.
train_head = X_train.head()
train_summary = X_train.describe()
print("First five rows of data: \n", train_head)
print("Summary of training data: \n", train_summary)

In [None]:
# Characterize the test data: shape, column names and dtypes.
print(f"Test data shape: \n{X_test.shape}")
print(f"\nFeature names: \n{X_test.columns}")
# The test frame only contains input features, i.e. independent variables.
print(f"\nData types of independent variables (features): \n{X_test.dtypes}")

In [None]:
# Preview the first rows of the test data and its summary statistics.
test_head = X_test.head()
test_summary = X_test.describe()
print("First five rows of test data: \n", test_head)
print("Summary of test data: \n", test_summary)

In [None]:
# The features (independent variables) come in two kinds: numeric and categorical.
# Columns stored with the "object" dtype are treated as categorical; every
# other dtype is treated as numeric.
num_features = [col for col in X_train.columns if X_train[col].dtype != "object"]
cate_features = [col for col in X_train.columns if X_train[col].dtype == "object"]
print("There are {0} numeric features and {1} categorical features\n".format(len(num_features), len(cate_features)))
print("The numeric features:", num_features, '\n')
print("The categorical features:", cate_features)
# Some features have a numeric dtype but may actually behave as categorical
# variables; this needs to be explored further.

# Missing Values
1. Detect missing values
2. Handle missing values

In [None]:
# Stack the training features on top of the test data; ignore_index=True
# gives the combined frame a fresh index instead of keeping the original ones.
frames_to_stack = [X_train, X_test]
X_global = pd.concat(frames_to_stack, ignore_index=True)
print('Global data shape:', X_global.shape)
print(X_global.tail())

In [None]:
# Nullity matrix of the combined train+test frame.
msno.matrix(df=X_global)

### About missingno heatmap
1. Used to identify if there is a relationship in the presence of null values between each of the columns.
2. Values close to positive 1 or negative 1 indicate that the presence of null values in one column is either correlated or anti-correlated with the presence of null values in another column.
3. Values close to 0 indicate that there is little to no relationship between the presence of null values in one column compared to another.

In [None]:
# Use the training data to look at how missingness correlates between features.
# Some pairs show a nullity correlation of 1 or close to 1; these features
# describe the same part of the house:
#   - MasVnrType and MasVnrArea
#   - the Bsmt* feature group
#   - the Garage* feature group
msno.heatmap(df=X_train)

In [None]:
# Percentage of missing values per feature in the combined frame, sorted from
# the most to the least incomplete feature.
null_counts = X_global.isnull().sum().sort_values(ascending=False)
withNull = null_counts / len(X_global) * 100
# Keep only the features that actually have missing values.
null_features = withNull[withNull != 0]

# Bar chart of those features, drawn through the explicit Axes object.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(x=null_features.index, y=null_features.values, ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('Features', fontsize=15)
ax.set_ylabel('Percentage of missing values', fontsize=15)
ax.set_title('Features by percentage of missing values', fontsize=20)

### Day9 (2022-02-01)
Handling Missing information

In [None]:
# Display the percentage of missing values per feature, sorted in descending
# order (computed in the plotting cell above from the combined frame).
withNull

In [1]:
# Remove the features that are missing in more than 80% of the rows
# (PoolQC, MiscFeature, Alley and Fence) from the combined train+test frame.
# Selecting by threshold instead of by position (`index[0:4]`) keeps the
# selection tied to the stated rule rather than to the sort order.
MISSING_DROP_THRESHOLD = 80  # percent of rows with a missing value
to_remove = withNull[withNull > MISSING_DROP_THRESHOLD].index
# errors="ignore" makes the cell safe to re-run: columns that were already
# dropped by a previous execution are skipped instead of raising KeyError.
X_global = X_global.drop(columns=to_remove, errors="ignore")

In [1]:
# Go through the remaining features with missing values one by one.
# FireplaceQu is missing because some houses have no fireplace, so the gaps
# are filled with the placeholder "NF" (no fireplace).
# Note: with default notebook settings this value_counts() result is not
# rendered, because it is not the last expression of the cell.
X_global['FireplaceQu'].value_counts()
fireplace_filled = X_global['FireplaceQu'].fillna("NF")
X_global['FireplaceQu'] = fireplace_filled

In [1]:
# LotFrontage: linear feet of street connected to the property.
# According to the nullity correlation matrix, its missingness is independent
# of the other features, so the gaps are filled with the median value.
# Caveat: the median is taken over the combined train+test frame, which lets
# test-set information leak into the training data. The effect is likely
# small, but it is worth stating explicitly.
# Note: with default notebook settings this describe() result is not rendered,
# because it is not the last expression of the cell.
X_global['LotFrontage'].describe()
lot_frontage_median = X_global['LotFrontage'].median()
X_global['LotFrontage'] = X_global['LotFrontage'].fillna(lot_frontage_median)

In [1]:
# Garage feature group: GarageYrBlt, GarageCond, GarageType, GarageFinish, GarageQual.
# These five features share the same percentage of missing values, because
# some houses have no garage. The gaps are filled with "NG" (no garage).
# GarageYrBlt only receives "NG" as a temporary placeholder.
# NOTE(review): if GarageYrBlt is a numeric column, the string placeholder
# presumably turns it into a mixed-type column - confirm it is handled later.
garage_features = [
    "GarageYrBlt",
    "GarageCond",
    "GarageType",
    "GarageFinish",
    "GarageQual",
]
garage_filled = X_global[garage_features].fillna('NG')
X_global[garage_features] = garage_filled

In [1]:
# Basement feature group, handled like the garage feature group:
# the values are mostly missing because the house has no basement,
# so the gaps are filled with "NB" (no basement).
basement_features = [
    "BsmtFinType2",
    "BsmtExposure",
    "BsmtQual",
    "BsmtCond",
    "BsmtFinType1",
]
basement_filled = X_global[basement_features].fillna('NB')
X_global[basement_features] = basement_filled

In [1]:
# Masonry feature group: MasVnrType and MasVnrArea have the same percentage
# of missing values, because some houses have no masonry veneer.
masonry_features = ["MasVnrType", "MasVnrArea"]
# MasVnrType is a label, so its gaps get the placeholder "NM" (no masonry).
X_global['MasVnrType'] = X_global['MasVnrType'].fillna('NM')
# MasVnrArea is an area measurement (assumed numeric), so "no masonry" is
# encoded as an area of 0. Filling it with the string "NM" would mix text into
# a numeric feature and break numeric processing of that column.
X_global['MasVnrArea'] = X_global['MasVnrArea'].fillna(0)

In [1]:
# Electrical has only one missing value: fill it with the most frequent
# category. value_counts() sorts by frequency in descending order by default,
# so idxmax() returns the label of its first (most frequent) entry.
electrical_most = X_global['Electrical'].value_counts().idxmax()
X_global['Electrical'] = X_global['Electrical'].fillna(electrical_most)

In [1]:
# MSZoning is missing only in the test set: fill the gaps with the most
# frequent category of the combined frame.
mszoning_counts = X_global['MSZoning'].value_counts(ascending=False)
MSZoning_most = mszoning_counts.index[0]
X_global['MSZoning'] = X_global['MSZoning'].fillna(MSZoning_most)

In [1]:
# BsmtHalfBath and BsmtFullBath: fill the gaps with the most frequent value
# of each feature. value_counts() sorts by frequency in descending order by
# default, so idxmax() returns the label of its first entry.
BsmtHalfBath_most = X_global['BsmtHalfBath'].value_counts().idxmax()
BsmtFullBath_most = X_global['BsmtFullBath'].value_counts().idxmax()

X_global['BsmtHalfBath'] = X_global['BsmtHalfBath'].fillna(BsmtHalfBath_most)
X_global['BsmtFullBath'] = X_global['BsmtFullBath'].fillna(BsmtFullBath_most)

In [None]:
# Functional feature: fill the gaps with the most frequent category.
functional_counts = X_global['Functional'].value_counts(ascending=False)
Functional_most = functional_counts.index[0]
X_global['Functional'] = X_global['Functional'].fillna(Functional_most)

In [None]:
# Utilities feature: fill the gaps with the most frequent category.
# value_counts() sorts by frequency in descending order by default, so
# idxmax() returns the label of its first entry.
Utilities_most = X_global['Utilities'].value_counts().idxmax()
X_global['Utilities'] = X_global['Utilities'].fillna(Utilities_most)

In [None]:
# Fill every remaining feature that still has missing values, in two groups.

# Group 1: Exterior1st, Exterior2nd, KitchenQual, SaleType are filled with
# their most frequent value.
mode_filled_features = ['Exterior1st', 'Exterior2nd', 'KitchenQual', 'SaleType']
for feature in mode_filled_features:
    feature_counts = X_global[feature].value_counts(ascending=False)
    feature_most = feature_counts.index[0]
    X_global[feature] = X_global[feature].fillna(feature_most)

# Group 2: BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, GarageCars,
# GarageArea are filled with their median over the combined frame.
median_filled_features = [
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'GarageCars', 'GarageArea',
]
for feature in median_filled_features:
    feature_median = X_global[feature].median()
    X_global[feature] = X_global[feature].fillna(feature_median)

In [None]:
# Re-check: count the cells that are still null in X_global. X_global is the
# concatenation of the training features and the test data, so the message
# refers to the combined frame rather than to the training data alone.
remaining_nulls = int(X_global.isnull().sum().sum())
print(f"After handling the missing values, there are {remaining_nulls} missing values left in the combined train/test data.")