# In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

# Load the training and test tables from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Keep both frames in one list so shared cleaning steps can loop over them.
df_full = [df_train, df_test]

# Inspect the NaN values in df_train and df_test.
from IPython.display import display

# Show every row of the per-column summaries instead of a truncated view.
pd.options.display.max_rows = None
display(df_train.isnull().sum())
display(df_test.isnull().sum())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line.
df_train.info()
# Draw the missing-value map on its own figure and show it immediately;
# seaborn plots onto the current axes, so an unshown heatmap would otherwise
# share its axes with the next plot.
plt.figure()
sns.heatmap(df_train.isnull())
plt.show()

# Remove the Id column and the features that have many NaN values.
drop_features = ['Id', 'Alley', 'PoolQC', 'Fence', 'MiscFeature']
for frame in (df_train, df_test):
    # Drop in place so the very objects held in df_full are the ones modified.
    frame.drop(columns=drop_features, inplace=True)

# Replace the NaN values.
# Columns whose gaps are filled with the most frequent value of the column.
mode_filled = [
    'MSZoning', 'MasVnrType', 'MasVnrArea',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'Utilities', 'Exterior1st', 'Exterior2nd',
    'KitchenQual', 'Functional', 'SaleType',
]
# Columns whose gaps are filled with the arithmetic mean of the column.
mean_filled = [
    'LotFrontage', 'GarageYrBlt',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    'BsmtFullBath', 'BsmtHalfBath',
    'GarageCars', 'GarageArea',
]

# Each frame is imputed with statistics computed from that same frame.
for frame in df_full:
    for column in mode_filled:
        frame[column] = frame[column].fillna(frame[column].mode()[0])
    for column in mean_filled:
        frame[column] = frame[column].fillna(frame[column].mean())

# Print the per-column NaN counts of both frames again after imputation.
for frame in (df_train, df_test):
    print(frame.isnull().sum())

# Stack the train rows on top of the test rows, then one-hot encode the
# combined frame so both parts end up with the same dummy columns.
df_final = pd.get_dummies(pd.concat([df_train, df_test]))

# Choosing the best machine learning model
# Split the encoded frame back into its train and test parts by position; the
# training rows come first because df_train was concatenated first. Using
# len(df_train) avoids hard-coding the row count of one particular train.csv.
n_train_rows = len(df_train)
df_train_final = df_final.iloc[:n_train_rows, :]
df_test_final = df_final.iloc[n_train_rows:, :]
# SalePrice is a continuous target, so every candidate must be a regressor
# (LogisticRegression would treat each distinct price as a separate class).
classifiers = [DecisionTreeRegressor(), LinearRegression()]
X = df_train_final.drop('SalePrice', axis=1)
y = df_train_final['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=0)

# Score each candidate on the held-out split. A regressor's score() is R^2;
# accuracy_score only counts exact matches, which is meaningless for prices.
acc = []
for cls in classifiers:
    cls.fit(X_train, y_train)
    acc.append(cls.score(X_test, y_test))

# Column names are kept as before; the 'Accuracy' column now holds R^2 values.
data = {'Classifier': [type(cls).__name__ for cls in classifiers], 'Accuracy': acc}
df_class_acc = pd.DataFrame(data)

# Plot the held-out score of each model on a fresh figure so the bars never
# share axes with an earlier plot.
plt.figure()
ax = sns.barplot(x='Classifier', y='Accuracy', data=df_class_acc)
ax.set_xlabel('Model')
ax.set_ylabel('R^2 on held-out split')
plt.show()

# Hyperparameter Tuning
tree = DecisionTreeRegressor()
# 'mse' and 'mae' were renamed to 'squared_error' and 'absolute_error'
# (deprecated in scikit-learn 1.0, removed in 1.2); the new names require
# scikit-learn >= 1.0.
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'max_depth': list(range(1, 11)),
    'min_samples_split': list(range(2, 11)),
    'max_features': list(range(1, 11)),
    'min_samples_leaf': list(range(1, 11)),
}
# Exhaustive search over 3 * 10 * 9 * 10 * 10 = 27,000 combinations, each of
# them cross-validated, so this step is slow.
gscv = GridSearchCV(tree, param_grid=param_grid)
gscv.fit(X_train, y_train)
# Print the winning combination: a bare expression in the middle of a
# cell or script displays nothing.
print(gscv.best_params_)

# Make machine learning model based on best_params_
# NOTE(review): these values are hard-coded, presumably copied from an
# earlier grid-search run; they are not read from gscv at run time.
tree = DecisionTreeRegressor(
    criterion='friedman_mse',
    max_depth=10,
    max_features=9,
    min_samples_leaf=2,
    min_samples_split=4,
)
tree.fit(X_train, y_train)
# Report R^2 on the held-out split; as a bare expression the value was
# computed and then discarded.
print(tree.score(X_test, y_test))

# Predict SalePrice for the test rows; the SalePrice column is removed first
# so that only feature columns are passed to the model.
test_features = df_test_final.drop(columns='SalePrice')
y_pred = tree.predict(test_features)

# Make DataFrame for submission
# Re-read test.csv to get the Id column back (it was dropped from df_test
# during cleaning).
submission = pd.read_csv('test.csv')
submission['SalePrice'] = y_pred
# Select the two output columns by name instead of dropping a hard-coded
# positional slice, which only works for one specific column count.
submission = submission[['Id', 'SalePrice']]

# Save to csv
submission.to_csv('submissionHouse1.csv', index=False)