# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import norm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.tree import DecisionTreeRegressor

# Load the training and test sets from their CSV files (train first, then test)
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Keep both frames together in a single list
list_df_full = [df_train, df_test]

# Scatter plot of GrLivArea (x) against SalePrice (y) on the raw training data
var = 'GrLivArea'
# Two-column frame holding just the target and the plotted feature
data = df_train[['SalePrice', var]]
plt.figure(figsize=(8, 6))
a = sns.scatterplot(x=var, y='SalePrice', data=data)
# Fix the y-axis range to 0..800000
a.set_ylim((0, 800000))

# Dealing with outliers
# Standardize SalePrice (zero mean, unit variance) to inspect its extremes.
# StandardScaler needs 2-D input, so select a one-column DataFrame; indexing
# the Series with [:, np.newaxis] is multi-dimensional Series indexing, which
# current pandas no longer supports.
saleprice_scaled = StandardScaler().fit_transform(df_train[['SalePrice']])
# Sort the standardized prices once, then take the ten lowest and ten highest.
saleprice_sorted = saleprice_scaled[saleprice_scaled[:, 0].argsort()]
low_range = saleprice_sorted[:10]
high_range = saleprice_sorted[-10:]

# Deleting points
# Inspection only: the two rows with the largest GrLivArea (result is not stored).
df_train.sort_values(by='GrLivArea', ascending=False)[:2]
# Remove the rows with Id 1299 and Id 524 -- presumably the two rows found
# above; confirm against the data.
df_train2 = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train3 = df_train2.drop(df_train2[df_train2['Id'] == 524].index)

# Same GrLivArea / SalePrice scatter plot, now on the outlier-free frame
var = 'GrLivArea'
# Target and plotted feature side by side
data = df_train3.loc[:, ['SalePrice', var]]
plt.figure(figsize=(8, 6))
a = sns.scatterplot(y='SalePrice', x=var, data=data)
# Same fixed y-axis range as the first scatter plot
a.set_ylim((0, 800000))

# Correlation matrix
# Restrict to numeric columns explicitly: DataFrame.corr() raises on string
# columns in pandas >= 2.0, while older versions silently skipped them.
corrmat = df_train3.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True)

# SalePrice correlation matrix
k = 15  # number of variables for heatmap (SalePrice itself is one of them)
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cormat2 = df_train3[cols].corr()
# Set the font scale before drawing so it applies to this heatmap as well as
# to the plots that follow.
sns.set(font_scale=1.3)
plt.figure(figsize=(10, 6))
sns.heatmap(cormat2, annot=True, square=True, annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)

# Pairwise scatter plots of the k variables most correlated with SalePrice
sns.pairplot(df_train3[cols])

# Missing data in the training set (after outlier removal):
# per-column null count and fraction of rows, largest first.
total = df_train3.isnull().sum().sort_values(ascending=False)
percent = total / len(df_train3)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(30)

# Missing data in the test set
total2 = df_test.isnull().sum().sort_values(ascending=False)
# Divide by the test set's own row count; using len(df_train) here gave
# fractions that did not match the counts in the 'Total' column.
percent2 = total2 / len(df_test)
missing_data2 = pd.concat([total2, percent2], axis=1, keys=['Total', 'Percent'])
missing_data2.head(40)

# Dealing with missing data
# Stack the outlier-free training rows on top of the test rows, then drop
# every column that has more than one missing value in the test set.
df_full = pd.concat([df_train3, df_test])
sparse_cols = missing_data2.loc[missing_data2['Total'] > 1].index
df_full2 = df_full.drop(sparse_cols, axis=1)
# Show which columns have exactly one missing value (train, then test)
print(missing_data.loc[missing_data['Total'] == 1].index)
print(missing_data2.loc[missing_data2['Total'] == 1].index)

# Fill the columns that have a single missing value (per the lists printed above).
# These columns are filled with their column mean ...
for col in ('BsmtFinSF2', 'BsmtFinSF1', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea', 'GarageCars'):
    df_full2[col] = df_full2[col].fillna(df_full2[col].mean())
# ... and these with their most frequent value (first mode).
for col in ('Exterior2nd', 'Exterior1st', 'SaleType', 'KitchenQual', 'Electrical'):
    df_full2[col] = df_full2[col].fillna(df_full2[col].mode()[0])

# Check the distribution graphs
# For SalePrice, 1stFlrSF, GrLivArea and TotalBsmtSF draw a histogram with a
# fitted normal curve and then a normal probability plot, each on its own figure.
# `norm` and `stats` come from scipy (imported at the top of the file).
# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept here
# because it is what the file already relies on for the fit=norm overlay.
for col in ('SalePrice', '1stFlrSF', 'GrLivArea', 'TotalBsmtSF'):
    # Open a fresh figure first so the histogram never lands on the axes of
    # whatever plot was drawn last.
    fig = plt.figure()
    sns.distplot(df_train3[col], fit=norm)
    # Separate figure for the normal probability (Q-Q style) plot
    fig = plt.figure()
    res = stats.probplot(df_train3[col], plot=plt)

# Make categorical features numerical (one-hot encoding via get_dummies) and
# drop the Id column, which is an identifier rather than a feature.
df_full3 = pd.get_dummies(df_full2).drop('Id', axis=1)

# Get the DataFrames for training and prediction.
# df_full stacks the df_train3 rows first and the df_test rows after them, so
# the boundary is len(df_train3); deriving it avoids a hard-coded row count
# that silently breaks if the outlier removal changes.
n_train = len(df_train3)
df_train_final = df_full3.iloc[:n_train, :]
df_test_final = df_full3.iloc[n_train:, :]

# Choosing the best machine learning model
# SalePrice is a continuous target, so this is a regression problem: compare
# regressors (LinearRegression instead of the LogisticRegression classifier)
# and score them with R^2, which is what a regressor's .score() returns,
# instead of classification accuracy.
# The variable names (classifiers, acc, df_class_acc) are kept unchanged.
classifiers = [DecisionTreeRegressor(random_state=0), LinearRegression()]
X = df_train_final.drop('SalePrice', axis=1)
y = df_train_final['SalePrice']
# Standardize the features, then hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(scale(X), y, test_size=.2, random_state=0)

# Fit each model on the training split and record its held-out R^2
acc = []
for model in classifiers:
    model.fit(X_train, y_train)
    acc.append(model.score(X_test, y_test))

# One row per model; labels are taken from the estimator class names
data = {'Classifier': [type(model).__name__ for model in classifiers], 'R2': acc}
df_class_acc = pd.DataFrame(data)

# Make a plot
sns.barplot(x='Classifier', y='R2', data=df_class_acc)
plt.show()

# Hyperparameter tuning
# Criterion names follow current scikit-learn ('squared_error' and
# 'absolute_error' replaced the removed 'mse' and 'mae' aliases).
# random_state is fixed because max_features below the feature count makes
# the tree sample features randomly, so unseeded runs are not repeatable.
tree = DecisionTreeRegressor(random_state=0)
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'max_depth': list(range(1, 11)),
    'min_samples_split': list(range(2, 11)),
    'max_features': list(range(1, 11)),
    'min_samples_leaf': list(range(1, 11)),
}
# NOTE: 3 * 10 * 9 * 10 * 10 = 27,000 combinations, each fitted once per CV
# fold; n_jobs=-1 spreads the fits over all available cores.
gscv = GridSearchCV(tree, param_grid=param_grid, n_jobs=-1)
gscv.fit(X_train, y_train)
print(gscv.best_params_)

# Use the model built from best_params_: GridSearchCV refits the best
# parameter combination on the whole of X_train by default, so take that
# estimator instead of copying parameter values in by hand.
tree = gscv.best_estimator_
# R^2 on the held-out split
print(tree.score(X_test, y_test))

# Predict the SalePrice
# Standardize the test features with the TRAINING features' mean and std.
# The model was trained on scale(X), i.e. X standardized with its own
# statistics; calling scale() on the test frame would use the test set's
# statistics instead and shift every feature relative to the tree's learned
# split thresholds.
feature_scaler = StandardScaler().fit(df_train_final.drop('SalePrice', axis=1))
X_test_final = feature_scaler.transform(df_test_final.drop('SalePrice', axis=1))
y_pred = tree.predict(X_test_final)

# Make DataFrame for submission: the first column of the test set (the Id
# column -- assumed to be first, as the original positional drop implied)
# plus the predicted SalePrice.
submission = df_test.iloc[:, [0]].copy()
submission['SalePrice'] = y_pred

# Save to csv
submission.to_csv('submissionHouse4.csv', index=False)