In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, Ridge, Lasso
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, r2_score
from scipy.stats import kstest
from scipy.stats import norm
import warnings
warnings.filterwarnings("ignore")

In [16]:
data=pd.read_csv("train.csv")

## Create One Hot Encoded Variable

In [17]:
#nominal variables are suited for one-hot encoding.  Garage Finish was choosen because of it correlation to SalePrice
#greater than 50%
GarageFinishDummyVar=pd.get_dummies(pd.DataFrame(data["GarageFinish"]), columns=['GarageFinish'], prefix='GarageFinish')

## Create Polynomial Variable

In [19]:
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(np.array(data["GrLivArea"]).reshape(-1, 1))

In [20]:
poly1=[]
poly2=[]
for i in X_poly:
    poly1.append(i[0])
    poly2.append(i[1])

In [21]:
data=pd.concat([data, GarageFinishDummyVar, pd.Series(poly1, name="GrLivArea1"), pd.Series(poly2, name="GrLivArea2")], axis=1)

In [22]:
y=data["SalePrice"]
X=data.drop(["Id", "SalePrice", "GarageFinish", "GrLivArea"], axis=1)

## Label Categorical Variables

In [23]:
X_dict={}
for i in X.columns:
    labeler=LabelEncoder()
    X_dict[i]=labeler.fit_transform(data[i])

In [24]:
X_from_labeler=pd.DataFrame.from_dict(X_dict)

## Normality Test (Kolmogorov-Smirnov Test)

In [25]:
for i in X_from_labeler.columns:
    stat, p = kstest(X_from_labeler[i], 'norm', args=(X_from_labeler[i].mean(), X_from_labeler[i].std()))
    print(f'Statistic={stat:.3f}, p-value={p:.3f}')

Statistic=0.209, p-value=0.000
Statistic=0.420, p-value=0.000
Statistic=0.152, p-value=0.000
Statistic=0.061, p-value=0.000
Statistic=0.521, p-value=0.000
Statistic=0.534, p-value=0.000
Statistic=0.407, p-value=0.000
Statistic=0.521, p-value=0.000
Statistic=0.510, p-value=0.000
Statistic=0.448, p-value=0.000
Statistic=0.536, p-value=0.000
Statistic=0.103, p-value=0.000
Statistic=0.466, p-value=0.000
Statistic=0.508, p-value=0.000
Statistic=0.495, p-value=0.000
Statistic=0.319, p-value=0.000
Statistic=0.155, p-value=0.000
Statistic=0.320, p-value=0.000
Statistic=0.124, p-value=0.000
Statistic=0.175, p-value=0.000
Statistic=0.479, p-value=0.000
Statistic=0.533, p-value=0.000
Statistic=0.283, p-value=0.000
Statistic=0.280, p-value=0.000
Statistic=0.380, p-value=0.000
Statistic=0.333, p-value=0.000
Statistic=0.367, p-value=0.000
Statistic=0.520, p-value=0.000
Statistic=0.264, p-value=0.000
Statistic=0.260, p-value=0.000
Statistic=0.515, p-value=0.000
Statistic=0.402, p-value=0.000
Statisti

## Correlation Analysis (Feature Selection)

In [26]:
correlation_plot=pd.concat([X_from_labeler, y], axis=1).corr()

## Check for Muilticollinearity (80% Correlation or Greater)

In [27]:
correlation_plot[(correlation_plot.SalePrice>.8) | (correlation_plot.SalePrice<-.8)]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MoSold,YrSold,SaleType,SaleCondition,GarageFinish_Fin,GarageFinish_RFn,GarageFinish_Unf,GrLivArea1,GrLivArea2,SalePrice
SalePrice,-0.054751,-0.166872,0.213821,0.454564,0.041036,0.139868,-0.25558,0.015453,-0.014314,-0.067396,...,0.046432,-0.028923,-0.054911,0.213092,0.419678,0.169792,-0.410608,0.689795,0.689795,1.0


In [30]:
feature_list=correlation_plot[(correlation_plot.SalePrice>.5) | (correlation_plot.SalePrice<-.5)].reset_index().iloc[:,0]
print(list(feature_list))

['OverallQual', 'YearBuilt', 'YearRemodAdd', 'ExterQual', 'BsmtQual', 'TotalBsmtSF', '1stFlrSF', 'FullBath', 'KitchenQual', 'TotRmsAbvGrd', 'GarageCars', 'GarageArea', 'GrLivArea1', 'GrLivArea2', 'SalePrice']


In [31]:
updated={}
for i in list(feature_list)[:-1]:
    updated[i]=X_from_labeler[i]

In [32]:
updated_dataset=pd.DataFrame.from_dict(updated)

##  Prepare Test Dataset

In [80]:
test=pd.read_csv("test.csv")

In [85]:
testX_poly = poly.fit_transform(np.array(test["GrLivArea"]).reshape(-1, 1))

In [86]:
testpoly1=[]
testpoly2=[]
for i in testX_poly:
    testpoly1.append(i[0])
    testpoly2.append(i[1])

In [87]:
testdata=pd.concat([test, pd.Series(testpoly1, name="GrLivArea1"), pd.Series(testpoly2, name="GrLivArea2")], axis=1)

In [88]:
test_updated={}
for i in list(feature_list)[:-1]:
    test_updated[i]=testdata[i]

test_updated_dataset=pd.DataFrame.from_dict(test_updated)

In [89]:
test_X_dict={}
for i in test_updated_dataset.columns:
    labeler=LabelEncoder()
    test_X_dict[i]=labeler.fit_transform(test_updated_dataset[i])

In [90]:
test_updated_dataset=pd.DataFrame.from_dict(test_X_dict)

## Standardize Data

In [92]:
standardizer=StandardScaler()
standar_dta=standardizer.fit_transform(updated_dataset)
teststandar_dta=standardizer.fit_transform(test_updated_dataset)

In [93]:
standardized_dataset=pd.DataFrame(standar_dta)
teststandardized_dataset=pd.DataFrame(teststandar_dta)

In [94]:
standardized_dataset.columns=updated_dataset.columns
teststandardized_dataset.columns=test_updated_dataset.columns

## Logistic Regression Model

In [71]:
lr=LogisticRegression()

In [72]:
lr.fit(standardized_dataset, y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [73]:
n_scores = cross_val_score(lr, standardized_dataset, y, scoring=make_scorer(r2_score), cv=cv, n_jobs=-1)
print("Mean R²:", np.mean(n_scores))

Mean R²: 0.6259105030083458


In [97]:
lr_predictions=lr.predict(teststandardized_dataset)

In [101]:
pd.concat([pd.Series(test["Id"], name="Id"), pd.Series(np.maximum(lr_predictions, 0), name="SalePrice")], axis=1).to_csv("HousePrice_Submission1.csv", index=False)

## Ridge Regression Model

In [74]:
ridge=Ridge()

In [75]:
ridge.fit(standardized_dataset, y)

0,1,2
,alpha,1.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [76]:
n_scores = cross_val_score(ridge, standardized_dataset, y, scoring=make_scorer(r2_score), cv=cv, n_jobs=-1)
print("Mean R²:", np.mean(n_scores))

Mean R²: 0.7894089242987673


In [102]:
ridge_predictions=ridge.predict(teststandardized_dataset)

In [103]:
pd.concat([pd.Series(test["Id"], name="Id"), pd.Series(np.maximum(ridge_predictions, 0), name="SalePrice")], axis=1).to_csv("HousePrice_Submission2.csv", index=False)