In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from scipy.stats import f_oneway
import seaborn as sns
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import math
from sklearn.metrics import mean_squared_error

In [2]:
# Load the training data from a CSV file on the local disk.
# NOTE(review): the path is absolute and machine-specific, so the notebook
# only runs as-is on the machine that has this exact folder layout.
path = r'C:\Users\User\Desktop\Excel\train.csv'
df_train = pd.read_csv(path)

In [3]:
# Re-wrap the loaded data in a DataFrame.
# NOTE(review): pd.read_csv already returns a DataFrame, so this step looks
# redundant - confirm before removing.
df_train = pd.DataFrame(df_train)

In [4]:
# Discard every column in which more than half of the rows are missing.
column_names = df_train.columns
row_count = len(df_train)
mostly_missing = [
    name
    for name in column_names
    if df_train[name].isna().sum() > row_count / 2
]
df_train = df_train.drop(columns=mostly_missing)

In [5]:
# Impute missing values column by column: numeric columns receive their mean,
# every other column receives its most frequent value.
# The decision is made on the column's dtype (is_numeric_dtype) because an
# isinstance() test on type(series) cannot tell numeric columns from text ones.
column_names_new = df_train.columns
for column_name in column_names_new:
    values = df_train[column_name]
    if values.notna().sum() == 0:
        # No observed value to derive a fill value from; leave the column as is.
        continue
    if pd.api.types.is_numeric_dtype(values):
        fill_value = values.mean()
    else:
        fill_value = values.value_counts().idxmax()
    df_train[column_name] = values.fillna(fill_value)

In [None]:
# Display summary statistics for every column (include='all' covers the
# non-numeric columns as well as the numeric ones).
df_train.describe(include='all')

In [7]:
# Z-score standardise the listed columns: subtract the column mean and divide
# by the column standard deviation, both taken from the training data.
for column in ('LotArea', 'YearBuilt', 'YearRemodAdd', 'BsmtUnfSF',
               '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea'):
    values = df_train[column]
    df_train[column] = (values - values.mean()) / values.std()

In [8]:
# One-hot encode every column whose dtype is object.
object_column = [
    name
    for name in column_names_new
    if df_train[name].dtypes == object
]
df_train = pd.get_dummies(df_train, columns=object_column)

In [9]:
# Separate the target column from the candidate feature names:
# X holds every column name except 'SalePrice', Y_train holds the target values.
X = [name for name in df_train.columns if name != 'SalePrice']
if 'SalePrice' in df_train.columns:
    Y_train = df_train['SalePrice']

In [10]:
# Keep the features whose Pearson correlation with the target is statistically
# significant (p < 0.001) and lies strictly between -1 and 1.
X_train = []
for feature in X:
    coefficient, significance = pearsonr(df_train[feature], Y_train)
    is_significant = significance < 0.001
    is_not_perfect = -1 < coefficient < 1
    if is_significant and is_not_perfect:
        X_train.append(feature)

In [None]:
# Pairwise correlation matrix of the retained features, drawn as a heatmap.
corr = df_train[X_train].corr()
sns.heatmap(corr)

In [12]:
# Reduce multicollinearity: remove a feature from X_train (in place) when its
# absolute correlation with any feature that comes after it exceeds 0.4.
# The correlation matrix is computed once instead of once per pair.
abs_corr = df_train[X_train].corr().abs()
i = 0
while i < len(X_train):
    current = X_train[i]
    later = X_train[i + 1:]
    if (abs_corr.loc[current, later] > 0.4).any():
        # After the deletion the next candidate sits at the same index, so the
        # index must not advance; otherwise that candidate would be skipped.
        del X_train[i]
    else:
        i += 1

In [None]:
def max_coef(list, n, data=None, target=None):
    """Return the names of the ``n`` features most positively correlated with the target.

    Parameters
    ----------
    list : sequence of str
        Candidate column names. The parameter name shadows the built-in but is
        kept so existing calls keep working. The sequence is not modified.
    n : int
        Maximum number of names to return. Fewer are returned when there are
        fewer usable candidates; a non-positive value yields an empty list.
    data : optional
        Object indexed by column name (e.g. a DataFrame). Defaults to the
        notebook-level ``df_train``.
    target : optional
        Values each candidate column is correlated against. Defaults to the
        notebook-level ``Y_train``.

    Returns
    -------
    list of str
        Names ordered by descending Pearson coefficient; candidates with equal
        coefficients keep their input order. Candidates whose coefficient is
        NaN are never returned.
    """
    if data is None:
        data = df_train
    if target is None:
        target = Y_train

    # Score every candidate exactly once; the built-in ``list`` is shadowed by
    # the parameter, so a literal is used to collect the results.
    scored = []
    for name in list:
        coefficient, _ = pearsonr(data[name], target)
        if not np.isnan(coefficient):
            scored.append((coefficient, name))

    # sort() is stable even with reverse=True, so ties keep their input order.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [name for _, name in scored[:max(n, 0)]]

# Select the ten features with the highest Pearson correlation to SalePrice.
# A copy of X_train is handed over so the candidate list itself is never
# altered by the selection and needs no repair afterwards.
# df_train is deliberately left untouched: adding df_train[selected_X_train]
# onto it would double the selected columns and turn every other column into
# NaN, so the model would be fitted on features scaled differently from the
# ones it later predicts on.
selected_X_train = max_coef(X_train.copy(), 10)
print(selected_X_train)

In [None]:
# Expand the selected features into polynomial terms up to degree 3 (without a
# constant bias column) and fit a linear regression on the expanded matrix.
poly = PolynomialFeatures(degree=3,include_bias=False)
X_train_poly = poly.fit_transform(df_train[selected_X_train])
lr = LinearRegression()
lr.fit(X_train_poly,Y_train)

In [15]:
# Load the test data from a CSV file on the local disk.
# NOTE(review): absolute, machine-specific path.
path2 = r'C:\Users\User\Desktop\Excel\test.csv'
df_test = pd.read_csv(path2)

In [16]:
# Re-wrap the loaded data in a DataFrame.
# NOTE(review): pd.read_csv already returns a DataFrame, so this step looks
# redundant - confirm before removing.
df_test = pd.DataFrame(df_test)

In [17]:
# Impute missing values in the test data column by column: numeric columns
# receive their mean, every other column receives its most frequent value.
# The decision is made on the column's dtype (is_numeric_dtype) because an
# isinstance() test on type(series) cannot tell numeric columns from text ones.
# The fill values are computed from the test data itself.
column_names_new = df_test.columns
for column_name in column_names_new:
    values = df_test[column_name]
    if values.notna().sum() == 0:
        # No observed value to derive a fill value from; leave the column as is.
        continue
    if pd.api.types.is_numeric_dtype(values):
        fill_value = values.mean()
    else:
        fill_value = values.value_counts().idxmax()
    df_test[column_name] = values.fillna(fill_value)

In [18]:
# Z-score standardise the listed test columns using the test set's own mean
# and standard deviation.
not_standard_columns = [
    'LotArea', 'YearBuilt', 'YearRemodAdd', 'BsmtUnfSF',
    '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea',
]
for column in not_standard_columns:
    values = df_test[column]
    df_test[column] = (values - values.mean()) / values.std()

In [19]:
# One-hot encode every test column whose dtype is object.
object_column = [
    name
    for name in df_test.columns
    if df_test[name].dtypes == object
]
df_test = pd.get_dummies(df_test, columns=object_column)

In [20]:
# Predict sale prices for the test rows. The polynomial expansion that was
# fitted on the training features is only applied here (transform), not
# refitted on the test data.
yhat = lr.predict(poly.transform(df_test[selected_X_train]))

In [21]:
# Pair each test Id with its predicted sale price.
Predict_values = dict(Id=df_test['Id'], SalePrice=yhat)
df_predict = pd.DataFrame(data=Predict_values)

In [22]:
# Write the predictions to disk. index=False keeps the file to exactly the
# 'Id' and 'SalePrice' columns; without it the row index would be written as
# an additional unnamed first column.
# NOTE(review): absolute, machine-specific output path.
csv_path = r'C:\Users\User\Desktop\Excel\Predict_SalePrice.csv'
df_predict.to_csv(csv_path, index=False)

In [24]:
# Load the sample submission file from the local disk.
# NOTE(review): absolute, machine-specific path; the pd.DataFrame re-wrap
# looks redundant because pd.read_csv already returns a DataFrame - confirm.
path3 = r'C:\Users\User\Desktop\Excel\sample_submission.csv'
df_sample = pd.read_csv(path3)
df_sample = pd.DataFrame(df_sample)

In [None]:
# Root-mean-squared error between the natural logs of the reference prices
# and of the predictions.
# NOTE(review): the reference values come from sample_submission.csv -
# confirm they are real sale prices rather than placeholder values.
log_reference = np.log(df_sample['SalePrice'])
log_predicted = np.log(yhat)
MSE = mean_squared_error(log_reference, log_predicted)
RMSE = math.sqrt(MSE)
RMSE