In [1]:
# Import modules
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
final = pd.read_csv('../data/test_clean.csv', index_col=0)
train = pd.read_csv('../data/train_clean.csv', index_col=0)

In [3]:
final['Age'] = 2019 - final['YearBuilt']
final_cont = final[['LotArea','Age','GarageArea']]
final_categ = final[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [4]:
train['Age'] = 2019 - train['YearBuilt']
train_cont = train[['LotArea','Age','GarageArea']]

In [5]:
# boxcox and normalize dataframes with sklearn.stats
def make_boxed_normal(df):
    new_dict = {}
    logs = []
    devs = []
    means = []
    cols = df.columns
    for name in cols: 
        key = 'norm_' + name
        arr = np.array(df[name]) + 1
        arr, log_num = stats.boxcox(arr)
        upper = arr.max()
        lower = arr.min()
        
        value = (arr-np.mean(arr))/(np.std(arr))
        print (arr)
        new_dict[key] = value
        logs.append(log_num)
        devs.append(np.std(arr))
        means.append(np.mean(arr))
        
    return pd.DataFrame(new_dict), logs, devs, means

In [6]:
train_norm, logs,devs,means = make_boxed_normal(train_cont)

# train['Age'] = stats.boxcox(train['Age'])
# train['Age'] - np.mean(train['Age'])/np.std(train['Age'])

[88.88466171 93.63673908 99.88961533 ... 91.37651357 94.10054238
 94.96362037]
[3.93259214 5.90583755 4.14263436 ... 7.34224395 7.03003521 6.43179868]
[214.72436143 185.87808688 233.91277455 ... 113.11916884 108.64874634
 121.94845642]


In [7]:
train_categ = train[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [8]:
#We will pass into here the test data + log_list with lambdas from training data

def normalize_test(df, mean_arr, std_arr, log_list):
    new_dict = {}
    cols = df.columns
    i = 0
    for name in cols: 
        key = 'test_norm_' + name
        arr = np.array(df[name]) + 1 
        tempnum = log_list[i]
        print(tempnum)
        arr = stats.boxcox(arr, lmbda = tempnum)
        upper = arr.max()
        lower = arr.min()
        value = (arr-mean_arr[i])/(std_arr[i])
        print('Iter=',i)
        new_dict[key] = value
        i += 1
     
    return pd.DataFrame(new_dict)

In [9]:
final_norm = normalize_test(final_cont,means,devs,logs)

0.39726927868822215
Iter= 0
0.22006264293977928
Iter= 1
0.8207705726888868
Iter= 2


In [20]:
#FUNCTION TO LOG & TRANSFORM GrLivArea
def make_log(df):
    new_dict = {}
    cols = df.columns
    for name in cols: 
        key = 'log_' + name
        arr = np.array(df[name])
        log_arr = np.log(arr+1)
        value = (log_arr - log_arr.mean())/ log_arr.std()
        new_dict[key] = value
     
    return pd.DataFrame(new_dict)

# temp = make_log(train[['GrLivArea']])

# temp.iloc[0].isnull().sum()

In [21]:
final_norm.reset_index(drop=True, inplace=True)
final_categ.reset_index(drop=True, inplace=True)
final_combo = pd.concat([final_norm,final_categ],axis=1)

final_templog = make_log(final[['GrLivArea']])


print (final_templog)
##Transforming the GrLivArea in Final-Test Data Set with LOG:
x = final['GrLivArea'] 
mnGrLivArea = np.log(train.GrLivArea).mean()
stdGrLivArea = np.log(train.GrLivArea).std()
final_combo['LogLivArea'] = final_templog

      log_GrLivArea
0         -1.793177
1          0.643966
2          0.834282
3          0.589376
4         -0.356035
5         -0.170718
6          0.486689
7         -0.493216
8          0.300817
9         -0.121638
10         0.799071
11        -1.537473
12        -1.270200
13         0.290098
14        -1.976385
15         0.738569
16        -0.354321
17         0.063133
18         0.123871
19         2.208936
20         1.246424
21        -0.624587
22        -0.760824
23        -1.108203
24        -0.099767
25        -0.048920
26         0.893728
27        -0.268379
28        -0.981476
29         1.296461
...             ...
1414      -1.567101
1415      -0.164334
1416      -2.553487
1417      -2.678278
1418      -0.198025
1419       1.695326
1420       1.581602
1421       0.813432
1422      -0.156375
1423      -0.446694
1424       0.030753
1425       0.982016
1426       1.079437
1427       1.662758
1428       1.649136
1429      -0.260101
1430       1.035230
1431      -0.801072


In [22]:
train_norm.reset_index(drop=True, inplace=True)
train_categ.reset_index(drop=True, inplace=True)
train_combo = pd.concat([train_norm,train_categ],axis=1)
train_templog = make_log(train[['GrLivArea']])

#Transforming the GrLivArea in Training Data Set with LOG:
y = train['GrLivArea'] 
mnGrLivArea = np.log(train.GrLivArea).mean()
stdGrLivArea = np.log(train.GrLivArea).std()
train_combo['LogLivArea'] = train_templog

In [23]:
multi = linear_model.LinearRegression()
train_price, train_logs, train_devs, train_means = make_boxed_normal(train[['SalePrice']]-37800)
multi.fit(X=train_combo, y=train_price)

[102.88252769  97.83021699 105.44490873 ... 112.0494597   89.06318741
  90.38673086]


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [24]:
pricepredict = multi.predict(final_combo)

pricepredict

array([[-0.91483279],
       [ 0.03331182],
       [ 0.6064589 ],
       ...,
       [ 0.40546667],
       [-0.83562147],
       [ 0.44477303]])

In [25]:
#To transform these predictions back to price
#logs, devs, means = 1,2,3 of the object 
train_price
train_logs
train_devs
train_means
#You multiply by the standard deviation, add back the mean.
#And then un-BoxCox it using the hyperparamter





[94.95626602505818]

In [26]:
print (pricepredict)

pricepredict = (pricepredict * train_devs[0])+train_means[0]
lmbda = train_logs
adjprice = scipy.special.inv_boxcox(pricepredict, lmbda[0])

[[-0.91483279]
 [ 0.03331182]
 [ 0.6064589 ]
 ...
 [ 0.40546667]
 [-0.83562147]
 [ 0.44477303]]


In [27]:
adjprice

array([[ 77806.32534174],
       [132056.22396204],
       [175643.38748498],
       ...,
       [159337.80289055],
       [ 81579.05587887],
       [162437.28843059]])

In [33]:
adjprice.mean()+37800

179029.16506937367

In [34]:
train[['SalePrice']].mean()

SalePrice    180423.080028
dtype: float64