In [1]:
# Import modules
import pandas as pd
import numpy as np
import scipy
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
final = pd.read_csv('../../data/test_clean.csv', index_col=0)
train = pd.read_csv('../../data/train_clean.csv', index_col=0)

In [3]:
final['Age'] = 2019 - final['YearBuilt']
final_cont = final[['LotArea','Age','GarageArea']]
final_categ = final[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [4]:
train['Age'] = 2019 - train['YearBuilt']
train_cont = train[['LotArea','Age','GarageArea']]

In [5]:
# boxcox and normalize dataframes with sklearn.stats
def make_boxed_normal(df):
    new_dict = {}
    logs = []
    devs = []
    means = []
    cols = df.columns
    for name in cols: 
        key = 'norm_' + name
        arr = np.array(df[name]) + 1
        arr, log_num = stats.boxcox(arr)
        upper = arr.max()
        lower = arr.min()
        value = (arr-np.mean(arr))/(np.std(arr))
        new_dict[key] = value
        logs.append(log_num)
        devs.append(np.std(arr))
        means.append(np.mean(arr))
        
    return pd.DataFrame(new_dict), logs, devs, means

In [6]:
train_norm, logs,devs,means = make_boxed_normal(train_cont)

In [7]:
train_categ = train[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [8]:
#We will pass into here the test data + log_list with lambdas from training data

def normalize_test(df, mean_arr, std_arr, log_list):
    new_dict = {}
    cols = df.columns
    i = 0
    for name in cols: 
        key = 'test_norm_' + name
        arr = np.array(df[name]) + 1 
        tempnum = log_list[i]
        print(tempnum)
        arr = stats.boxcox(arr, lmbda = tempnum)
        upper = arr.max()
        lower = arr.min()
        value = (arr-mean_arr[i])/(std_arr[i])
        print('Iter=',i)
        new_dict[key] = value
        i += 1
     
    return pd.DataFrame(new_dict)

In [9]:
final_norm = normalize_test(final_cont,means,devs,logs)

0.39726927868822215
Iter= 0
0.22006264293977928
Iter= 1
0.8207705726888868
Iter= 2


In [10]:
#FUNCTION TO LOG & TRANSFORM GrLivArea
def make_log(df):
    new_dict = {}
    cols = df.columns
    for name in cols: 
        key = 'log_' + name
        arr = np.array(df[name])
        log_arr = np.log(arr+1)
        value = (log_arr - log_arr.mean())/ log_arr.std()
        new_dict[key] = value
     
    return pd.DataFrame(new_dict)

temp = make_log(train[['GrLivArea']])

temp.iloc[0].isnull().sum()

0

In [11]:
final_norm.reset_index(drop=True, inplace=True)
final_categ.reset_index(drop=True, inplace=True)
final_combo = pd.concat([final_norm,final_categ],axis=1)
final_templog = make_log(final[['GrLivArea']])

##Transforming the GrLivArea in Final-Test Data Set with LOG:
x = final['GrLivArea'] 
mnGrLivArea = np.log(train.GrLivArea).mean()
stdGrLivArea = np.log(train.GrLivArea).std()
final_combo['LogLivArea'] = final_templog

In [12]:
train_norm.reset_index(drop=True, inplace=True)
train_categ.reset_index(drop=True, inplace=True)
train_combo = pd.concat([train_norm,train_categ],axis=1)
train_templog = make_log(train[['GrLivArea']])

#Transforming the GrLivArea in Training Data Set with LOG:
y = train['GrLivArea'] 
mnGrLivArea = np.log(train.GrLivArea).mean()
stdGrLivArea = np.log(train.GrLivArea).std()
train_combo['LogLivArea'] = train_templog

In [25]:
multi = linear_model.LinearRegression()
train_price = make_boxed_normal(train[['SalePrice']]-37800)
multi.fit(X=train_combo, y=train_price)

ValueError: cannot copy sequence with size 1437 to array axis with dimension 1

In [23]:
pricepredict = multi.predict(final_combo)

In [24]:
#To transform these predictions back to price
#logs, devs, means = 1,2,3 of the object 
train_price[1] #=logs
train_price[2] #=deviation
train_price[3] #=means
#You multiply by the standard deviation, add back the mean.
#And then un-BoxCox it using the hyperparamter

KeyError: 1

In [None]:
pricepredict = (pricepredict * train_price[2])+train_price[3]
lmbda = train_price[1]
adjprice = scipy.special.inv_boxcox(pricepredict, lmbda)

In [None]:
pricepredict.mean()

In [None]:
adjprice.mean()

In [None]:
atest = train[['SalePrice']]


In [None]:
train['SalePrice'].mean()

In [None]:
train['GrLivArea'].mean()

In [None]:
final['GrLivArea'].mean()

In [None]:
a = [12,21,14,52,4,60,70]
b = pd.DataFrame(a)
b.columns = ['Test']
c = make_boxed_normal(b)
lmbda = c[1]
d = c[0]*c[2] + c[3]

In [None]:
scipy.special.inv_boxcox(d, lmbda)-1

In [None]:
pricepredict

In [None]:
train_combo.head(10)