In [1]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
final = pd.read_csv('../data/test_clean.csv', index_col=0)
train = pd.read_csv('../data/train_clean.csv', index_col=0)

In [3]:
final['Age'] = 2019 - final['YearBuilt']
final_cont = final[['LotArea','Age','GarageArea']]
final_categ = final[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [4]:
train['Age'] = 2019 - train['YearBuilt']
train_cont = train[['LotArea','Age','GarageArea']]

In [5]:
# boxcox and normalize dataframes with sklearn.stats
def make_boxed_normal(df):
    new_dict = {}
    logs = []
    devs = []
    means = []
    cols = df.columns
    for name in cols: 
        key = 'norm_' + name
        arr = np.array(df[name]) + 1
        arr, log_num = stats.boxcox(arr)
        upper = arr.max()
        lower = arr.min()
        value = (arr-lower)/(upper-lower)
        new_dict[key] = value
        logs.append(log_num)
        devs.append(np.std(arr))
        means.append(np.mean(arr))
        
    return pd.DataFrame(new_dict), logs, devs, means

In [6]:
train_norm, logs,devs,means = make_boxed_normal(train_cont)

In [7]:
train_categ = train[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [8]:
train_norm.reset_index(drop=True, inplace=True)
train_categ.reset_index(drop=True, inplace=True)

In [9]:
train_combo = pd.concat([train_norm, train_categ], axis=1, verify_integrity=True)

In [10]:
len(train_norm)

1437

In [11]:
len(train_categ)

1437

In [12]:
len(train_combo)

1437

In [13]:
# y = train['GrLivArea'] 
# mnGrLivArea = np.log(train.GrLivArea).mean()
# stdGrLivArea = np.log(train.GrLivArea).std()
# train_combo['LogLivArea'] = (np.log(y) - mnGrLivArea) / stdGrLivArea

In [35]:
train_combo['LogLivArea']= np.log(train['GrLivArea'])

In [36]:

# train_combo['LogLivArea'] = (train_combo['LogLivArea'] - train_combo['LogLivArea'].mean()) / train_combo['LogLivArea'].std()

In [37]:
np.log(train['GrLivArea']).std()

0.37092127853086587

In [38]:
train_combo['LogLivArea'].std()

0.3712709816605724

1437

In [15]:
train_combo.columns  #TRAIN_COMBO IS THE FINAL TRAINING DATA TO USE FOR TRAINING THE MULTI-MODEL

Index(['norm_LotArea', 'norm_Age', 'norm_GarageArea', 'BsmtFullBath',
       'KitchenAbvGr', 'GarageType', 'Kitchen', 'Fireplace', 'ExterQ', 'BsmtQ',
       'HeatingQ', 'n_toilets', 'LogLivArea'],
      dtype='object')

In [16]:
#We will pass into here the test data + log_list with lambdas from training data

def normalize_test(df, mean_arr, std_arr, log_list):
    new_dict = {}
    cols = df.columns
    i = 0
    for name in cols: 
        key = 'test_norm_' + name
        arr = np.array(df[name]) + 1 
        tempnum = log_list[i]
        print(tempnum)
        arr = stats.boxcox(arr, lmbda = tempnum)
        upper = arr.max()
        lower = arr.min()
        value = (arr-mean_arr[i])/(std_arr[i])
        print('Iter=',i)
        new_dict[key] = value
        i += 1
     
    return pd.DataFrame(new_dict)

In [17]:
final_norm = normalize_test(final_cont,means,devs,logs)

0.39726927868822215
Iter= 0
0.22006264293977928
Iter= 1
0.8207705726888868
Iter= 2


In [18]:
final_norm.reset_index(drop=True, inplace=True)
final_categ.reset_index(drop=True, inplace=True)

In [19]:
final_combo = pd.concat([final_norm,final_categ],axis=1)

In [20]:
x = final['GrLivArea'] 
mnGrLivArea = np.log(train.GrLivArea).mean()
stdGrLivArea = np.log(train.GrLivArea).std()
final_combo['LogLivArea'] = (np.log(x) - mnGrLivArea) / stdGrLivArea

In [21]:
final_combo.head(2)

Unnamed: 0,test_norm_LotArea,test_norm_Age,test_norm_GarageArea,BsmtFullBath,KitchenAbvGr,GarageType,Kitchen,Fireplace,ExterQ,BsmtQ,HeatingQ,n_toilets,LogLivArea
0,0.523506,0.555439,1.175736,0.0,1,1,3,0,3,3,3,1.0,-1.842024
1,1.042896,0.634372,-0.696247,0.0,1,1,4,0,3,3,3,2.0,0.642691


In [22]:
multi = linear_model.LinearRegression()

In [23]:
pd.isnull(train['GrLivArea']).sum()

0

In [24]:
pd.isnull(train_combo).sum()

norm_LotArea        0
norm_Age            0
norm_GarageArea     0
BsmtFullBath        0
KitchenAbvGr        0
GarageType          0
Kitchen             0
Fireplace           0
ExterQ              0
BsmtQ               0
HeatingQ            0
n_toilets           0
LogLivArea         13
dtype: int64

In [25]:
train_price = make_boxed_normal(train[['SalePrice']]-37800)

In [26]:
multi.fit(X=train_combo, y=train_price[0])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
train_combo.columns[train_combo.isna().any()].tolist()

In [None]:
train_combo.head(5)

In [None]:
train_combo.isna