In [1]:
# Import modules
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [2]:
final = pd.read_csv('../data/test_clean.csv', index_col=0)
train = pd.read_csv('../data/train_clean.csv', index_col=0)

In [3]:
final['Age'] = 2019 - final['YearBuilt']
final_cont = final[['LotArea','Age','GarageArea']]
final_categ = final[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [4]:
train['Age'] = 2019 - train['YearBuilt']
train_cont = train[['LotArea','Age','GarageArea']]

In [5]:
# boxcox and normalize dataframes with sklearn.stats
def make_boxed_normal(df):
    new_dict = {}
    logs = []
    devs = []
    means = []
    cols = df.columns
    for name in cols: 
        key = 'norm_' + name
        arr = np.array(df[name]) + 1
        arr, log_num = stats.boxcox(arr)
        upper = arr.max()
        lower = arr.min()
        value = (arr-lower)/(upper-lower)
        new_dict[key] = value
        logs.append(log_num)
        devs.append(np.std(arr))
        means.append(np.mean(arr))
        
    return pd.DataFrame(new_dict), logs, devs, means

In [6]:
train_norm, logs,devs,means = make_boxed_normal(train_cont)

In [7]:
train_categ = train[['BsmtFullBath','KitchenAbvGr','GarageType','Kitchen','Fireplace','ExterQ',
                    'BsmtQ','HeatingQ','n_toilets']] #remove n_showers

In [8]:
train_combo = pd.concat([train_norm,train_categ],axis=1)

In [10]:
y = train['GrLivArea'] 
mnGrLivArea = np.log(train.GrLivArea).mean()
stdGrLivArea = np.log(train.GrLivArea).std()
train_combo['LogLivArea'] = (np.log(y) - mnGrLivArea) / stdGrLivArea

In [11]:
train_combo.columns  #TRAIN_COMBO IS THE FINAL TRAINING DATA TO USE FOR TRAINING THE MULTI-MODEL

Index(['norm_LotArea', 'norm_Age', 'norm_GarageArea', 'BsmtFullBath',
       'KitchenAbvGr', 'GarageType', 'Kitchen', 'Fireplace', 'ExterQ', 'BsmtQ',
       'HeatingQ', 'n_toilets', 'LogLivArea'],
      dtype='object')

In [12]:
#We will pass into here the test data + log_list with lambdas from training data

def normalize_test(df, mean_arr, std_arr, log_list):
    new_dict = {}
    cols = df.columns
    i = 0
    for name in cols: 
        key = 'test_norm_' + name
        arr = np.array(df[name]) + 1 
        tempnum = log_list[i]
        print(tempnum)
        arr = stats.boxcox(arr, lmbda = tempnum)
        upper = arr.max()
        lower = arr.min()
        value = (arr-mean_arr[i])/(std_arr[i])
        print('Iter=',i)
        new_dict[key] = value
        i += 1
     
    return pd.DataFrame(new_dict)

In [13]:
final_norm = normalize_test(final_cont,means,devs,logs)

0.39726927868822215
Iter= 0
0.22006264293977928
Iter= 1
0.8207705726888868
Iter= 2


In [14]:
final_combo = pd.concat([final_norm,final_categ],axis=1)

In [15]:
x = final['GrLivArea'] 
mnGrLivArea = np.log(train.GrLivArea).mean()
stdGrLivArea = np.log(train.GrLivArea).std()
final_combo['LogLivArea'] = (np.log(x) - mnGrLivArea) / stdGrLivArea

In [16]:
final_combo.head(2)

Unnamed: 0,test_norm_LotArea,test_norm_Age,test_norm_GarageArea,BsmtFullBath,KitchenAbvGr,GarageType,Kitchen,Fireplace,ExterQ,BsmtQ,HeatingQ,n_toilets,LogLivArea
0,0.523506,0.555439,1.175736,0.0,1,1,3,0,3,3,3,1.0,-1.842024
1,1.042896,0.634372,-0.696247,0.0,1,1,4,0,3,3,3,2.0,0.642691


In [17]:
multi = linear_model.LinearRegression()

In [18]:
train_price = make_boxed_normal(train[['SalePrice']]-37800)

In [19]:
multi.fit(X=train_combo, y=train_price[0])

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [20]:
train_combo.columns[train_combo.isna().any()].tolist()

['norm_LotArea',
 'norm_Age',
 'norm_GarageArea',
 'BsmtFullBath',
 'KitchenAbvGr',
 'GarageType',
 'Kitchen',
 'Fireplace',
 'ExterQ',
 'BsmtQ',
 'HeatingQ',
 'n_toilets',
 'LogLivArea']

In [21]:
train_combo.head(5)

Unnamed: 0,norm_LotArea,norm_Age,norm_GarageArea,BsmtFullBath,KitchenAbvGr,GarageType,Kitchen,Fireplace,ExterQ,BsmtQ,HeatingQ,n_toilets,LogLivArea
0,0.350879,0.153033,0.464833,1.0,1.0,1.0,4.0,0.0,4.0,4.0,5.0,4.0,0.832204
1,0.385661,0.476262,0.402387,0.0,1.0,1.0,3.0,3.0,3.0,4.0,5.0,3.0,0.628286
2,0.431427,0.187439,0.506372,1.0,1.0,1.0,4.0,3.0,4.0,4.0,5.0,4.0,0.666528
3,0.384202,0.837357,0.529582,1.0,1.0,1.0,4.0,4.0,3.0,3.0,4.0,2.0,0.23089
4,0.505452,0.203592,0.658179,1.0,1.0,1.0,4.0,3.0,4.0,4.0,5.0,4.0,1.280433


In [None]:
train_co