MULTILINEAR REGRESSION FRAMEWORK FOR TRAIN/TEST

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing, datasets, linear_model
from sklearn.model_selection import train_test_split, KFold


In [2]:
training = pd.read_csv('../data/train_clean.csv', index_col=0)
training.shape

(1437, 17)

In [3]:
#Convert year building was builtinto a numeric Age 
training['Age'] = 2019 - training['YearBuilt']


In [4]:
# normalizes dataframes using preprocessing from sklearn

def make_normal(df):
    new_dict = {}
    cols = df.columns
    for name in cols: 
        key = 'norm_' + name
        arr = np.array(df[name])
        value = preprocessing.normalize([arr])      
        new_dict[key] = value[0]
        ind = range(df.shape[0]-1) 
    return pd.DataFrame(new_dict)
    

In [5]:
# Make continuous features normal. Return normed df
norm_list = ['LotArea', 'Age', 'GrLivArea', 'GarageArea']
norm_df = training[norm_list]

norm_df = make_normal(norm_df)

In [6]:
# Mutates main df. Drops un-normed continuous values
training = training.drop(norm_list, axis=1)

In [7]:
# Mutates main df. Removes ordinal years. Keeps as subset
years_df = training[['YearBuilt', 'YearRemodAdd']]
training = training.drop(['YearBuilt', 'YearRemodAdd'], axis=1)

In [8]:
# Mutates main df. Adds normed values
training = pd.concat([training, norm_df], axis = 1)

In [9]:
training.sample(5) # for inspection. This df will be used for training/testing

Unnamed: 0,BsmtFullBath,BedroomAbvGr,KitchenAbvGr,GarageType,SalePrice,Kitchen,Fireplace,ExterQ,BsmtQ,HeatingQ,n_toilets,n_showers,norm_LotArea,norm_Age,norm_GrLivArea,norm_GarageArea
291,1.0,3.0,1.0,1.0,167000.0,3.0,4.0,3.0,3.0,3.0,2.0,2.0,0.033389,0.032389,0.025722,0.014411
1164,0.0,3.0,1.0,1.0,200500.0,5.0,4.0,3.0,3.0,3.0,3.0,3.0,0.085562,0.026286,0.020443,0.050846
1247,1.0,2.0,1.0,1.0,301500.0,4.0,3.0,4.0,4.0,5.0,3.0,3.0,0.032181,0.005163,0.026243,0.038122
1198,1.0,3.0,1.0,1.0,200000.0,4.0,0.0,4.0,4.0,5.0,3.0,3.0,0.015197,0.033797,0.01186,0.0
89,0.0,2.0,1.0,1.0,98600.0,3.0,0.0,3.0,3.0,3.0,1.0,1.0,0.032537,0.046001,0.021808,0.022076


In [10]:
norm_list = ['norm_'+x for x in norm_list]

In [11]:
training.shape

(1450, 16)

In [12]:
training = training.dropna()
training.shape

(1450, 16)

In [13]:
# Remove "SalePrice column from train_x"
train_y = training['SalePrice'] 
train_x = training.drop('SalePrice', axis=1)

In [14]:
# Split Training data 

train_x.isnull().sum()
 
                            


BsmtFullBath       13
BedroomAbvGr       13
KitchenAbvGr       13
GarageType         13
Kitchen            13
Fireplace          13
ExterQ             13
BsmtQ              13
HeatingQ           13
n_toilets          13
n_showers          13
norm_LotArea       13
norm_Age           13
norm_GrLivArea     13
norm_GarageArea    13
dtype: int64

In [15]:
#Multilinear - Simple -- Unpenalized model i.e. lambda = 0
from sklearn import linear_model
ols = linear_model.LinearRegression()
ols.fit(train_x, train_y)
print("Intercept: %f" %ols.intercept_)
print("Coefficients: %s" %str(ols.coef_))
print("R^2: %f" %(ols.score(train_x, train_y)))

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

LASSO REGRESSION / COEFFICIENT ESTIMATES AT DIFFERENT ALPHA LEVELS

In [None]:
lasso  = Lasso()
alphas = np.linspace(0.1,0.9,20)
lasso.set_params(normalize=False)
coefs_lasso  = []
intercepts_lasso = []

for alpha in alphas:
    lasso.set_params(alpha=alpha)
    lasso.fit(train_x, train_y)  
    coefs_lasso.append(lasso.coef_)
    intercepts_lasso.append(lasso.intercept_)
        
coefs_lasso = pd.DataFrame(coefs_lasso, index = alphas, columns = train_x.columns)  
coefs_lasso.head(10)

In [None]:
for name in coefs_lasso.columns:
    plt.plot(coefs_lasso.index, coefs_lasso[name], label=name)
plt.xlabel(r'hyperparameter $\lambda$')
plt.ylabel(r'slope values')

plt.legend()   

In [None]:
ridge = Ridge() # create a ridge regression instance
ridge.set_params(normalize=False)
ridge.fit(train_x, train_y) # fit data
ridge.coef_, ridge.intercept_ # print out the coefficients
print("The coef of determination of ridge regression is: %.4f" %ridge.score(train_x, train_y))

In [None]:
ridge_scores_train = []
lasso_scores_train = []
#ridge_scores_test  = []
#lasso_scores_test  = []

alphas = np.logspace(-2, 3, 10)

for alpha in alphas:
    ridge.set_params(alpha=alpha, normalize=False)
    lasso.set_params(alpha=alpha, normalize=False)
    ridge.fit(train_x, train_y)
    lasso.fit(train_x, train_y)
    print(ridge.score(train_x,train_y))
    ridge_scores_train.append(ridge.score(train_x, train_y))
    #ridge_scores_test.append(ridge.score(X_test, Y_test))
    lasso_scores_train.append(lasso.score(train_x, train_y))
    #lasso_scores_test.append(lasso.score(X_test, Y_test))
    
ridge_scores_train = np.array(ridge_scores_train) 
#ridge_scores_test  = np.array(ridge_scores_test)
lasso_scores_train = np.array(lasso_scores_train) 
#lasso_scores_test  = np.array(lasso_scores_test)

In [None]:
plt.plot(alphas, ridge_scores_train, label=r'$train\ R^2$')
#plt.plot(alphas, ridge_scores_test, label=r'$test\ R^2$')
plt.legend(loc=1)
plt.title(r'Ridge Train $R^2$')
#ridge_underfit = ridge_scores_train < ridge_scores_test
#last_underfit  = np.max(alphas[ridge_underfit])
#plt.axvline(last_underfit, linestyle='--', color='g', label='optimal lambda', alpha=0.4)
plt.legend(loc=1)
plt.xlabel(r'hyperparameter $\lambda$')
plt.ylabel(r'$R^2$')
ridge_scores_train

In [None]:
plt.plot(alphas, lasso_scores_train, label=r'$train\ R^2$')
#plt.plot(alphas, lasso_scores_test, label=r'$test\ R^2$')
plt.title(r'Lasso Train $R^2$ Comparison')
#lasso_underfit = lasso_scores_train < lasso_scores_test
#last_underfit  = np.max(alphas[lasso_underfit])
#plt.axvline(last_underfit, linestyle='--', color='g', label='optimal lambda', alpha=0.4)
plt.legend(loc=1)
plt.xlabel(r'hyperparameter $\lambda$')
plt.ylabel(r'$R^2$')

In [None]:
lasso_scores_train

In [None]:
ridge_scores_train