In [27]:
# special IPython command to prepare the notebook for matplotlib
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import math

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from utils.dataManagers.steward import DataSteward
from utils.dataManagers.informations import informer

ds = DataSteward('mapper_and_most_frequent', 'mean')
column_desc = informer.get_column_descriptions()

train = ds.train_data
test = ds.test_data
ytrain = ds.train_response

In [30]:
def get_df_for_predictions(train, test):
    all_data = pd.concat((train, test))
    categorical = all_data.select_dtypes(['category'])  
    number_type = all_data.select_dtypes(['int64', 'float64']) 
    
    df = pd.get_dummies(categorical).join(number_type)
    return df.iloc[:train.shape[0], :], df.iloc[train.shape[0]:, :]

trainWithDummies, testWithDummies = get_df_for_predictions(train, test)

In [33]:
from sklearn.linear_model import LinearRegression

clf = LinearRegression(normalize=True)
clf.fit(trainWithDummies, ytrain)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [95]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso

alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
param_grid = dict(alpha=alphas)

clf = Ridge(normalize=True)
grid_ridge = GridSearchCV(clf, param_grid)
grid_ridge.fit(trainWithDummies, ytrain)

predictions = grid_ridge.predict(testWithDummies)
get_submission(predictions, 'ridge_normalized_alpha_tuned').head()

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=True, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,113853.367766
1462,147668.438883
1463,184987.302361
1464,192471.257008
1465,212642.778402


In [96]:
clf = Lasso()
grid_lasso = GridSearchCV(clf, param_grid)
grid_lasso.fit(trainWithDummies, ytrain)

predictions = grid_lasso.predict(testWithDummies)
get_submission(predictions, 'lasso_normalized_alpha_tuned').head()



GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,105176.419535
1462,146275.788274
1463,185132.460119
1464,193517.653019
1465,220236.475947


In [70]:
def get_submission(predictions, predictions_file_name=None):
    df = pd.DataFrame(predictions, columns=['SalePrice'])
    df.index = test.index
    df[df.SalePrice < 0] = 0
    if predictions_file_name:
        df.to_csv(f'scores/{predictions_file_name}.csv')
    return df

In [71]:
predictions = clf.predict(testWithDummies)
df = get_submission(predictions, 'default_linear_regression')
df.head()

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,95488.0
1462,132224.0
1463,176768.0
1464,189440.0
1465,220288.0


In [None]:
# 

In [56]:
def get_df_for_predictions(data):
    df = data.copy()
    del df['Id']
    number_type = df.select_dtypes(['int64', 'float64']) 
    number_type = df.fillna(df.mean())
    categorical = pd.get_dummies(df.select_dtypes(['category']))
    categorical.reset_index(inplace=True)
    return categorical, number_type

train_categorical, train_number_type = get_df_for_predictions(train)

sd = StandardScaler()
number_type_normalized = sd.fit_transform(train_number_type)
number_type_normalized = pd.DataFrame(number_type_normalized, columns = train_number_type.columns)
number_type_normalized.head()

x_train = train_categorical.join(number_type_normalized)

y_train_dropped = ytrain.drop(ytrain.index[rows_with_nulls.index - 1])


ValueError: fill value must be in categories

In [49]:
from sklearn import linear_model
clf = linear_model.LinearRegression()
clf.fit(x_train, y_train_dropped)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [53]:
test_categorical, test_number_type = get_df_for_predictions(test)

test_nt_norm =  sd.transform(test_number_type.dropna())
test_nt_norm = pd.DataFrame(test_nt_norm, columns = test_number_type.columns)

x_test = test_categorical.join(test_nt_norm)

predictions = clf.predict(x_test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [7]:
# df = train.dropna(subset=[rows_with_nulls.columns]).copy()
# del df['Id']
# number_type = df.select_dtypes(['int64', 'float64']) 
# categorical = pd.get_dummies(df.select_dtypes(['category']))
# categorical.reset_index(inplace=True)

In [8]:
# from sklearn.preprocessing import StandardScaler

# sd = StandardScaler()
# number_type_normalized = sd.fit_transform(number_type)
# number_type_normalized = pd.DataFrame(number_type_normalized, columns = number_type.columns)
# number_type_normalized.head()

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,-0.23357,-0.205885,0.99293,0.823953,0.462009,0.571581,-0.282645,-0.986817,-0.496377,-0.822753,...,0.235641,-0.760257,0.231036,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,-1.615345,0.153084
1,0.384834,-0.064358,0.120665,-0.460746,-0.572748,1.152559,-0.282645,-0.689078,0.415045,0.229055,...,-0.224712,1.68609,-0.716739,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,-0.498715,-0.596291
2,-0.109889,0.138702,0.928317,0.776371,0.28251,0.101672,-0.282645,-0.355789,-0.352705,-0.656951,...,0.549518,-0.760257,-0.064173,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,0.990125,0.153084
3,-0.439705,-0.070512,-1.850006,-0.746235,-0.572748,-0.475034,-0.282645,-0.120264,-0.720866,-0.550734,...,0.727382,-0.760257,-0.172934,4.083851,-0.11253,-0.278676,-0.072999,-0.141407,-1.615345,-1.345665
4,0.549742,0.509132,0.896011,0.681208,1.275032,0.462647,-0.282645,-0.23136,0.152394,-0.074052,...,1.74225,0.815913,0.588393,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,2.106755,0.153084


In [42]:
# x_train = categorical.join(number_type_normalized)
# x_train.head()

Unnamed: 0,index,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
0,0,0,0,0,0,0,1,0,0,0,...,0.235641,-0.760257,0.231036,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,-1.615345,0.153084
1,1,1,0,0,0,0,0,0,0,0,...,-0.224712,1.68609,-0.716739,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,-0.498715,-0.596291
2,2,0,0,0,0,0,1,0,0,0,...,0.549518,-0.760257,-0.064173,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,0.990125,0.153084
3,3,0,0,0,0,0,0,1,0,0,...,0.727382,-0.760257,-0.172934,4.083851,-0.11253,-0.278676,-0.072999,-0.141407,-1.615345,-1.345665
4,4,0,0,0,0,0,1,0,0,0,...,1.74225,0.815913,0.588393,-0.356622,-0.11253,-0.278676,-0.072999,-0.141407,2.106755,0.153084


In [10]:
y_train_dropped = ytrain.drop(ytrain.index[rows_with_nulls.index - 1])
y_train_dropped.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [76]:
def get_dummy_variables_for_object_dtypes(in_df, print_progress=False):
    df = in_df.copy()
    for col in df.columns:
        if df[col].dtype == 'object':
            dummies = pd.get_dummies(train[col].values.astype(str), dummy_na=False)
            if print_progress:
                print(f'Getting dummies from {col} column')
                print(f'Columns are: {dummies.columns.values}')
            df = df.join(dummies, rsuffix='_' + col)
            del df[col]
    return df
            
df = get_dummy_variables_for_object_dtypes(train)
df.head()

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,ConLw,New,Oth,WD,Abnorml,AdjLand,Alloca,Family,Normal,Partial
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,0,0,1,0,0,0,0,1,0
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,0,0,0,1,0,0,0,0,1,0
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,0,0,1,0,0,0,0,1,0
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,0,0,1,1,0,0,0,0,0
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,0,0,0,1,0,0,0,0,1,0
