In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV,ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error
%matplotlib inline

In [2]:
df = pd.read_csv('../data/clean_data',index_col='Id',)
test = pd.read_csv('../data/clean_test_data',index_col='Id')
print(df.shape,test.shape)

(2044, 81) (879, 80)


In [3]:
obj_list = ['Overall Qual',
            'Overall Cond',
            'MS SubClass',
            'Bsmt Full Bath',
            'Bsmt Half Bath',
            'Full Bath',
            'Half Bath',
            'Bedroom AbvGr',
            'Kitchen AbvGr',
            'Fireplaces',
            'Garage Cars',
            'Mo Sold',
            'Yr Sold',
           ]
for col in obj_list:
    df[col] = df[col].astype(object)
    test[col] = test[col].astype(object)

For these features, I chose to make them categorical instead of numerical. This is because there is no linear relationship between these values. For example, the difference between an overall quality of 2 and 3 may not have the same effect on price as the difference between an overall quality score of 8 and 9

#### Creating dummy variables for each column with the object data type, and appending it to the original data frame.

In [4]:
dummy_col_names = df.select_dtypes(include="object").columns
test_dummy_col_names = test.select_dtypes(include="object").columns
for col in dummy_col_names:
    df = df.join(pd.get_dummies(df[col],prefix=str(col)))
for col in test_dummy_col_names:
    test = test.join(pd.get_dummies(test[col],prefix=str(col)))

#### Creating our X and y data frames to fit our linear regression model, while removing columns that are linearly related to Total Area

In [6]:
X_col_list = list(df.select_dtypes(exclude='object').columns)
not_in_X = ['SalePrice',
            'Gr Liv Area',
            'Garage Area',
            'Total Bsmt SF',
            '2nd Flr SF',
            'Bsmt Unf SF',
            'BsmtFin SF 1',
            '1st Flr SF',
            'Low Qual Fin SF',
            'BsmtFin SF 2'
           ]
for col in not_in_X:
    X_col_list.remove(col)
X = df[X_col_list]
y = df[["SalePrice"]]

In [7]:
test_col_list = list(test.select_dtypes(exclude='object').columns)
not_in_test = ['Gr Liv Area',
                 'Garage Area',
                 'Total Bsmt SF',
                 '2nd Flr SF',
                 'Bsmt Unf SF',
                 'BsmtFin SF 1',
                 '1st Flr SF',
                 'Low Qual Fin SF',
                 'BsmtFin SF 2'
                ]
for col in not_in_test:
    test_col_list.remove(col)
test = test[test_col_list]

because these features are linearly related to Total Area, the feature that I created, they should not be included in the model.

#### Ensuring that our columns are in the correct order

In [8]:
for col in X_col_list:
    if col not in test_col_list:
        test[str(col)] = 0

for col in test_col_list:
    if col not in X_col_list:
        X_col_list.append(col)
        X[str(col)] = 0
test = test[X.columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [11]:
list(X.columns)

['PID',
 'Lot Frontage',
 'Lot Area',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'TotRms AbvGrd',
 'Garage Yr Blt',
 'Wood Deck SF',
 'Open Porch SF',
 'Enclosed Porch',
 '3Ssn Porch',
 'Screen Porch',
 'Pool Area',
 'Misc Val',
 'Total Area',
 'MS SubClass_20',
 'MS SubClass_30',
 'MS SubClass_40',
 'MS SubClass_45',
 'MS SubClass_50',
 'MS SubClass_60',
 'MS SubClass_70',
 'MS SubClass_75',
 'MS SubClass_80',
 'MS SubClass_85',
 'MS SubClass_90',
 'MS SubClass_120',
 'MS SubClass_150',
 'MS SubClass_160',
 'MS SubClass_180',
 'MS SubClass_190',
 'MS Zoning_A (agr)',
 'MS Zoning_C (all)',
 'MS Zoning_FV',
 'MS Zoning_I (all)',
 'MS Zoning_RH',
 'MS Zoning_RL',
 'MS Zoning_RM',
 'Street_Grvl',
 'Street_Pave',
 'Alley_Grvl',
 'Alley_No Alley',
 'Alley_Pave',
 'Lot Shape_IR1',
 'Lot Shape_IR2',
 'Lot Shape_IR3',
 'Lot Shape_Reg',
 'Land Contour_Bnk',
 'Land Contour_HLS',
 'Land Contour_Low',
 'Land Contour_Lvl',
 'Utilities_AllPub',
 'Utilities_NoSeWa',
 'Utilities_NoSewr',
 'Lo

In [9]:
X.to_csv('../data/to_be_split_train_X')
y.to_csv('../data/to_be_split_train_y')
test.to_csv('../data/to_be_modeled_test')