### Importing all relevant libraries

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, LassoCV,ElasticNetCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import mean_squared_error
%matplotlib inline

### Reading in both training and testing data sets

In [2]:
df = pd.read_csv('../data/train.csv',index_col='Id')
test = pd.read_csv('../data/test.csv',index_col='Id')
print('Training Data shape:',df.shape,'Testing Data shape:',test.shape)

Training Data shape: (2051, 80) Testing Data shape: (879, 79)


# Exploratory Data Analysis

In [3]:
df.head()

Unnamed: 0_level_0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
109,533352170,60,RL,,13517,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2010,WD,130500
544,531379050,60,RL,43.0,11492,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,4,2009,WD,220000
153,535304180,20,RL,68.0,7922,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,1,2010,WD,109000
318,916386060,60,RL,73.0,9802,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2010,WD,174000
255,906425045,50,RL,82.0,14235,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,3,2010,WD,138500


In [4]:
df.describe().T.head()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PID,2051.0,713590000.0,188691800.0,526301100.0,528458140.0,535453200.0,907180080.0,924152030.0
MS SubClass,2051.0,57.00878,42.82422,20.0,20.0,50.0,70.0,190.0
Lot Frontage,1721.0,69.0552,23.26065,21.0,58.0,68.0,80.0,313.0
Lot Area,2051.0,10065.21,6742.489,1300.0,7500.0,9430.0,11513.5,159000.0
Overall Qual,2051.0,6.11214,1.426271,1.0,5.0,6.0,7.0,10.0


In [5]:
def box_func(df, ylabel):
    data = df.select_dtypes(include="object")
    fig, ax = plt.subplots(ncols = 1, nrows = data.shape[1], figsize = (8,data.shape[1]*6))
    i=0
    for col in data:
        sns.boxplot(x = ylabel, y = col, data = df,orient='h',ax=ax[i])
        i+=1
    plt.tight_layout()
#box_func(df,"SalePrice")

In [6]:
def scatter(df,target):
    data = df.select_dtypes(exclude="object")
    fig, ax = plt.subplots(ncols = 1, nrows = data.shape[1], figsize = (7,data.shape[1]*3))
    i = 0
    for col in data:
        sns.regplot(x = df[col],y = df[target],fit_reg=False, ax=ax[i])
        i += 1
        plt.tight_layout()
#scatter(df,"SalePrice")

# Data Cleaning

In [7]:
fill_na={'Lot Frontage':np.mean(df['Lot Frontage']),
         'Alley':'No Alley',
         'Mas Vnr Type':'No Vnr',
         'Mas Vnr Area':0,
         'Bsmt Qual':'No Bsmt',
         'Bsmt Cond':'No Bsmt',
         'Bsmt Exposure':'No Bsmt',
         'BsmtFin Type 1':'No Bsmt',
         'BsmtFin SF 1':0,
         'BsmtFin Type 2':'No Bsmt',
         'BsmtFin SF 2':0,
         'Bsmt Unf SF':0,
         'Total Bsmt SF':0,
         'Bsmt Full Bath':0,
         'Bsmt Half Bath':0,
         'Fireplace Qu':'No Fire',
         'Garage Type':'No Garage',
         'Garage Yr Blt':0,
         'Garage Qual':'No Garage',
         'Garage Cond':'No Garage',
         'Garage Finish':"No Garage",
         'Garage Cars':0,
         'Garage Area':0,
         'Pool QC':'No Pool',
         'Fence':'No Fence',
         'Misc Feature':'No Feature',
         'Electrical':'Mix'
        }

#### Additional Feature Engineering

In [8]:
df.fillna(value = fill_na,inplace = True)
test.fillna(fill_na,inplace=True)
df["Total Area"] = df["Total Bsmt SF"] + df["Gr Liv Area"] + df['Garage Area']
test['Total Area'] = test["Total Bsmt SF"] + test['Gr Liv Area'] + test["Garage Area"]

In [9]:
print("There are",df.isnull().sum().sum(),"null values in our training data frame")
print("There are",test.isnull().sum().sum(),"null values in our testing data frame")

There are 0 null values in our training data frame
There are 0 null values in our testing data frame


#### dropping outliers/incorrect values

In [10]:
df.drop(df[df['Lot Area']>100000].index,inplace=True)
df.drop(df[df['Garage Yr Blt']>2018].index,inplace=True)
df.drop(df[df['Total Area']>10000].index,inplace=True)

In [11]:
df.to_csv("../data/clean_data")
test.to_csv("../data/clean_test_data")

In [12]:
obj_list = ['Overall Qual',
            'Overall Cond',
            'MS SubClass',
            'Bsmt Full Bath',
            'Bsmt Half Bath',
            'Full Bath',
            'Half Bath',
            'Bedroom AbvGr',
            'Kitchen AbvGr',
            'Fireplaces',
            'Garage Cars',
            'Mo Sold',
            'Yr Sold',
           ]
for col in obj_list:
    df[col] = df[col].astype(object)
    test[col] = test[col].astype(object)

#### Creating dummy variables for each column with the object data type, and appending it to the original data frame.

In [13]:
dummy_col_names = df.select_dtypes(include="object").columns
test_dummy_col_names = test.select_dtypes(include="object").columns
for col in dummy_col_names:
    df = df.join(pd.get_dummies(df[col],prefix=str(col)))
for col in test_dummy_col_names:
    test = test.join(pd.get_dummies(test[col],prefix=str(col)))

#### Creating our X and y data frames to fit our linear regression model, while removing columns that are linearly related to Total Area

In [14]:
X_col_list = list(df.select_dtypes(exclude='object').columns)
not_in_X = ['SalePrice',
            'Gr Liv Area',
            'Garage Area',
            'Total Bsmt SF',
            '2nd Flr SF',
            'Bsmt Unf SF',
            'BsmtFin SF 1',
            '1st Flr SF',
            'Low Qual Fin SF',
            'BsmtFin SF 2'
           ]
for col in not_in_X:
    X_col_list.remove(col)
X = df[X_col_list]
y = df["SalePrice"]

In [15]:
test_col_list = list(test.select_dtypes(exclude='object').columns)
not_in_test = ['Gr Liv Area',
                 'Garage Area',
                 'Total Bsmt SF',
                 '2nd Flr SF',
                 'Bsmt Unf SF',
                 'BsmtFin SF 1',
                 '1st Flr SF',
                 'Low Qual Fin SF',
                 'BsmtFin SF 2'
                ]
for col in not_in_test:
    test_col_list.remove(col)
test = test[test_col_list]

#### Ensuring that our columns are in the correct order

In [16]:
for col in X_col_list:
    if col not in test_col_list:
        test[str(col)] = 0

for col in test_col_list:
    if col not in X_col_list:
        X_col_list.append(col)
        X[str(col)] = 0
test = test[X.columns]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


#### Splitting our data into training and testing sets.

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=9)

#### Scaling the training data, and then scaling the testing data based on the scale created by the training data

In [18]:
ss = StandardScaler()
X_Scaled_train = ss.fit_transform(X_train)
X_Scaled_test = ss.transform(X_test)

# Modeling

In [19]:
ENCV = ElasticNetCV(l1_ratio=[.001,.3,.5,.7,.9,1],n_alphas=100)
ENCV.fit(X_Scaled_train,y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=[0.001, 0.3, 0.5, 0.7, 0.9, 1], max_iter=1000,
       n_alphas=100, n_jobs=1, normalize=False, positive=False,
       precompute='auto', random_state=None, selection='cyclic',
       tol=0.0001, verbose=0)

# Model Evaluation

In [20]:
print("l1_ratio:",ENCV.l1_ratio_)
print('Alpha:',ENCV.alpha_)
print('train score:',ENCV.score(X_Scaled_train,y_train))
print('test score:',ENCV.score(X_Scaled_test,y_test))
y_hat_train = ENCV.predict(X_Scaled_train)
y_hat_test = ENCV.predict(X_Scaled_test)
print('Train RMSE:',np.sqrt(mean_squared_error(y_train,y_hat_train)))
print('Test RMSE:',np.sqrt(mean_squared_error(y_test,y_hat_test)))

l1_ratio: 1.0
Alpha: 534.5013540863487
train score: 0.9407763851144852
test score: 0.9259456031236891
Train RMSE: 18971.387221036082
Test RMSE: 22575.78200896088


In [21]:
X_col_list = X.columns
coef_df = pd.DataFrame(data = ENCV.coef_,index=X_col_list,columns=["Coefficient"])
coef_df = coef_df[coef_df['Coefficient'] !=0]
coef_df.sort_values(by='Coefficient').head(15)

Unnamed: 0,Coefficient
Overall Cond_3,-3190.903111
Fireplaces_0,-2939.08128
Bsmt Full Bath_0.0,-2837.22773
Full Bath_1,-2400.325848
Exter Qual_TA,-2142.972102
Overall Cond_2,-1865.257668
BsmtFin Type 1_Unf,-1714.767763
Functional_Maj1,-1657.294092
Overall Cond_4,-1455.436068
Heating QC_TA,-1452.292361


In [22]:
coef_df.sort_values(by='Coefficient').tail(15)

Unnamed: 0,Coefficient
Neighborhood_NoRidge,3361.848809
Bsmt Exposure_Gd,3458.220229
Year Remod/Add,3478.750402
Bldg Type_1Fam,3543.907234
Neighborhood_GrnHill,3553.992559
Half Bath_1,3749.586557
Bsmt Qual_Ex,3835.251432
Neighborhood_StoneBr,4021.776584
Lot Area,4163.969102
Kitchen Qual_Ex,4239.602445


# Kaggle Submission

In [23]:
scaled_test = ss.transform(test)

In [24]:
data = ENCV.predict(scaled_test)

In [26]:
submission = pd.DataFrame(data = data,index = test.index,columns=['SalePrice'])

In [None]:
submission.to_csv("../data/Submission_File",index_label='Id')

# Creating Data for Analysis

In [None]:
y_hat = ENCV.predict(X_test)