# Matthew Garton
# Regression Project - Starter Code

### Purpose: "Messy" Notebook for keeping general notes, tinkering, experimentation, etc...

This notebook gives a useful view into my work process and highlights the early steps I took to familiarize myself with the data and ultimately develop my data cleaning function. I would recommend reading up to where I defined the clean_ames_data() function, then moving on to the EDA notebook, followed by the Feature Engineering and Model Design book. The Data Handling Functions notebook is meant to serve as a reference and a template for a more systematic workflow.

In [None]:
# import necessary modules
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
import sklearn.linear_model as linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# use seaborn's 'darkgrid' style for the figures in this notebook
sns.set_style('darkgrid')
# request 'retina' figure output from the inline backend
%config InlineBackend.figure_format = 'retina'
# draw matplotlib figures inline in the notebook
%matplotlib inline

In [None]:
# write a function to handle basic data inspection
def inspect_data(df):
    '''Perform basic data inspection tasks and print a readable summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
        The shape and the ``df.info()`` report are written to stdout.
    '''
    print('Shape: {}\n'.format(df.shape))
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the report.
    df.info()
    print()

In [None]:
# read the training set from disk
ames = pd.read_csv("../data/train.csv")

# print the shape and column summary of the raw frame
inspect_data(ames)

# normalise column names: lower-case, with spaces swapped for underscores
ames.columns = ames.columns.map(lambda name: name.lower().replace(' ', '_'))

For categorical columns other than masonry veneer type, it seems clear that the categorical value 'NA' has been read in as NaN. It should be a simple exercise to handle these by replacing the NaNs with the value 'NA'. I am going to assume that the Masonry Veneer Type entries of NaN should be values of 'None', as there is no value for 'Masonry Veneer Area' for those entries. I will replace such entries with 'None' for Masonry Veneer Type and 0.0 for Masonry Veneer Area. This affects 22 entries out of 2051, or about 1% of the data.

In [None]:
# Assuming 'NaN' should be 'None' for Masonry Type.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the ames['mas_vnr_type'] selection: the in-place form on a selection is
# deprecated and does not update the frame under pandas Copy-on-Write.
ames['mas_vnr_type'] = ames['mas_vnr_type'].fillna('None')

In [None]:
# Assuming masonry area is 0.0 for houses with 'NaN' type.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the ames['mas_vnr_area'] selection: the in-place form on a selection is
# deprecated and does not update the frame under pandas Copy-on-Write.
ames['mas_vnr_area'] = ames['mas_vnr_area'].fillna(0.0)

In [None]:
# names of the columns that still hold at least one null value
has_null = ames.isnull().any()
nulls = ames.columns[has_null]

# preview those columns only
ames[nulls].head()

In [None]:
# for categorical variables, the null values should actually be marked 'NA'
# they aren't missing data, they are values of 'NA' for the category in question
# (the filled column is assigned back: fillna(inplace=True) on a column
# selection is deprecated and a no-op under pandas Copy-on-Write)
for col in ames[nulls].select_dtypes(include = 'object').columns:
    ames[col] = ames[col].fillna('NA')

In [None]:
# flag houses with no basement
no_bsmt = ames['bsmt_qual'] == 'NA'

# fill the remaining NaNs in the basement-related columns with 0.0.
# NOTE: the no_bsmt row mask never restricted this fill (it only influenced
# which column names were iterated), so NaNs are replaced in every row of the
# columns whose name contains 'bsmt'; that behaviour is kept here.
# The filled column is assigned back because fillna(inplace=True) on a column
# selection is deprecated and a no-op under pandas Copy-on-Write.
for col in ames.filter(regex = 'bsmt').columns:
    ames[col] = ames[col].fillna(0.0)

In [None]:
# flag houses with no garage
no_garage = ames['garage_type'] == 'NA'

# fill the remaining NaNs in the garage-related columns with 0.0.
# NOTE: the no_garage row mask never restricted this fill (it only influenced
# which column names were iterated), so NaNs are replaced in every row of the
# columns whose name contains 'garage'; that behaviour is kept here.
# The filled column is assigned back because fillna(inplace=True) on a column
# selection is deprecated and a no-op under pandas Copy-on-Write.
for col in ames.filter(regex = 'garage').columns:
    ames[col] = ames[col].fillna(0.0)

NOTE: Potential problem here - setting a value of 0.0 for garage year built. Will mess up results if you're trying to use garage year built as a predictor.

In [None]:
# Take a look at 'garage year' data: distribution of garage_yr_blt, which now
# includes the 0.0 values filled in by the previous cell
sns.distplot(ames.garage_yr_blt);

In [None]:
# count the rows whose garage_yr_blt equals 0
(ames.garage_yr_blt == 0).sum()

In [None]:
# lot frontage: how many values are missing, and how are the rest distributed?
nulls = ames.columns[ames.isnull().any()]
print(ames['lot_frontage'].isnull().sum())

# boolean mask of the rows that do have a lot_frontage value
not_null = ames['lot_frontage'].notnull()
sns.distplot(ames.loc[not_null, 'lot_frontage'])

In [None]:
# summary statistics for lot_frontage
ames['lot_frontage'].describe()

15% of my data has a missing value for Lot Frontage. It is likely that I cannot use this variable as a predictor in my model.

I have a hunch that the basement 'NaN's are for houses with no basement. I'll explore that here.

In [None]:
# Take a look at null values again..
# (recompute the list of columns that still contain at least one null)
nulls = ames.columns[ames.isnull().any()]
ames[nulls].columns

In [None]:
#unf = ames.bsmtfin_type_1 == 'Unf'
#ames[unf].filter(regex = 'bsmt')

In [None]:
# write a function to clean data

def clean_ames_data(df):
    '''Clean a sample of Ames Housing data in place.

    Steps, in order:
      1. Normalise column names (lower-case, spaces replaced by underscores).
      2. Treat missing masonry veneer as absent: type -> 'None', area -> 0.0.
      3. Fill the remaining NaNs in object-dtype columns with the category 'NA'.
      4. Fill the remaining NaNs in columns whose name contains 'bsmt' or
         'garage' with 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw Ames frame. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the masonry veneer type or area column is missing.
    '''

    # convert column names to useable format
    df.columns = [x.lower().replace(' ','_') for x in df.columns]

    # 'id' and 'pid' are deliberately kept (the drop below stays disabled)
    #df.drop(['id','pid'], axis=1, inplace=True)

    # Dealing with NaN values. Handling the special case of Masonry Veneer Type first.
    # Every fill assigns the result back to the frame: fillna(inplace=True) on a
    # df[col] selection is deprecated and a no-op under pandas Copy-on-Write.
    df['mas_vnr_type'] = df['mas_vnr_type'].fillna('None') # Assuming 'NaN' should be 'None' for Masonry Type
    df['mas_vnr_area'] = df['mas_vnr_area'].fillna(0.0) # Assuming masonry area is 0.0 for houses with 'NaN' type

    # for categorical variables, the missing values should actually be marked 'NA'
    nulls = df.columns[df.isnull().any()]
    for col in df[nulls].select_dtypes(include = 'object').columns:
        df[col] = df[col].fillna('NA')

    # Remaining NaNs in basement- and garage-related columns become 0.0.
    # NOTE: the earlier version built 'no basement' / 'no garage' row masks, but
    # they never restricted the fill (only the column names were iterated), so
    # every row of the matching columns is filled; that behaviour is kept.
    for pattern in ('bsmt', 'garage'):
        for col in df.filter(regex = pattern).columns:
            df[col] = df[col].fillna(0.0)


In [None]:
# clean the data (clean_ames_data works on the frame it is given; its return
# value is not used)
clean_ames_data(ames)

In [None]:
# distribution of garage_yr_blt after running the cleaning function
sns.distplot(ames['garage_yr_blt']);

In [None]:
# re-inspect the data (shape and column summary) now that it has been cleaned
inspect_data(ames)

# Note to Reader

Everything below here is a rough draft - a 'messy' version of the rest of the workflow. The other notebooks in this project are cleaner, better representations of my EDA, Feature Selection, and Modeling processes.

In [None]:
# Split data into X and y: the target is 'saleprice', predictors are all other columns
y = ames['saleprice']
X = ames.drop(columns = 'saleprice')

In [None]:
# Train-test split before EDA
# (random_state is fixed so the split is the same on every run)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 

In [None]:
# inspect only the object-dtype columns
inspect_data(ames.select_dtypes(include = 'object'))

In [None]:
# number of distinct values in each column
ames.nunique()

Time for some Exploratory Data Analysis. (This was meant to be done on my training data only; note that the cells below still use the full `ames` frame.)

Tools to use

1. describe  
2. distplots  
3. heatmap  
4. pairplots  

In [None]:
# summary statistics, transposed so that each described column is a row
ames.describe().T

In [None]:
# distribution of the 'pid' column
sns.distplot(ames['pid'])

In [None]:
# preview the leading rows of the cleaned frame
ames.head()

In [None]:
# Pairwise correlations between the numeric columns.
# The numeric/bool columns are selected explicitly so this keeps working on
# pandas versions where DataFrame.corr() no longer silently drops non-numeric
# (object) columns and raises instead.
ames_corr = ames.select_dtypes(include = ['number', 'bool']).corr()
ames_corr

In [None]:
# Create a 7x7 figure for the correlation heatmap:
fig, ax = plt.subplots(figsize = (7,7))

# Generate a mask for the upper triangle (taken from seaborn example gallery).
# The builtin bool is used as the dtype: the np.bool alias was removed from
# NumPy, and the builtin works on every NumPy version.
mask = np.zeros_like(ames_corr, dtype = bool)
mask[np.triu_indices_from(mask)] = True

# Plot the heatmap with seaborn.
# Assign the matplotlib axis the function returns. This will let us resize the labels.
ax = sns.heatmap(ames_corr, mask = mask, ax = ax, cmap = 'coolwarm')

# Resize the labels.
ax.set_xticklabels(ax.xaxis.get_ticklabels(), fontsize = 14)
ax.set_yticklabels(ax.yaxis.get_ticklabels(), fontsize = 14)

# If you put plt.show() at the bottom, it prevents those useless printouts from matplotlib.
plt.show()

In [None]:
# correlation heatmap: lot size, build/sale year and floor areas vs. saleprice
sns.heatmap(ames[['lot_area','year_built','yr_sold','1st_flr_sf','2nd_flr_sf','saleprice']].corr(), cmap = 'coolwarm')

In [None]:
# pairwise plots for the same lot size / year / floor area columns and saleprice
sns.pairplot(ames[['lot_area','year_built','yr_sold','1st_flr_sf','2nd_flr_sf','saleprice']])

In [None]:
# correlation heatmap: bath, bedroom, kitchen and fireplace counts vs. saleprice
sns.heatmap(ames[['full_bath','half_bath','bedroom_abvgr','kitchen_abvgr','fireplaces','saleprice']].corr(), cmap = 'coolwarm')

In [None]:
# pairwise plots for the same room-count columns and saleprice
sns.pairplot(ames[['full_bath','half_bath','bedroom_abvgr','kitchen_abvgr','fireplaces','saleprice']]);

In [None]:
# pairwise plots: overall quality / condition ratings and saleprice
sns.pairplot(ames[['overall_qual','overall_cond','saleprice']]);

In [None]:
# correlation heatmap: overall quality / condition ratings vs. saleprice
sns.heatmap(ames[['overall_qual','overall_cond','saleprice']].corr(), cmap = 'coolwarm')

In [None]:
# column summary restricted to the int64 columns
ames.select_dtypes(include = 'int64').info()

In [None]:
# distribution of the target variable, saleprice
sns.distplot(ames['saleprice']);

In [None]:
# pairwise plots for a handful of size / room-count columns together with saleprice
subset = ames[['lot_area','full_bath','half_bath','bedroom_abvgr','totrms_abvgrd','saleprice']]

sns.pairplot(subset)

Lot area, full baths, bedrooms above ground, and total rooms above ground appear to be correlated with sale price. Half baths not so much.

In [None]:
# pairwise plots for the overall quality / condition ratings and saleprice
subset2 = ames[['overall_qual','overall_cond','saleprice']]
sns.pairplot(subset2)

Unsurprisingly, overall quality and overall condition appear to be strongly correlated with saleprice. They are highly correlated with each other, so I may need to think about potential multicollinearity here.

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
from sklearn.linear_model import RidgeCV, Ridge
from sklearn.preprocessing import StandardScaler

In [None]:
# Create a simple model
# predictors: lot area, bath/bedroom/room counts and the overall quality/condition ratings
features = ['lot_area','full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
            'overall_qual','overall_cond']

X = ames[features]
y = ames['saleprice']

# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 

# the estimator is only instantiated here; it is not fitted in this cell
lr = LinearRegression()

In [None]:
# Create a new simple model
# ('fireplaces' is listed once; it used to appear twice, which put a duplicated,
# perfectly collinear column into X)
features = ['full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
            'overall_qual','overall_cond','fireplaces','year_built','1st_flr_sf','2nd_flr_sf']

X = ames[features]
y = ames['saleprice']

# train test split (random_state fixed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# the estimator is only instantiated here; it is not fitted in this cell
lr = LinearRegression()

In [None]:
# Add Polynomial Features to my model
# ('fireplaces' is listed once; it used to appear twice, which duplicated the
# column and every polynomial term derived from it)
features = ['full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
            'overall_qual','overall_cond','fireplaces','year_built','1st_flr_sf','2nd_flr_sf']

poly = PolynomialFeatures(include_bias = False)

X = ames[features]
y = ames['saleprice']

# expand the predictors into polynomial terms
X_poly = poly.fit_transform(X)

# train test split (random_state fixed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42)

# the estimator is only instantiated here; it is not fitted in this cell
lr = LinearRegression()

In [None]:
# Add Polynomial Features to my model, add more variables, and use Ridge
# ('fireplaces' is listed once; it used to appear twice, which duplicated the
# column and every polynomial term derived from it)
features = ['full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
            'overall_qual','overall_cond','fireplaces','year_built','1st_flr_sf','2nd_flr_sf','garage_cars','garage_area']

poly = PolynomialFeatures(include_bias = False)

X = ames[features]
y = ames['saleprice']

X_poly = poly.fit_transform(X)

# train test split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42)

# scale the predictors; the scaler is fitted on the training split only and
# then applied to both splits
ss = StandardScaler()
ss.fit(X_train)
X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

# search a log-spaced grid of penalties with cross-validation.
# store_cv_values is no longer passed: the stored values were never read in this
# notebook, and the keyword was renamed in recent scikit-learn releases.
r_alphas = np.logspace(0, 5, 200)
ridge_model = RidgeCV(alphas=r_alphas)
ridge_model = ridge_model.fit(X_train_scaled, y_train)

# alpha_ is the single penalty chosen by cross-validation; `alphas` is just the
# candidate grid that was passed in, so it cannot be used as Ridge's alpha
r_optimal_alpha = ridge_model.alpha_

ridge_opt = Ridge(alpha=r_optimal_alpha)
ridge_opt.fit(X_train_scaled, y_train)

In [None]:
# Add more numeric variables - see what happens
# (experiment kept disabled. Every line, including the continuation line of the
# features list, has to stay commented: a bare continuation line makes the cell
# fail with an IndentationError.)
#features = ['lot_area','mas_vnr_area','kitchen_abvgr','garage_cars','garage_area','full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
#            'overall_qual','overall_cond','fireplaces','year_built','yr_sold','fireplaces','1st_flr_sf','2nd_flr_sf']

#poly = PolynomialFeatures(include_bias = False)

#X = ames[features]
#y = ames['saleprice']

#X_poly = poly.fit_transform(X)

# train test split
#X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42) 

#lr = LinearRegression()

In [None]:
# frame dimensions before the dummy variables are added
ames.shape

In [None]:
# one-hot encode neighborhood, building type and house style (drop_first = True)
# NOTE(review): ames is rebound to the encoded frame, so re-running this cell on
# its own output presumably fails because the source columns are gone - confirm
ames = pd.get_dummies(ames, columns = ['neighborhood', 'bldg_type', 'house_style'], drop_first = True)
ames.shape

In [None]:
# list the columns whose name contains 'neighborhood'
ames.filter(regex = 'neighborhood').columns

In [None]:
# Try to add dummy variables to data trying with polynomial
# ('fireplaces' is listed once; it used to appear twice, which duplicated the
# column and every polynomial term derived from it)
features = ['full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
            'overall_qual','overall_cond','fireplaces','year_built','1st_flr_sf','2nd_flr_sf',
            'bldg_type_2fmCon','bldg_type_Duplex', 'bldg_type_Twnhs', 'bldg_type_TwnhsE',
            'house_style_1.5Unf', 'house_style_1Story', 'house_style_2.5Fin',
            'house_style_2.5Unf', 'house_style_2Story', 'house_style_SFoyer',
            'house_style_SLvl']

poly = PolynomialFeatures(include_bias = False)

X = ames[features]
y = ames['saleprice']

# expand the predictors (numeric columns and dummies) into polynomial terms
X_poly = poly.fit_transform(X)

In [None]:
# Try to add dummy variables to data trying with polynomial, plus the garage columns
# ('fireplaces' is listed once; it used to appear twice, which duplicated the
# column and every polynomial term derived from it)
features = ['full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
            'overall_qual','overall_cond','fireplaces','year_built','1st_flr_sf','2nd_flr_sf',
            'bldg_type_2fmCon','bldg_type_Duplex', 'bldg_type_Twnhs', 'bldg_type_TwnhsE',
            'house_style_1.5Unf', 'house_style_1Story', 'house_style_2.5Fin',
            'house_style_2.5Unf', 'house_style_2Story', 'house_style_SFoyer',
            'house_style_SLvl','garage_cars','garage_area']

poly = PolynomialFeatures(include_bias = False)

X = ames[features]
y = ames['saleprice']

# expand the predictors (numeric columns and dummies) into polynomial terms
X_poly = poly.fit_transform(X)

In [None]:
# Another attempt without the dummy variables: numeric predictors only.
# Polynomial terms ARE still generated below (the old "no polynomials" comment
# did not match the code).
# ('fireplaces' is listed once; it used to appear twice, which duplicated the
# column and every polynomial term derived from it)
features = ['full_bath','half_bath','bedroom_abvgr','totrms_abvgrd',
            'overall_qual','overall_cond','fireplaces','year_built','1st_flr_sf','2nd_flr_sf','garage_cars','garage_area']

poly = PolynomialFeatures(include_bias = False)

X = ames[features]
y = ames['saleprice']

X_poly = poly.fit_transform(X)

In [None]:
# train test split of the polynomial features (random_state fixed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42) 

# fresh, unfitted linear regression; it is fitted in the next cell
lr = LinearRegression()

In [None]:
# fit the model on the training split
lr.fit(X_train, y_train)

In [None]:
# score the model on the held-out test split
lr.score(X_test, y_test)

In [None]:
# Load the test data, clean it, then predict salesprice using model developed above
# (this cell only loads the file; cleaning and prediction happen in later cells)
ames_test = pd.read_csv('../data/test.csv')

In [None]:
# shape and column summary of the raw test frame
inspect_data(ames_test)

In [None]:
# apply the same cleaning function that was used on the training data
clean_ames_data(ames_test)

In [None]:
# one-hot encode the same three categorical columns as for the training frame
ames_test = pd.get_dummies(ames_test, columns = ['neighborhood', 'bldg_type', 'house_style'], drop_first = True)

In [None]:
# Keep only the columns that the test frame shares with the training frame.
# DataFrame.align returns the aligned frames as a new tuple and leaves both
# operands untouched, so the aligned test frame has to be assigned back
# (the bare call used before had no effect). axis = 1 aligns the columns only.
ames_test = ames_test.align(ames, join = 'inner', axis = 1)[0]
inspect_data(ames_test)

In [None]:
# select the model's predictors from the test frame
X_ames_test = ames_test[features]
# reuse the polynomial expansion that was fitted on the training predictors:
# new data is transformed only, never used to re-fit a preprocessing step
X_ames_test_poly = poly.transform(X_ames_test)

In [None]:
# predict with the fitted linear regression and store the result on the test frame
ames_test['salesprice_hat'] = lr.predict(X_ames_test_poly)

In [None]:
# preview the leading rows of the test frame, now including the predictions
ames_test.head()

In [None]:
# dimensions of the test frame
ames_test.shape

In [None]:
# Kaggle expects the columns 'Id' and 'SalePrice', so rename ours to match
kaggle_names = {'id': 'Id', 'salesprice_hat': 'SalePrice'}
ames_test = ames_test.rename(columns = kaggle_names)

In [None]:
# Build the submission frame first, then write it out.
# to_csv() returns None when it is given a path, so assigning its result left
# `predictions` set to None instead of the submitted data.
predictions = ames_test[['Id','SalePrice']]
predictions.to_csv('../data/submission_8.csv', index = False)