# Project 2: Regression Challenge

## Matt Garton - General Assembly Boston (DSI)
### Clean notebook representing my workflow (data cleaning -> EDA -> feature engineering -> model design -> results)

In [None]:
# import necessary modules
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
import sklearn.linear_model as linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# use seaborn's 'darkgrid' theme for the plots in this notebook
sns.set_style('darkgrid')
# IPython magics (only valid inside a Jupyter/IPython session, not plain Python):
# request 'retina' figure format from the inline backend and show plots inline
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [None]:
# write a function to handle basic data inspection
def inspect_data(df):
    '''Perform basic data inspection tasks and print a readable report.

    Prints, in order:
      1. the shape of ``df``,
      2. the summary produced by ``df.info()``,
      3. the number of null values in each column.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to inspect. It is not modified.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    print('Shape: {}\n'.format(df.shape))
    # df.info() writes its own report and returns None, so it must not be
    # wrapped in print() -- doing so emits a stray "None" after the summary.
    df.info()
    print()
    print(df.isnull().sum())

In [None]:
# load training data into the `ames` DataFrame
# (the path is relative to the directory the notebook is run from)
ames = pd.read_csv("../data/train.csv")

# inspect the raw data before any cleaning
inspect_data(ames)

In [None]:
# write a function to clean data

def clean_ames_data(df):
    '''Clean a sample of the Ames Housing data in place.

    Steps, in order:
      1. Normalise column names to lower case with underscores.
      2. Masonry veneer: missing type becomes 'None', missing area 0.0.
      3. Every remaining object (categorical) column with missing values
         gets the literal category 'NA'.
      4. Missing values in numeric basement columns become 0.0.
      5. Houses without a garage (garage_type == 'NA') that have no
         garage_yr_blt get the house's year_built instead.
      6. Remaining missing values in numeric garage columns become 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. Must provide (after renaming) the columns mas_vnr_type,
        mas_vnr_area, garage_type, garage_yr_blt and year_built.

    Returns
    -------
    None
        ``df`` is modified in place.

    Raises
    ------
    KeyError
        If one of the required columns is absent.
    '''

    # convert column names to a usable format
    df.columns = [x.lower().replace(' ', '_') for x in df.columns]

    # drop 'id' and 'pid' columns
    #df.drop(['id','pid'], axis=1, inplace=True)

    # Dealing with NaN values. Handling the special case of Masonry Veneer Type first.
    # Results are assigned back to the column rather than using chained
    # `df[col].fillna(..., inplace=True)`, which may act on a temporary copy
    # and leave `df` unchanged (always so under pandas Copy-on-Write).
    df['mas_vnr_type'] = df['mas_vnr_type'].fillna('None')  # assuming NaN means no masonry veneer
    df['mas_vnr_area'] = df['mas_vnr_area'].fillna(0.0)  # assuming the area is 0.0 when it is missing

    # for categorical variables, the missing values should actually be marked 'NA'
    null_cols = df.columns[df.isnull().any()]
    for col in df[null_cols].select_dtypes(include='object').columns:
        df[col] = df[col].fillna('NA')

    # numeric basement columns: replace NaNs with 0.0. The fill is column-wide;
    # presumably the only rows affected are houses with no basement -- TODO confirm.
    for col in df.filter(regex='bsmt').select_dtypes(include='number').columns:
        df[col] = df[col].fillna(0.0)

    # houses with no garage: the mask must be built before it is used
    no_garage = df['garage_type'] == 'NA'

    # For those houses only, fall back to the year the house was built when
    # garage_yr_blt is missing; all other rows keep their recorded value.
    # NOTE(review): the original comment said "set to 0" while the code used
    # year_built; year_built is kept here -- confirm this is the intended value.
    missing_garage_year = no_garage & df['garage_yr_blt'].isnull()
    df.loc[missing_garage_year, 'garage_yr_blt'] = df.loc[missing_garage_year, 'year_built']

    # use the same procedure as for basements on the numeric garage columns
    for col in df.filter(regex='garage').select_dtypes(include='number').columns:
        df[col] = df[col].fillna(0.0)


In [None]:
# divide your dataset into matrix X and vector y


In [None]:
# perform train-test split
# NOTE(review): X and y are not defined in any cell visible above (the
# "divide your dataset" cell is empty), so this raises NameError unless they
# are defined elsewhere. This cell also sits before the cell that cleans `ames`.
# random_state is fixed so the split is reproducible; no test_size is passed,
# so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42) 

# align train and test dataframes to ensure they have the same columns


In [None]:
# clean the data
# the call is made for its effect on `ames`; nothing is assigned from it
clean_ames_data(ames)

In [None]:
# re-inspect the data
# same report as the first inspection (shape, info, null counts per column),
# so the two outputs can be compared
inspect_data(ames)

In [None]:
# summary statistics, transposed so that each described column is one row
ames.describe().T

In [None]:
# heatmap of pairwise correlations between the float64 columns only
# (columns of any other dtype are excluded by the select_dtypes filter)
sns.heatmap(ames.select_dtypes(include = ['float64']).corr())