# Matthew Garton
# Regression Project - Data Handling Functions

### Purpose: Define functions to automate basic workflows for handling data, fitting, and running my models

### Note: 
I have used some, but not all, of these functions in my project by copying and pasting them into the relevant notebook. I am sure this is not the most efficient way to do this. To improve upon my work, my goal is to finalize these functions and use them effectively, so that I can run through my data processing, model training, and model implementation as seamlessly as possible. This notebook is a starting point for that framework.

In [1]:
# import necessary modules - will clean this up later; starting with 'kitchen sink approach'
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.metrics as metrics
import sklearn.linear_model as linear_model
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV, Ridge

# use seaborn's 'darkgrid' style for all plots
sns.set_style('darkgrid')
# IPython magics (only valid inside an IPython/Jupyter session):
# set the inline backend's figure format to 'retina' and enable inline matplotlib output
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

In [2]:
# define function to handle basic data inspection
def inspect_data(df):
    '''
    Perform basic data inspection tasks, provide user with readable output.

    Prints the shape, the ``df.info()`` report and the per-column null counts,
    then returns the transposed summary statistics.

    Params
    -------
    df: DataFrame to inspect

    Returns
    -------
    DataFrame: ``df.describe()`` transposed (one row per described column)
    '''
    print('Shape: {}\n'.format(df.shape))

    # df.info() writes its own report and returns None, so wrapping it in
    # print() would emit a stray 'None' line after the report.
    df.info()
    print()

    print('Null Counts: \n', df.isnull().sum(), '\n')

    return df.describe().T

In [3]:
# define function to clean the data
def clean_ames_data(df):
    '''
    Generalized function to clean a sample of Ames Housing Data.

    The frame is modified in place and also returned, so both
    ``clean_ames_data(df)`` and ``df = clean_ames_data(df)`` work.

    Steps
    -------
    1. Column names are lower-cased and spaces replaced by underscores.
    2. Missing masonry veneer type becomes 'None', missing veneer area 0.0.
    3. Remaining missing values in text columns become 'NA'.
    4. For houses whose 'bsmt_qual' / 'garage_type' is 'NA', missing values in
       the numeric basement / garage columns become 0.0. Rows that do have the
       feature keep their NaNs so genuinely unknown measurements are not
       silently recorded as zero.

    Params
    -------
    df: DataFrame with the raw Ames columns; must contain the masonry veneer,
        'Bsmt Qual' and 'Garage Type' columns (a missing one raises KeyError)

    Returns
    -------
    DataFrame: the same object that was passed in
    '''

    # convert column names to useable format
    df.columns = [name.lower().replace(' ', '_') for name in df.columns]

    # 'id' is deliberately kept: deploy_model needs it to build the submission

    # Assign the filled Series back instead of calling fillna(inplace = True) on
    # df[col]: the chained in-place form acts on a temporary object and is not
    # guaranteed to update df (it never does under pandas Copy-on-Write).
    df['mas_vnr_type'] = df['mas_vnr_type'].fillna('None')  # assuming NaN means no masonry veneer
    df['mas_vnr_area'] = df['mas_vnr_area'].fillna(0.0)  # no veneer -> zero area

    # for categorical variables, the missing values should actually be marked 'NA'
    for col in df.columns:
        if _is_text_column(df[col]) and df[col].isnull().any():
            df[col] = df[col].fillna('NA')

    # houses with no basement / no garage: numeric NaNs become 0.0
    _zero_fill_absent_feature(df, flag_col = 'bsmt_qual', pattern = 'bsmt')
    _zero_fill_absent_feature(df, flag_col = 'garage_type', pattern = 'garage')

    return df


def _is_text_column(series):
    '''Return True when the Series holds object- or string-typed values.'''
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def _zero_fill_absent_feature(df, flag_col, pattern):
    '''Set missing numeric `pattern` columns to 0.0 on rows where `flag_col` is 'NA'.'''
    absent = df[flag_col] == 'NA'
    numeric_cols = df.filter(regex = pattern).select_dtypes(include = 'number').columns
    for col in numeric_cols:
        to_fill = absent & df[col].isnull()
        if to_fill.any():
            df.loc[to_fill, col] = 0.0

In [4]:
# define a function to create the dummy variables I need
def create_ames_dummies(df, columns = None):
    '''
    Turn categorical variables into dummies.

    Adds three engineered columns to `df` in place ('StyleDummy', 'PosFeature',
    'Location') and then returns a NEW one-hot encoded DataFrame. The encoded
    columns exist only in the returned frame, so callers must keep the return value.

    Params
    -------
    df: DataFrame containing 'house_style', 'condition_2' and 'neighborhood'
    columns: list of columns to one-hot encode; default None lets
        pd.get_dummies choose the columns itself (see its documentation)

    Returns
    -------
    DataFrame: copy of `df` with `columns` replaced by dummies (first level dropped)
    '''

    # neighborhoods grouped into four location tiers
    location_tiers = {
        'Low': ['MeadowV', 'IDOTRR', 'BrDale', 'OldTown', 'Edwards', 'BrkSide'],
        'LowMed': ['Landmrk', 'Sawyer', 'SWISU', 'NAmes', 'NPkVill', 'Blueste', 'Mitchel'],
        'MedHigh': ['Gilbert', 'Greens', 'SawyerW', 'NWAmes', 'Blmngtn', 'CollgCr',
                    'ClearCr', 'Crawfor'],
        'High': ['Somerst', 'Timber', 'Veenker', 'GrnHill', 'NoRidge', 'NridgHt', 'StoneBr'],
    }
    tier_of = {hood: tier for tier, hoods in location_tiers.items() for hood in hoods}

    # specialized dummy to account for a specific house style
    df['StyleDummy'] = np.where(df['house_style'] == '2.5Fin', 1, 0)

    # specialized dummy for being adjacent to or near a positive feature
    df['PosFeature'] = np.where(df['condition_2'].isin(['PosN', 'PosA']), 1, 0)

    # Create categorical variable for Location. Assign the replaced Series back
    # rather than calling replace(inplace = True) on df['Location']: the chained
    # in-place form is not guaranteed to update df. Neighborhoods missing from
    # the mapping keep their original name.
    df['Location'] = df['neighborhood'].replace(tier_of)

    # one-hot encoding for other categorical variables
    return pd.get_dummies(df, columns = columns, drop_first = True)

In [None]:
# define a function to prepare data for modeling
def prepare_training_data(df, predictors, target, scale_data = False, polynomial = False,
                          dummy_columns = None, random_state = None):
    '''
    Prepares training data for model fitting

    Params
    -------
    df: DataFrame (cleaned in place by clean_ames_data)
    predictors: list of columns to use as predictor variables
    target: column to use as response variable
    scale_data: default False; set to True to standard-scale data
    polynomial: default False; set to True to generate polynomial features
    dummy_columns: default None; columns handed to create_ames_dummies for one-hot encoding
    random_state: default None; forwarded to train_test_split for a reproducible split

    Returns
    -------
    X_train, X_test, y_train, y_test
    '''

    clean_ames_data(df) # clean the data (in place)

    # create_ames_dummies returns a new frame holding the one-hot columns, so the
    # result has to be kept; discarding it would make dummy predictors unavailable
    df = create_ames_dummies(df, dummy_columns)

    # split data into X and y
    X = df[predictors]
    y = df[target]

    # use polynomial features, if appropriate
    if polynomial:
        X = PolynomialFeatures(include_bias = False).fit_transform(X)

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = random_state)

    # standard-scale data, if appropriate
    if scale_data:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        # reuse the training mean/std; re-fitting on the test split would leak
        # test statistics and put the two splits on different scales
        X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test
        

In [None]:
# define a function to prepare testing data for the model
def prepare_testing_data(df, predictors, train_df, scale_data = False, polynomial = False,
                         dummy_columns = None):
    '''
    Prepares testing data for use in prediction model. Ensure testing data has the same
    format as the training data on which the model was fit.

    Params
    -------
    df: test DataFrame (cleaned in place by clean_ames_data)
    predictors: list of columns to use as predictors
    train_df: training data, used to compare testing data to; assumed to be already
        cleaned and dummy-encoded (TODO confirm with callers). It is not modified.
    scale_data: default False; set to True to standard-scale data
    polynomial: default False; set to True to generate polynomial features
    dummy_columns: default None; columns handed to create_ames_dummies for one-hot encoding

    Returns
    -------
    X: matrix of predictor variables for the test data
    '''

    clean_ames_data(df) # clean the data (in place)

    # create_ames_dummies returns a new frame holding the one-hot columns; keep it
    df = create_ames_dummies(df, dummy_columns)

    # align testing data with training data to ensure no missing columns.
    # DataFrame.align returns (aligned caller, aligned other), so calling it on
    # the test frame keeps `df` bound to the test data.
    df, train_df = df.align(train_df, join = 'outer', axis = 1, fill_value = 0.0)

    # create matrix of predictor variables
    X = df[predictors]

    # the training predictors are only needed as the reference for scaling
    X_reference = train_df[predictors] if scale_data else None

    # use polynomial features, if appropriate
    if polynomial:
        poly = PolynomialFeatures(include_bias = False)
        X = poly.fit_transform(X)
        if scale_data:
            X_reference = poly.transform(X_reference)

    # standard-scale data, if appropriate: the scaler learns mean/std from the
    # training predictors, never from the test data itself
    if scale_data:
        X = StandardScaler().fit(X_reference).transform(X)

    return X

In [None]:
# define a generalized function to deploy a model on new data
def deploy_model(data, X, model, submission, output_dir = '../data'):
    '''
    Given prepared data, predictor matrix, and a model, use the model to predict response variable.
    Export the result to a csv file in the appropriate directory.

    Side effects on `data` (kept from the original behaviour): a 'SalePrice' column
    holding the predictions is added or overwritten, and 'id' is renamed to 'Id'.

    Params
    -------
    data: DataFrame with an 'id' (or already renamed 'Id') column, one row per row of X
    X: predictor matrix passed to model.predict
    model: fitted model exposing predict(X)
    submission: label inserted into the file name 'submission_<label>.csv'
    output_dir: default '../data'; directory the csv is written to

    Returns
    -------
    DataFrame: the two-column ('Id', 'SalePrice') frame that was written
    '''
    import os

    # Write the predictions straight into 'SalePrice'. Adding a temporary column
    # and renaming it would create a duplicate 'SalePrice' column whenever one
    # already exists (e.g. on a second call with the same frame).
    data['SalePrice'] = model.predict(X) # predict y

    # Re-format Id column to meet Kaggle requirements (no-op if already renamed)
    data.rename({'id': 'Id'}, axis = 1, inplace = True)

    result = data[['Id', 'SalePrice']] # store the relevant, formatted columns into a df

    # export the result to a csv in the appropriate directory
    out_path = os.path.join(output_dir, 'submission_{}.csv'.format(submission))
    result.to_csv(out_path, index = False)

    return result

In [None]:
# define a function to fit and evaluate the chosen model
def fit_eval_model(X_train, X_test, y_train, y_test, kind = 'linear'):
    '''
    Given training and testing data, as well as a model selection, fit the model
    using the training data, and evaluate the model on both training and testing data.
    The two scores are printed; the fitted model is returned.

    Params
    -------
    X_train, X_test: predictor matrices
    y_train, y_test: response values
    kind: default 'linear'; one of 'linear', 'lasso', 'ridge', 'elastic net'

    Returns
    -------
    the fitted model

    Raises
    -------
    ValueError: if `kind` is not one of the supported names
    '''

    # estimator class names, looked up on the `linear_model` module imported at
    # the top of the file (LassoCV / ElasticNetCV are not imported by name there)
    estimator_names = {
        'linear': 'LinearRegression',
        'lasso': 'LassoCV',
        'ridge': 'RidgeCV',
        'elastic net': 'ElasticNetCV',
    }

    # fail early with a clear message instead of hitting an unbound `model` later
    if kind not in estimator_names:
        raise ValueError('Unknown model kind {!r}; expected one of {}'.format(
            kind, sorted(estimator_names)))

    # Instantiate model
    model = getattr(linear_model, estimator_names[kind])()

    # Fit the model to the training data
    model.fit(X_train, y_train)

    # score on training data
    print('Training Score: {}'.format(model.score(X_train, y_train)))

    # score on test data
    print('Testing Score: {}'.format(model.score(X_test, y_test)))

    return model