In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [29]:
# Lookup tables for the ordinal categorical columns:
#   column name -> {category label -> integer code}.
# convert_ordinal applies each table to its column with Series.map.
# Only some tables contain an 'NA' key. replace_nan fills missing string
# values with 'NA', and Series.map yields NaN for labels that are absent
# from the table, so a missing value in a column whose table has no 'NA'
# key (e.g. 'Lot Shape', 'Electrical') comes out of convert_ordinal as NaN.
ordinal_dicts = {'Lot Shape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
                 'Utilities': {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
                 'Land Slope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
                 'Exter Qual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
                 'Exter Cond': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
                 'Bsmt Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
                 'Bsmt Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
                 'Bsmt Exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
                 'BsmtFin Type 1': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
                 'BsmtFin Type 2': {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0},
                 'Heating QC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
                 'Electrical': {'SBrkr': 4, 'FuseA': 3, 'FuseF': 2, 'FuseP': 1, 'Mix': 0},
                 'Kitchen Qual': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0},
                 'Fireplace Qu': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
                 'Functional': {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4, 'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0},
                 'Garage Finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0},
                 'Garage Qual': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
                 'Garage Cond': {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0},
                 'Paved Drive': {'Y': 2, 'P': 1, 'N': 0},
                 'Pool QC': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'NA': 0},
                 'Fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0}
                }

# Columns treated as unordered categories. make_dummies one-hot encodes every
# dataframe column whose name appears in this list (create_pipeline also
# reads it).
# NOTE(review): 'PID' looks like a record identifier; one-hot encoding it
# would create one dummy per distinct value -- confirm it is meant to be here.
nominal_cols = ['PID', 'MS SubClass', 'MS Zoning', 'Land Contour', 'Street',
                 'Alley', 'Lot Config', 'Neighborhood', 'Condition 1',
                 'Condition 2', 'Bldg Type', 'House Style', 'Roof Style',
                 'Roof Matl', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
                 'Foundation', 'Garage Type', 'Misc Feature', 'Heating',
                 'Central Air', 'Sale Type']

# Columns treated as ordered categories. Not referenced by any function in
# the cells shown here (convert_ordinal keys off ordinal_dicts instead).
# 'Overall Qual' and 'Overall Cond' are listed but have no entry in
# ordinal_dicts, so convert_ordinal leaves them untouched.
ordinal_cols = ['Lot Shape', 'Utilities', 'Land Slope', 'Overall Qual',
                 'Overall Cond', 'Exter Qual', 'Exter Cond', 'Bsmt Qual',
                 'Bsmt Cond', 'Bsmt Exposure', 'BsmtFin Type 1',
                 'BsmtFin Type 2', 'Heating QC', 'Electrical', 'Kitchen Qual',
                 'Fireplace Qu', 'Functional', 'Garage Finish', 'Garage Qual',
                 'Garage Cond', 'Paved Drive', 'Pool QC', 'Fence']
    
# Discrete (count / year / month) columns. Not referenced by any function in
# the cells shown here.
discrete_cols = ['Year Built', 'Year Remod/Add', 'Bsmt Full Bath',
                 'Bsmt Half Bath', 'Full Bath', 'Half Bath', 'Bedroom AbvGr',
                 'Kitchen AbvGr', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt',
                 'Garage Cars', 'Mo Sold', 'Yr Sold']

# Continuous columns, including 'SalePrice'. Not referenced by any function
# in the cells shown here.
continuous_cols = ['Lot Frontage', 'Lot Area', 'Mas Vnr Area', 'BsmtFin SF 1',
                   'BsmtFin SF 2', 'Bsmt Unf SF', 'Total Bsmt SF', '1st Flr SF',
                   '2nd Flr SF', 'Low Qual Fin SF', 'Gr Liv Area', 'Garage Area',
                   'Wood Deck SF', 'Open Porch SF', 'Enclosed Porch', '3Ssn Porch',
                   'Screen Porch', 'Pool Area', 'Misc Val', 'SalePrice']

In [None]:
def preprocess(df):
    '''Runs the three preprocessing helpers in order:
    replace_nan (fill missing values), then convert_ordinal
    (map ordinal labels to integer codes), then make_dummies
    (one-hot encode nominal columns).

    The first two helpers assign to columns of the dataframe
    they are given, so the argument itself is modified.
    Returns the dataframe produced by make_dummies.'''
    return make_dummies(convert_ordinal(replace_nan(df)))

In [24]:
def convert_ordinal(df):
    '''Preprocess helper function. Replaces the labels of
    every column that has a lookup table in ordinal_dicts
    with that table's integer codes (via Series.map).
    Columns without a table are left as they are. The
    dataframe is modified in place and also returned.'''
    mapped_names = [name for name in df if name in ordinal_dicts.keys()]
    for name in mapped_names:
        lookup = ordinal_dicts[name]
        df[name] = df[name].map(lookup)
    return df

In [4]:
def make_dummies(df):
    '''Preprocess helper function. Takes data-
    frame with nominal columns represented as
    strings and returns dataframe with nominal
    columns removed and replaced with dummies.

    Only columns whose name is listed in nominal_cols are
    encoded; the first level of each is dropped
    (drop_first=True). If the dataframe has no such column
    it is returned unchanged.'''
    present = [col for col in df.columns if col in nominal_cols]
    if not present:
        return df
    # One get_dummies call instead of one per column: the result is the
    # same (untouched columns first, then each column's dummies in order)
    # without rebuilding the whole frame for every nominal column.
    return pd.get_dummies(data=df, columns=present, drop_first=True)

In [5]:
def replace_nan(df):
    '''Preprocess helper function. Takes data-
    frame with NaN values and returns dataframe
    with NaNs replaced with 0 or "NA" depending 
    on column dtype.

    Numeric columns (any integer or float width, not just
    int64/float64) are filled with 0; every other column is
    filled with the string "NA". Columns are reassigned on
    the dataframe that was passed in, and that same object
    is returned.'''
    for col in df.columns:
        fill_value = 0 if pd.api.types.is_numeric_dtype(df[col]) else 'NA'
        # Assign the filled column back instead of calling
        # df[col].fillna(..., inplace=True): the in-place form operates on a
        # temporary Series, which pandas warns about and which has no effect
        # under copy-on-write.
        df[col] = df[col].fillna(fill_value)
    return df

In [30]:
def display_corr_heatmap(df,target,title_prefix="",save=False):
    '''EDA helper function. Takes dataframe and
    target column to compare correlation values 
    against. Displays heatmap with option to save
    as file and augment figure title.

    df: dataframe; only its numeric and boolean columns
        take part in the correlation.
    target: name of the column every other column is
        correlated against; rows are sorted by this value,
        highest first.
    title_prefix: text prepended to the figure title.
    save: when True, the figure is written to
        "<title with all whitespace removed>.png".
    Raises KeyError if target is missing or not numeric.'''
    # Restrict to numeric/bool columns explicitly: DataFrame.corr() raises
    # on string columns in pandas versions where numeric_only defaults to
    # False.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    if target not in numeric_df.columns:
        raise KeyError(f"target column {target!r} is missing or not numeric")
    corr = numeric_df.corr()
    plt.figure(figsize=(16,9))
    sns.heatmap(corr[[target]].sort_values(by=target, ascending=False),
                vmin=-1,
                vmax=1,
                annot=True,
                cmap='icefire')
    title_name = title_prefix+'Features Correlation with '+target
    plt.title(title_name,size=20)
    if save:
        fname = "".join(title_name.split())
        plt.savefig(fname+".png")

In [33]:
def generate_histograms(df,h,k,save=False,fname="untitled_hist",features=None):
    '''EDA helper function. Takes dataframe and
    plt subplot dimensions to display distributions
    of columns as seaborn distplots. Option to save
    figure as file.

    df: dataframe holding the columns to plot.
    h, k: number of subplot rows and columns.
    save: when True, the figure is written to fname + ".png".
    fname: output file name without extension.
    features: optional list of column names to plot, filled
        row by row. Defaults to all columns of df. (This
        function previously read a notebook-level variable
        named features; pass it explicitly instead.)
    At most h*k columns are drawn; unused panels are hidden.'''
    if features is None:
        features = list(df.columns)
    sns.set_style('darkgrid')
    # squeeze=False keeps axes two-dimensional even when h or k is 1.
    f, axes = plt.subplots(h, k, figsize=(25, 9), sharex=False, squeeze=False)
    # axes.flat walks the grid row by row, i.e. position i*k + j for panel
    # (i, j); the earlier i*h + j indexing repeated/skipped columns whenever
    # h != k.
    for position, ax in enumerate(axes.flat):
        if position < len(features):
            # NOTE(review): sns.distplot is deprecated in recent seaborn
            # releases (histplot is the documented replacement); switch once
            # the minimum supported seaborn version is confirmed.
            sns.distplot(df[features[position]], ax=ax, hist=True, kde=False, bins=100)
        else:
            ax.set_visible(False)
    if save:
        f.savefig(fname+'.png')

In [27]:
def model(df, features, target, print_metrics=True, save=False, fname = "untitled_metrics"):
    '''Function to streamline preprocessing, model 
    selection, modeling, and model evaluation.

    df: source dataframe.
    features: list of column names used as predictors.
    target: name of the column to predict (a single column).
    print_metrics: when True, print the averaged metrics and
        the selected model.
    save: when True, write the feature list, metrics and
        model description to fname + ".txt". The file is
        opened in exclusive-create mode, so an existing file
        of that name raises FileExistsError.
    fname: output file name without extension.

    The model is chosen once by find_optimal_regression on an
    80/20 split (random_state=42) and then refitted and scored
    on 10 further 80/20 splits (random_state 0..9); R2, RMSE
    and mean cross-validation score are averaged over those.
    Returns the selected model (as fitted on the last split)
    and the final feature columns.
    Raises ValueError if preprocessing leaves NaN values.'''

    # Explicit copy: the preprocess helpers assign to columns in place,
    # which on a bare column selection triggers SettingWithCopyWarning.
    X = preprocess(df[features].copy())
    y = df[target]

    # Fail early, naming the offending columns, rather than letting the
    # estimators reject the NaN values further down.
    null_cols = [col for col in X.columns if X[col].isnull().any()]
    if null_cols:
        raise ValueError(f"Preprocessed features contain NaN in columns: {null_cols}")

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = 42)

    mod = find_optimal_regression(X_train, y_train)

    rmse_sum = 0
    r2_sum = 0
    cross_val_sum = 0
    n = 10

    for rand in range(n):
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size = 0.2,
                                                    random_state = rand)

        mod.fit(X_train, y_train)
        r2_sum += mod.score(X_test,y_test)
        y_preds = mod.predict(X_test)
        # sqrt of the MSE instead of mean_squared_error(..., squared=False):
        # the squared keyword has been removed from newer scikit-learn.
        rmse_sum += np.sqrt(mean_squared_error(y_test, y_preds))
        cross_val_sum += cross_val_score(mod, X_train, y_train).mean()

    evals = f"\n\nR2: {round(r2_sum/n,2)}\n\nRMSE: {round(rmse_sum/n,2)}\n\nCrossVal: {round(cross_val_sum/n,2)}\n\n"

    if save:
        # file.write only accepts strings, so the feature list and the
        # estimator are written via str(); the with-block closes the file
        # even if a write fails.
        with open(fname+".txt", "x") as file:
            file.write(str(features))
            file.write(evals)
            file.write(str(mod))

    if print_metrics:
        print(evals)
        print(mod)

    return mod,X.columns

In [40]:
# Alpha range code taken from Analytics Vidhya
# https://www.analyticsvidhya.com/blog/2016/01/ridge-lasso-regression-python-complete-tutorial/
def find_optimal_regression(X_train, y_train):
    
    '''Model selection and tuning function. Uses 
    GridSearch to iterate over 7 potential alpha
    values for both ridge and lasso. Returns best
    scoring model between ridge, lasso, and linear
    regression with no regularization.

    X_train, y_train: training features and target.
    Each candidate is scored by 5-fold cross-validation
    (the estimators' default score). On a tie, ridge is
    preferred over lasso, and lasso over plain linear
    regression. The returned estimator is already fitted
    on the full training data.'''
    
    ridge_grid = {'alpha': [1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
    lasso_grid = {'alpha': [1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
    
    ridge_gridsearch = GridSearchCV(estimator=Ridge(),
                              param_grid=ridge_grid,
                              cv=5,
                              verbose=1)
    
    # The lasso search must wrap Lasso with its own grid; it previously
    # duplicated the ridge search, so lasso was never actually evaluated.
    lasso_gridsearch = GridSearchCV(estimator=Lasso(),
                              param_grid=lasso_grid,
                              cv=5,
                              verbose=1)
    
    lr = LinearRegression()
    
    ridge_gridsearch.fit(X_train, y_train)
    lasso_gridsearch.fit(X_train, y_train)
    lr.fit(X_train, y_train)
    
    lr_score = cross_val_score(lr,X_train,y_train).mean()
    ridge_score = ridge_gridsearch.best_score_
    lasso_score = lasso_gridsearch.best_score_
    
    best_score = max(lr_score, ridge_score, lasso_score)
    
    if best_score==ridge_score:
        return ridge_gridsearch.best_estimator_
    elif best_score==lasso_score:
        return lasso_gridsearch.best_estimator_
    else:
        return lr
    

## All functions below this point were not used in Project 2 ##
They are kept here for my future reference.

In [None]:
def get_pipeline(X_train, y_train):
    '''Picks a regressor with find_optimal_regression and
    places it after a StandardScaler step in a Pipeline.
    The regressor step is named str(type(regressor)).
    The Pipeline itself is not fitted here.
    Returns the pair (pipeline, selected regressor).'''
    chosen = find_optimal_regression(X_train, y_train)
    steps = [('ss', StandardScaler()),
             (str(type(chosen)), chosen)]
    return Pipeline(steps), chosen

In [37]:
def create_pipeline(X, y):
    '''Builds a preprocessing + LinearRegression Pipeline for X.

    Numeric columns (int64/float64) go through a numeric imputer and a
    StandardScaler; all other columns go through a categorical imputer,
    then ordinal mapping (convert_ordinal) and one-hot encoding.
    The parameter y is not used in the body.
    Returns the Pipeline; pipe.fit is not called here.

    NOTE(review): imputer_numeric and imputer_categorical are not defined
    in the cells visible here -- confirm they exist before calling this.'''

    
    imp_num = FunctionTransformer(imputer_numeric)
    ss = StandardScaler()
    
    imp_cat = FunctionTransformer(imputer_categorical)
    ord_enc = FunctionTransformer(convert_ordinal)
    ohe = OneHotEncoder(drop='first',handle_unknown='error')
    
    # Column groups: numeric vs. everything else by dtype; ordinal and
    # nominal by membership in the module-level lookup structures.
    num_cols = [col for col in X.columns if (X[col].dtype == np.int64 or X[col].dtype == np.float64)]
    cat_cols = [col for col in X.columns if (col not in num_cols)]
    ord_cols = [col for col in X.columns if (col in ordinal_dicts.keys())]
    nom_cols = [col for col in X.columns if (col in nominal_cols)]
    
    # No remainder is given here, so columns that are neither in ord_cols
    # nor in nom_cols are dropped by this transformer (ColumnTransformer's
    # default).
    ord_nom_trans = ColumnTransformer([('ord', ord_enc, ord_cols),
                                  ('nom', ohe, nom_cols)])
    #nom_trans = ColumnTransformer([('nom', ohe, nom_cols)])
    
    num_pip = Pipeline([('imp_num', imp_num), ('scale', ss)])
    cat_pip = Pipeline([('imp_cat', imp_cat), ('ord_nom', ord_nom_trans)])
    
    preprocessing = ColumnTransformer([('numerical', num_pip, num_cols), ('categorical', cat_pip, cat_cols)],remainder='passthrough')

    # NOTE(review): the next five lines overwrite the local X with the
    # output of each sub-transformer in turn and print it; the result is
    # not used by the returned pipeline. This looks like leftover debugging,
    # and applying ord_nom_trans to the already-transformed X may fail --
    # confirm whether it should be removed.
    X = cat_pip.fit_transform(X)
    print(X)
    X = ord_nom_trans.fit_transform(X)
    print(type(X))
    X = num_pip.fit_transform(X)
    
    
    #model = find_optimal_regression(X_train, y_train)
    
    # Make pipeline
    pipe = Pipeline([('preprocess', preprocessing),
                    ('linreg', LinearRegression())])
                    #(str(type(model)),model)])
    
    return pipe    

In [38]:
def generate_interactor(df,variables):
    '''Adds interaction columns to a dataframe.

    df: dataframe containing the columns to combine.
    variables: iterable of (a, b) pairs of column names.
    For every pair a new column named "<a> X <b>" holding
    the element-wise product of the two columns is added.
    The dataframe is modified in place and also returned.'''
    for pair in variables:
        left, right = pair
        product_name = left+" X "+right
        df[product_name] = df[left]*df[right]
    return df
    

In [39]:
def evaluate_interactor(df,features,target):
    '''Shows how polynomial/interaction terms correlate with the target.

    df: dataframe containing the feature and target columns.
    features: list of column names to expand.
    target: name of the target column.
    The features are expanded with PolynomialFeatures (default
    degree, no bias column), the target column is attached, and
    the result is passed to display_corr_heatmap with the title
    prefix "Polynomial ". Nothing is returned.'''
    poly = PolynomialFeatures(include_bias = False)
    X = df[features]
    X_poly = poly.fit_transform(X)
    # get_feature_names was removed from newer scikit-learn in favour of
    # get_feature_names_out; fall back to the old name on older releases.
    if hasattr(poly, 'get_feature_names_out'):
        poly_names = poly.get_feature_names_out(features)
    else:
        poly_names = poly.get_feature_names(features)
    # Reuse df's index so that attaching df[target] below aligns row by row
    # even when df does not have a default 0..n-1 index.
    poly_df = pd.DataFrame(X_poly, columns=poly_names, index=df.index)
    poly_df[target] = df[target]
    display_corr_heatmap(poly_df,target,title_prefix="Polynomial ")