In [1]:
# AI Singapore
# Regression 2 Exercise
# Exercise: Building a Regression job template

# 1. Import required libraries
import numpy as np 
import pandas as pd 
import datetime as d

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import joblib

Information on Data
https://www.kaggle.com/c/home-data-for-ml-course/data


In [2]:
# Custom Classes and Functions
def display_df_info(df_name, my_df, v=False):
    """Print a short summary of a DataFrame.

    Parameters
    ----------
    df_name : str
        Label printed in the ``Data: ...`` header line.
    my_df : pandas.DataFrame
        Frame to summarise; its shape and ``head()`` are always printed.
    v : bool, default False
        Verbose flag. When true, the output of ``my_df.info()`` is printed
        as well, under a ``Dataframe Info:`` heading.

    Returns
    -------
    None
    """
    print("Data: {}".format(df_name))
    print("Shape (rows, cols) = {}".format(my_df.shape))
    print("First few rows...")
    print(my_df.head())

    # Optional: display additional information with the (v)erbose flag
    if v:
        print("Dataframe Info:")
        # DataFrame.info() writes its report to stdout itself and returns None,
        # so it must not be wrapped in print() (that would emit a stray "None").
        my_df.info()

In [3]:
class GetAge(BaseEstimator, TransformerMixin):
    """Custom transformer: derive an age in whole years from the build year.

    ``transform`` appends an ``'Age'`` column computed as the current calendar
    year minus ``'YearBuilt'``. The ``'YearBuilt'`` column itself is kept, so the
    returned frame has one more column than the input. The transformer is
    stateless: ``fit`` learns nothing.

    Note that the result depends on the year in which ``transform`` is run.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the scikit-learn API)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an added ``'Age'`` column.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain a numeric ``'YearBuilt'`` column.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with ``'Age' = current year - 'YearBuilt'``.
        """
        current_year = d.datetime.now().year

        # Work on a copy: a transformer must not modify the caller's data, and
        # assigning into a slice of another frame triggers SettingWithCopyWarning.
        X = X.copy()
        X['Age'] = current_year - X['YearBuilt']

        return X

In [4]:
# Show the current working directory; relative file paths (such as the CSV
# read in main()) are resolved against it.
import os
os.getcwd()

'C:\\Users\\lorei\\anaconda3\\ai4i\\sup3-regression\\home-data-for-ml-course'

In [6]:
def main(file_path="train_csv_EDA.csv", model_path="my_model_lr.joblib",
         test_size=0.3, random_state=42):
    """Train, evaluate and save a linear-regression model for house sale prices.

    Workflow: read the CSV, split off the ``SalePrice`` target, keep a fixed
    subset of feature columns, build a preprocessing + ``LinearRegression``
    pipeline, fit it on a train split, report RMSE and R2 on the test split,
    print an actual-vs-predicted comparison and dump the fitted pipeline to disk.

    Parameters
    ----------
    file_path : str, default "train_csv_EDA.csv"
        Path of the input CSV; it must contain ``SalePrice`` and the feature
        columns listed in ``feature_names`` below.
    model_path : str, default "my_model_lr.joblib"
        Destination file for the fitted pipeline.
    test_size : float, default 0.3
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed for the train/test split so the experiment is repeatable.

    Returns
    -------
    None
    """

    # DATA INPUT
    ############
    input_data = pd.read_csv(file_path)
    display_df_info("Raw Input", input_data)

    # Separate the outcome variable from the predictors (without mutating the
    # loaded frame in place).
    output_var_name = 'SalePrice'
    output_var = input_data[output_var_name]
    predictors = input_data.drop(columns=output_var_name)

    # DATA ENGINEERING / MODEL DEFINITION
    #####################################

    # Subsetting the columns: define features to keep
    feature_names = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr',
                     'TotRmsAbvGrd', 'HouseStyle']
    features = predictors[feature_names]
    display_df_info('Features before Transform', features, v=True)

    # Create the pipeline ...
    # 1. Pre-processing
    # Each group of columns goes through its own set of transformations:
    # object-dtype columns are treated as categorical, everything else as numerical.
    categorical_features = features.select_dtypes(include="object").columns
    numerical_features = features.select_dtypes(exclude="object").columns

    # Numerical columns: fill missing values with the median, then standardise.
    # NOTE(review): the GetAge transformer defined in this notebook is not part
    # of this pipeline; 'YearBuilt' reaches the model as a (scaled) raw year.
    int_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
        ])

    # Categorical columns: one-hot encode; categories unseen during fit are
    # encoded as all zeros instead of raising at predict time.
    cat_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

    preprocess = ColumnTransformer(
        transformers=[
            ('ints', int_transformer, numerical_features),
            ('cat', cat_transformer, categorical_features)
            ])

    # 2. Combine pre-processing with ML algorithm
    reg = LinearRegression()
    model = Pipeline(steps=[
        ('preprocessor', preprocess),
        ('regression', reg)
        ])

    # TRAINING
    ##########
    # Train/Test split with a fixed random_state so the experiment is repeatable.
    x_train, x_test, y_train, y_test = train_test_split(
        features, output_var, test_size=test_size, random_state=random_state)

    # Train the pipeline
    model.fit(x_train, y_train)

    # SCORING/EVALUATION
    ####################
    # Predict sale prices for the held-out test data
    pred_test = model.predict(x_test)

    # RMSE and coefficient of determination between actual and predicted prices
    rmse = np.sqrt(mean_squared_error(y_test, pred_test))
    r2 = r2_score(y_test, pred_test)

    print("Results on Test Data")
    print("####################")
    print("RMSE: {:.2f}".format(rmse))
    print("R2 Score: {:.5f}".format(r2))

    # Compare actual vs predicted values. y_test is a Series, so the resulting
    # frame keeps the test rows' original index.
    compare = pd.DataFrame({'actual': y_test, 'predicted': pred_test})
    compare['diff'] = compare['predicted'] - compare['actual']
    compare['abs_diff'] = compare['diff'].abs()
    # Percentage error is expressed relative to the actual (true) sale price.
    compare['percentage_diff'] = (compare['diff'] / compare['actual']) * 100

    display_df_info('Actual vs Predicted Comparison', compare)

    # Save the fitted pipeline (preprocessing + regressor)
    with open(model_path, 'wb') as fo:
        joblib.dump(model, fo)


# Run the whole workflow only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()

Data: Raw Input
Shape (rows, cols) = (1460, 72)
First few rows...
   MSSubClass MSZoning  LotFrontage  LotArea Street LotShape LandContour  \
0          60       RL         65.0     8450   Pave      Reg         Lvl   
1          20       RL         80.0     9600   Pave      Reg         Lvl   
2          60       RL         68.0    11250   Pave      IR1         Lvl   
3          70       RL         60.0     9550   Pave      IR1         Lvl   
4          60       RL         84.0    14260   Pave      IR1         Lvl   

  Utilities LotConfig LandSlope  ... EnclosedPorch 3SsnPorch ScreenPorch  \
0    AllPub    Inside       Gtl  ...             0         0           0   
1    AllPub       FR2       Gtl  ...             0         0           0   
2    AllPub    Inside       Gtl  ...             0         0           0   
3    AllPub    Corner       Gtl  ...           272         0           0   
4    AllPub       FR2       Gtl  ...             0         0           0   

  PoolArea MiscVal  