In [138]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline 

PATH = "test.csv"
PATH_2 = "train.csv"

test_data = pd.read_csv(PATH)
train_data = pd.read_csv(PATH_2)

In [139]:
train_data.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [140]:
#determine features to be considered
train_data.columns 

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'],
      dtype='object')

In [141]:
# determine the features to consider 
features = ['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'ApplicantIncome', 'CoapplicantIncome',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area']

train_X = train_data[features]
test_X = test_data[features]

# prediction target
train_y = train_data.LoanAmount
test_y = test_data.LoanAmount


In [142]:
# using pipelines for preprocessing and modelling (numerical & categorical columns)

numerical_cols = [col for col in train_X.columns 
                 if train_X[col].dtype in ['int64', 'float64']]
print(numerical_cols)

categorical_cols = [col for col in train_X.columns
                   if train_X[col].nunique() < 10 and 
                   train_X[col].dtype == 'object']
print(categorical_cols)
NaN_cols = [col for col in train_X.columns
                if train_X[col].isnull().any()]
print(NaN_cols)



# preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

NaN_transformer = SimpleImputer()

#preprocessing for categorical data
categorical_transformer = Pipeline(steps= [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#bundle preprocessing for both numerical and categorical data
preprocessor = ColumnTransformer(
    transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
    ('Nan', NaN_transformer, NaN_cols)
])

#define model and build pipeline 
model = RandomForestRegressor(n_estimators=100, random_state=1)

my_pipeline = Pipeline(steps = [
    ('preprocessor', preprocessor),
    ('model', model)
])

#fit and make predictions
my_pipeline.fit(train_X, train_y)



['ApplicantIncome', 'CoapplicantIncome', 'Loan_Amount_Term', 'Credit_History']
['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
['Gender', 'Married', 'Dependents', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History']


ValueError: Cannot use mean strategy with non-numeric data:
could not convert string to float: 'Male'