## Loading packages and dataset

In [None]:
using DataFrames
using ScikitLearn: fit!, predict, @sk_import, fit_transform!, transform

@sk_import preprocessing: LabelEncoder
@sk_import model_selection: cross_val_score 
@sk_import metrics: accuracy_score
@sk_import linear_model: LogisticRegression
@sk_import ensemble: RandomForestClassifier
@sk_import tree: DecisionTreeClassifier

In [3]:
# Read the training table, the test table and the sample submission (relative paths).
train, test, samplesub = map(readtable, ["train.csv", "test.csv", "sample_sub.csv"]);

## Imputing Missing Values in the Dataset

In [4]:
# Summarise every column of the training table: name, element type and missing count.
showcols(train)

614×13 DataFrames.DataFrame

│ Col # │ Name              │ Eltype  │ Missing │

├───────┼───────────────────┼─────────┼─────────┤

│ 1     │ Loan_ID           │ String  │ 0       │

│ 2     │ Gender            │ String  │ 13      │

│ 3     │ Married           │ String  │ 3       │

│ 4     │ Dependents        │ String  │ 15      │

│ 5     │ Education         │ String  │ 0       │

│ 6     │ Self_Employed     │ String  │ 32      │

│ 7     │ ApplicantIncome   │ Int64   │ 0       │

│ 8     │ CoapplicantIncome │ Float64 │ 0       │

│ 9     │ LoanAmount        │ Int64   │ 22      │

│ 10    │ Loan_Amount_Term  │ Int64   │ 14      │

│ 11    │ Credit_History    │ Int64   │ 50      │

│ 12    │ Property_Area     │ String  │ 0       │

│ 13    │ Loan_Status       │ String  │ 0       │

In [5]:
# LoanAmount: fill missing entries with the floored mean of the observed values.
train[isna.(train[:LoanAmount]), :LoanAmount] = floor(mean(dropna(train[:LoanAmount])))
# The mean is recomputed after the fill above, then written over any zero amounts.
train[train[:LoanAmount] .== 0, :LoanAmount] = floor(mean(dropna(train[:LoanAmount])))

# Every other column with missing entries is filled with its most frequent observed value.
for col in (:Gender, :Married, :Dependents, :Self_Employed,
            :Loan_Amount_Term, :Credit_History)
    train[isna.(train[col]), col] = mode(dropna(train[col]))
end

In [6]:
# Summarise every column of the test table: name, element type and missing count.
showcols(test)

367×12 DataFrames.DataFrame

│ Col # │ Name              │ Eltype │ Missing │

├───────┼───────────────────┼────────┼─────────┤

│ 1     │ Loan_ID           │ String │ 0       │

│ 2     │ Gender            │ String │ 11      │

│ 3     │ Married           │ String │ 0       │

│ 4     │ Dependents        │ String │ 10      │

│ 5     │ Education         │ String │ 0       │

│ 6     │ Self_Employed     │ String │ 23      │

│ 7     │ ApplicantIncome   │ Int64  │ 0       │

│ 8     │ CoapplicantIncome │ Int64  │ 0       │

│ 9     │ LoanAmount        │ Int64  │ 5       │

│ 10    │ Loan_Amount_Term  │ Int64  │ 6       │

│ 11    │ Credit_History    │ Int64  │ 29      │

│ 12    │ Property_Area     │ String │ 0       │

In [7]:
# LoanAmount: fill missing entries with the floored mean of the observed values.
test[isna.(test[:LoanAmount]), :LoanAmount] = floor(mean(dropna(test[:LoanAmount])))
# The mean is recomputed after the fill above, then written over any zero amounts.
test[test[:LoanAmount] .== 0, :LoanAmount] = floor(mean(dropna(test[:LoanAmount])))

# Every other column with missing entries is filled with its most frequent observed value.
for col in (:Gender, :Dependents, :Self_Employed, :Loan_Amount_Term, :Credit_History)
    test[isna.(test[col]), col] = mode(dropna(test[col]))
end

## Label Encoding categorical data

In [11]:
# Encode the string-valued columns as integer codes.
# The encoder is fitted on the training column only and then reused for the matching
# test column, so a given category always maps to the same code in both tables.
# Refitting on the test column would silently shift the codes whenever the test set
# is missing (or adds) a category; with `transform`, an unseen category fails loudly.
labelencoder = LabelEncoder()
categories = [2 3 4 5 6 12 13]
target_col = 13  # column 13 (Loan_Status) exists only in the training table

for col in categories
    train[col] = fit_transform!(labelencoder, train[col])
    if col != target_col
        test[col] = transform(labelencoder, test[col])
    end
end

## Building Model

In [62]:
"""
    classification_model(model, predictors)

Fit `model` on the `predictors` columns of the global `train` table, print the
training-set accuracy and the mean 5-fold cross-validation score, and return the
fitted model's predictions for the same columns of the global `test` table.

# Arguments
- `model`: an estimator accepted by `fit!`, `predict` and `cross_val_score`.
- `predictors`: column selector used to index both `train` and `test`.

# Returns
The result of `predict` on the test-set feature matrix.
"""
function classification_model(model, predictors)
    y = convert(Array, train[:Loan_Status])
    X = convert(Array, train[predictors])
    X2 = convert(Array, test[predictors])

    # Fit once on the full training set. cross_val_score below works on clones of
    # the estimator, so `model` is still this fitted instance when predicting on X2.
    fit!(model, X, y)

    # Accuracy on the data the model was trained on (optimistic by construction).
    predictions = predict(model, X)
    accuracy = accuracy_score(y, predictions)  # argument order is (y_true, y_pred)
    println("\naccuracy: ", accuracy)

    # Mean score over 5 cross-validation folds.
    cross_score = cross_val_score(model, X, y, cv=5)
    println("cross_validation_score: ", mean(cross_score))

    return predict(model, X2)
end

classification_model (generic function with 2 methods)

In [78]:
# Feature columns handed to classification_model.
predictors = [
    :ApplicantIncome,
    :CoapplicantIncome,
    :LoanAmount,
    :Credit_History,
    :Loan_Amount_Term,
    :Gender,
    :Dependents
]

7-element Array{Symbol,1}:
 :ApplicantIncome  
 :CoapplicantIncome
 :LoanAmount       
 :Credit_History   
 :Loan_Amount_Term 
 :Gender           
 :Dependents       

### Logistic Regression

In [103]:
# Logistic regression built with no constructor arguments; lrpred keeps the value
# returned by classification_model (its predictions for the test table).
lrmodel = LogisticRegression()
lrpred = classification_model(lrmodel, predictors);



accuracy: 0.8127035830618893

cross_validation_score: 0.8095716552088842


### Decision Tree Classifier

In [104]:
# Decision tree built with no constructor arguments; dtcpred keeps the value
# returned by classification_model (its predictions for the test table).
# NOTE(review): the recorded output shows training accuracy 1.0 against a
# cross-validation score near 0.72, which looks like overfitting — consider limiting depth.
dtcmodel = DecisionTreeClassifier()
dtcpred = classification_model(dtcmodel, predictors);



accuracy: 1.0

cross_validation_score: 0.718455843469064


### Random Forest Classifier

In [105]:
# Random forest with 25 shallow trees. random_state pins the seed so the printed
# scores and rfpred are reproducible from one run to the next.
rfmodel = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=7,
                                 max_features=1, n_jobs=-1, random_state=42)
rfpred = classification_model(rfmodel, predictors);



accuracy: 0.8159609120521173

cross_validation_score: 0.8030671602326811


## Create Submission File

In [93]:
# Turn the encoded logistic-regression predictions into labels: 1 becomes "Y", anything else "N".
pred = [code == 1 ? "Y" : "N" for code in lrpred]
# Pair each test Loan_ID with its predicted label and write the submission file.
outdf = DataFrame(Loan_ID=test[:Loan_ID], Loan_Status=pred)
writetable("sub.csv", outdf)