In [1]:
# importing dependencies
import pandas as pd

In [2]:
# read csv
train_df = pd.read_csv("static/data/train_cleaned.csv")
test_df = pd.read_csv("static/data/test_cleaned.csv")

In [3]:
train_df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,Male,Yes,1,Graduate,No,54996,18096,128000,360,1,Rural,N
1,LP001005,Male,Yes,0,Graduate,Yes,36000,0,66000,360,1,Urban,Y
2,LP001006,Male,Yes,0,Not Graduate,No,30996,28296,120000,360,1,Urban,Y
3,LP001008,Male,No,0,Graduate,No,72000,0,141000,360,1,Urban,Y
4,LP001011,Male,Yes,2,Graduate,Yes,65004,50352,267000,360,1,Urban,Y


In [4]:
# setting up the variables correctly
train_df["Gender"].replace({"Male": "0", "Female": "1"}, inplace=True)
test_df["Gender"].replace({"Male": "0", "Female": "1"}, inplace=True)

train_df["Married"].replace({"Yes": "1", "No": "0"}, inplace=True)
test_df["Married"].replace({"Yes": "1", "No": "0"}, inplace=True)

train_df["Dependents"].replace({"3+": "3"}, inplace=True)
test_df["Dependents"].replace({"3+": "3"}, inplace=True)

train_df["Education"].replace({"Graduate": "1", "Not Graduate": "0"}, inplace=True)
test_df["Education"].replace({"Graduate": "1", "Not Graduate": "0"}, inplace=True)

train_df["Self_Employed"].replace({"Yes": "1", "No": "0"}, inplace=True)
test_df["Self_Employed"].replace({"Yes": "1", "No": "0"}, inplace=True)

train_df["Property_Area"].replace({"Urban": "2", "Semiurban": "1", "Rural": "0"}, inplace=True)
test_df["Property_Area"].replace({"Urban": "2", "Semiurban": "1", "Rural": "0"}, inplace=True)

In [5]:
#min max scaler

In [6]:
# creating list of features
col_list = list(train_df.columns)[1:-1]

In [7]:
# creating X and y from dataframe
X = train_df[col_list]
y = train_df['Loan_Status']

# Use train_test_split to create training and testing data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Scale your data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train, X_test)
X_test_scaled = scaler.fit_transform(X_test, X_test)

In [9]:
from sklearn.ensemble import RandomForestClassifier
model1 = RandomForestClassifier(max_features=1.0, min_samples_split=6, n_estimators=150)
model1 = model1.fit(X_train_scaled, y_train)
model1.score(X_test_scaled, y_test)

0.7416666666666667

In [10]:
y_predict = model1.predict(X_test_scaled)

In [11]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_predict)

array([[21, 22],
       [ 9, 68]])

In [12]:
print(f"Training Data Score: {model1.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {model1.score(X_test_scaled, y_test)}")

Training Data Score: 0.9666666666666667
Testing Data Score: 0.7416666666666667


In [13]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [50, 150, 250],
              'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
              'min_samples_split': [2, 4, 6]}
grid = GridSearchCV(model1, param_grid, verbose=3)

In [14]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] max_features=sqrt, min_samples_split=2, n_estimators=50 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.833, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=50 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.778, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=50 .........


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.2s remaining:    0.0s


[CV]  max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.847, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=50 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.806, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=50 .........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=50, score=0.778, total=   0.1s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=150 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=150, score=0.861, total=   0.2s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=150 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=150, score=0.778, total=   0.2s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=150 ........
[CV]  max_features=sqrt, min_samples_split=2, n_estimators=150, score=0.847, total=   0.3s
[CV] max_features=sqrt, min_samples_split=2, n_estimators=150 ........
[CV]  max_features=sqrt, min_sa

[CV]  max_features=0.25, min_samples_split=2, n_estimators=150, score=0.792, total=   0.2s
[CV] max_features=0.25, min_samples_split=2, n_estimators=150 ........
[CV]  max_features=0.25, min_samples_split=2, n_estimators=150, score=0.806, total=   0.2s
[CV] max_features=0.25, min_samples_split=2, n_estimators=250 ........
[CV]  max_features=0.25, min_samples_split=2, n_estimators=250, score=0.847, total=   0.4s
[CV] max_features=0.25, min_samples_split=2, n_estimators=250 ........
[CV]  max_features=0.25, min_samples_split=2, n_estimators=250, score=0.806, total=   0.4s
[CV] max_features=0.25, min_samples_split=2, n_estimators=250 ........
[CV]  max_features=0.25, min_samples_split=2, n_estimators=250, score=0.847, total=   0.4s
[CV] max_features=0.25, min_samples_split=2, n_estimators=250 ........
[CV]  max_features=0.25, min_samples_split=2, n_estimators=250, score=0.778, total=   0.4s
[CV] max_features=0.25, min_samples_split=2, n_estimators=250 ........
[CV]  max_features=0.25, min

[CV]  max_features=0.5, min_samples_split=2, n_estimators=250, score=0.778, total=   0.4s
[CV] max_features=0.5, min_samples_split=4, n_estimators=50 ..........
[CV]  max_features=0.5, min_samples_split=4, n_estimators=50, score=0.861, total=   0.1s
[CV] max_features=0.5, min_samples_split=4, n_estimators=50 ..........
[CV]  max_features=0.5, min_samples_split=4, n_estimators=50, score=0.806, total=   0.1s
[CV] max_features=0.5, min_samples_split=4, n_estimators=50 ..........
[CV]  max_features=0.5, min_samples_split=4, n_estimators=50, score=0.847, total=   0.1s
[CV] max_features=0.5, min_samples_split=4, n_estimators=50 ..........
[CV]  max_features=0.5, min_samples_split=4, n_estimators=50, score=0.778, total=   0.1s
[CV] max_features=0.5, min_samples_split=4, n_estimators=50 ..........
[CV]  max_features=0.5, min_samples_split=4, n_estimators=50, score=0.778, total=   0.1s
[CV] max_features=0.5, min_samples_split=4, n_estimators=150 .........
[CV]  max_features=0.5, min_samples_spl

[CV]  max_features=0.75, min_samples_split=4, n_estimators=150, score=0.833, total=   0.2s
[CV] max_features=0.75, min_samples_split=4, n_estimators=150 ........
[CV]  max_features=0.75, min_samples_split=4, n_estimators=150, score=0.764, total=   0.2s
[CV] max_features=0.75, min_samples_split=4, n_estimators=150 ........
[CV]  max_features=0.75, min_samples_split=4, n_estimators=150, score=0.861, total=   0.2s
[CV] max_features=0.75, min_samples_split=4, n_estimators=150 ........
[CV]  max_features=0.75, min_samples_split=4, n_estimators=150, score=0.778, total=   0.2s
[CV] max_features=0.75, min_samples_split=4, n_estimators=150 ........
[CV]  max_features=0.75, min_samples_split=4, n_estimators=150, score=0.778, total=   0.3s
[CV] max_features=0.75, min_samples_split=4, n_estimators=250 ........
[CV]  max_features=0.75, min_samples_split=4, n_estimators=250, score=0.833, total=   0.4s
[CV] max_features=0.75, min_samples_split=4, n_estimators=250 ........
[CV]  max_features=0.75, min

[CV]  max_features=1.0, min_samples_split=4, n_estimators=250, score=0.806, total=   0.4s
[CV] max_features=1.0, min_samples_split=4, n_estimators=250 .........
[CV]  max_features=1.0, min_samples_split=4, n_estimators=250, score=0.847, total=   0.4s
[CV] max_features=1.0, min_samples_split=4, n_estimators=250 .........
[CV]  max_features=1.0, min_samples_split=4, n_estimators=250, score=0.792, total=   0.4s
[CV] max_features=1.0, min_samples_split=4, n_estimators=250 .........
[CV]  max_features=1.0, min_samples_split=4, n_estimators=250, score=0.806, total=   0.4s
[CV] max_features=1.0, min_samples_split=6, n_estimators=50 ..........
[CV]  max_features=1.0, min_samples_split=6, n_estimators=50, score=0.819, total=   0.1s
[CV] max_features=1.0, min_samples_split=6, n_estimators=50 ..........
[CV]  max_features=1.0, min_samples_split=6, n_estimators=50, score=0.819, total=   0.1s
[CV] max_features=1.0, min_samples_split=6, n_estimators=50 ..........
[CV]  max_features=1.0, min_samples_

[Parallel(n_jobs=1)]: Done 225 out of 225 | elapsed:   53.5s finished


GridSearchCV(estimator=RandomForestClassifier(max_features=1.0,
                                              min_samples_split=6,
                                              n_estimators=150),
             param_grid={'max_features': ['sqrt', 0.25, 0.5, 0.75, 1.0],
                         'min_samples_split': [2, 4, 6],
                         'n_estimators': [50, 150, 250]},
             verbose=3)

In [15]:
print(grid.best_params_)
print(grid.best_score_)

{'max_features': 0.5, 'min_samples_split': 6, 'n_estimators': 150}
0.8305555555555555


In [16]:
train_df.groupby(['Property_Area']).count()

Unnamed: 0_level_0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
Property_Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,139,139,139,139,139,139,139,139,139,139,139,139
1,191,191,191,191,191,191,191,191,191,191,191,191
2,150,150,150,150,150,150,150,150,150,150,150,150
