In [1]:
import pandas as pd
import numpy as np
import pickle 

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import  accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [2]:
# Full dataset
# Each row records whether a past customer's loan application was approved or not.
full_set = pd.read_csv('./data/full_dataset.csv')
full_set.head(5)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [3]:
# List each column's dtype to see which features are numeric and which are stored as object.
full_set.dtypes

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object

In [4]:
# Print the (rows, columns) shape of the loaded frame.
print(full_set.shape)

(614, 13)


In [5]:
# Loan_ID is a row identifier, not a feature, so remove it before modelling.
full_set = full_set.drop(columns=['Loan_ID'])
full_set.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [6]:
# The target variable Loan_Status is categorical ('N'/'Y'); encode it numerically as 0/1.
# NOTE: Series.map sends values that are not keys of the dict to NaN, so re-running
# this cell on the already-encoded column would blank out the target.

full_set['Loan_Status'] = full_set['Loan_Status'].map({'N':0, 'Y':1})

In [7]:
# Preview the frame to confirm Loan_Status now holds 0/1.
full_set.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


In [8]:
# Count missing values per column to see which features will need imputation.
full_set.isnull().sum()

Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

- Kayıp gözlem imputasyonunu tam veri kümesine uygulamak veri sızıntısına neden olacaktır.
- Öncelikle veri kümesini %80 -> eğitim, %20 -> test ayıralım
- Eğitim veri kümesindeki parametreleri kullanarak test veri kümesi üzerinde veri imputasyonu gerçekleştirebiliriz.

In [9]:
# Separate the feature matrix from the target column.
x = full_set.drop(columns=['Loan_Status'])
y = full_set['Loan_Status']

In [10]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Report the shapes of the two splits (the printed labels are Turkish for "training set" / "test set").
print(f"Eğitim kümesi: {X_train.shape} \nTest kümesi: {X_test.shape}")

Eğitim kümesi: (491, 11) 
Test kümesi: (123, 11)


- Veri kümesi çok büyük olmadığı için model doğrulaması çapraz doğrulama (cross validation) ile yapılacaktır.
- Çapraz doğrulama sırasında, kullanacağımız ML modelinin hiperparametreleri GridSearch ile ayarlanacaktır (fine tuning).

In [12]:
# Names of the numeric feature columns (the target is excluded explicitly).
num_features = (
    full_set
    .drop(columns=['Loan_Status'])
    .select_dtypes(include='number')
    .columns
)

In [13]:
# Columns with a numeric dtype
num_features

Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')

In [14]:
# Names of the categorical (object-dtype) feature columns.
# The target is dropped explicitly, mirroring how num_features is built, so the
# list never contains Loan_Status even if the target has not been converted to
# a numeric dtype when this cell runs.
cat_features = (
    full_set
    .drop(columns=['Loan_Status'])
    .select_dtypes(include='object')
    .columns
)

In [15]:
# Columns with a categorical (object) dtype
cat_features

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Property_Area'],
      dtype='object')

- Nümerik ve kategorik değişkenlere ayrı ayrı veri önişleme uygulanacaktır
- Nümerik değişkenler min-max scaling yöntemi ile ölçeklenecek ve varsa kayıp gözlemler ortalama ile impute edilecektir.
- Kategorik değişkenlere one-hot-encoding uygulanarak bu değişkenler nümerikleştirilecek ve kayıp gözlemler varsa sabit bir değer ile impute edilecektir.
- İki değişken türü için gerçekleştirilecek işlemler, iki farklı iletim hattı(pipeline) ile gerçekleştirilecektir.

In [16]:
# Numeric branch: fill missing values with the column mean, then apply MinMaxScaler.
num_transformer = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
        ('MinMaxScaler', MinMaxScaler()),
    ]
)

In [17]:
# Display the numeric preprocessing pipeline.
num_transformer

Pipeline(steps=[('Imputer', SimpleImputer()), ('MinMaxScaler', MinMaxScaler())])

In [18]:
# Categorical branch: replace missing values with the constant 'missing', then
# one-hot encode; categories unseen during fit are ignored at transform time.
cat_transformer = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('OneHotEncoder', OneHotEncoder(categories='auto', drop=None, handle_unknown='ignore')),
    ]
)

In [19]:
# Display the categorical preprocessing pipeline.
cat_transformer

Pipeline(steps=[('Imputer',
                 SimpleImputer(fill_value='missing', strategy='constant')),
                ('OneHotEncoder', OneHotEncoder(handle_unknown='ignore'))])

### Sütun Dönüşümleri

In [20]:
# Route each column group through its own pipeline; columns that belong to
# neither group are discarded (remainder='drop').
preprocessor = ColumnTransformer(transformers=[('num', num_transformer, num_features), # apply the num_transformer pipeline to the numeric columns
                                               ('cat', cat_transformer, cat_features)], # apply the cat_transformer pipeline to the categorical columns
                                 remainder='drop',
                                 n_jobs=-1,
                                 verbose=False)

In [21]:
# Display the assembled ColumnTransformer.
preprocessor

ColumnTransformer(n_jobs=-1,
                  transformers=[('num',
                                 Pipeline(steps=[('Imputer', SimpleImputer()),
                                                 ('MinMaxScaler',
                                                  MinMaxScaler())]),
                                 Index(['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('Imputer',
                                                  SimpleImputer(fill_value='missing',
                                                                strategy='constant')),
                                                 ('OneHotEncoder',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'Proper

In [22]:
# End-to-end model: preprocessing followed by a class-weighted random forest.
# random_state is fixed (same seed as the train/test split) so that the grid
# search, the selected hyperparameters and the reported metrics are reproducible
# when the notebook is re-run from a fresh kernel.
pipe = Pipeline(steps=[('preprocess', preprocessor),
                       ('RF_model', RandomForestClassifier(class_weight='balanced',
                                                           n_jobs=-1,
                                                           random_state=42))],
                verbose=False)

- Izgara arama gerçekleştirilirken kullanılacak parametreler için olası değerleri içeren sözlük

In [23]:
# Candidate hyperparameters for the 'RF_model' pipeline step
# (3 x 5 x 6 = 90 combinations).
parameters_grid = [
    {
        'RF_model__n_estimators': [10, 20, 50],
        'RF_model__max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75],
        'RF_model__max_depth': [2, 4, 5, 6, 7, 8],
    }
]

In [24]:
# Exhaustive search over parameters_grid, scored by accuracy with 10-fold CV.
search = GridSearchCV(
    estimator=pipe,
    param_grid=parameters_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,
    return_train_score=False,
)

In [25]:
# Run the grid search on the training split only; the test split stays untouched.
# fit() returns the fitted GridSearchCV object itself, so `best_model` is the
# search object (it exposes best_params_ and predict), not a bare estimator.
best_model = search.fit(X_train, y_train)

Fitting 10 folds for each of 90 candidates, totalling 900 fits


In [26]:
# Hyperparameter combination selected by the search (scored on cross-validated accuracy).
best_model.best_params_

{'RF_model__max_depth': 2,
 'RF_model__max_features': 0.25,
 'RF_model__n_estimators': 50}

In [27]:
def _classification_scores(y_true, y_pred):
    """Return the metrics reported for one data split as a dict keyed by metric name."""
    return {"Accuracy": accuracy_score(y_true, y_pred),
            "Precision": precision_score(y_true, y_pred),
            "Recall": recall_score(y_true, y_pred),
            "F1 Score": f1_score(y_true, y_pred),
            "AUC": roc_auc_score(y_true, y_pred)}


def TrainTestScores(y_train, y_train_pred, y_test, y_test_pred):
    """Compute the same classification metrics for the training and test splits.

    Parameters
    ----------
    y_train, y_test : array-like
        True labels of the training / test split.
    y_train_pred, y_test_pred : array-like
        Predictions for the corresponding split.

    Returns
    -------
    dict
        ``{"train_set": {...}, "test_set": {...}}`` where both inner dicts use
        the identical keys "Accuracy", "Precision", "Recall", "F1 Score" and
        "AUC", so the two splits can be compared key by key.

    Notes
    -----
    The AUC entry is computed from the same predictions as the other metrics.
    NOTE(review): if hard class labels are passed (as from ``predict``), this is
    not the usual probability-based ROC AUC - consider passing scores from
    ``predict_proba`` if a ranking-based AUC is wanted.
    """
    return {"train_set": _classification_scores(y_train, y_train_pred),
            "test_set": _classification_scores(y_test, y_test_pred)}

In [28]:
# Predict class labels for both splits with the fitted search object.
ytrain_pred = best_model.predict(X_train)
ytest_pred = best_model.predict(X_test)

In [29]:
# Show training and test metrics together so the gap between them is visible.
TrainTestScores(y_train, ytrain_pred, y_test, ytest_pred)

{'train_set': {'Accuracy': 0.824847250509165,
  'Precision': 0.8121951219512196,
  'Recall': 0.9736842105263158,
  'F1 Score': 0.8856382978723405,
  'AUC': 0.7284528435181915},
 'test_set': {'Accuracy': 0.7804878048780488,
  'Precision': 0.7623762376237624,
  'Recall': 0.9625,
  'F1 Score': 0.8508287292817679,
  'AUC:': 0.7021802325581395}}

### Modeli Kaydedelim

In [30]:
# Persist the fitted search object (preprocessing + model) to disk.
# The context manager closes the file even if pickle.dump raises.
with open("classifier.pkl", mode="wb") as pickle_out:
    pickle.dump(best_model, pickle_out)

### Modeli Geri Yüklemek

In [31]:
# Reload the saved object; the context manager makes sure the file handle is
# closed after loading.
# SECURITY: pickle.load can execute arbitrary code - only load files you created
# yourself or obtained from a trusted source.
with open("classifier.pkl", mode='rb') as pickle_in:
    classifier = pickle.load(pickle_in)