In [1]:
import pickle
import random
import json

import pandas as pd
import numpy as np 

import seaborn as sns
from matplotlib import pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'svg'

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import classification_report, f1_score, precision_score, recall_score


#import xgboost as xgb, lightgbm as lgbm, catboost as catb
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score, ShuffleSplit, learning_curve
from sklearn import svm

In [2]:
!pip install flask_ngrok



In [3]:
!pip install catboost



In [4]:
import catboost as catb

In [5]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification reports for two label sets plus test-set diagnostics.

    Args:
        y_train_true: True labels reported under the "TRAIN" heading.
        y_train_pred: Predicted labels reported under the "TRAIN" heading.
        y_test_true: True labels reported under the "TEST" heading.
        y_test_pred: Predicted labels reported under the "TEST" heading.

    Returns:
        None. The reports, the F1-score of the second label set and its
        confusion matrix (a pandas crosstab) are written to stdout.
    """
    train_report = classification_report(y_train_true, y_train_pred)
    print('TRAIN\n\n' + train_report)

    test_report = classification_report(y_test_true, y_test_pred)
    print('TEST\n\n' + test_report)

    test_f1 = f1_score(y_test_true, y_test_pred)
    print(f'F1-score = {test_f1}\n')

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
from flask_ngrok import run_with_ngrok
from flask import Flask, request, jsonify

In [8]:
# Locations of the course-project CSV files on the mounted Google Drive.
# NOTE: TEST_PATH is defined here but is not read by any cell visible in this notebook.
TRAIN_PATH = '/content/drive/MyDrive/Colab Notebooks/data/ML_in_business/Course_work/course_project_train.csv'
TEST_PATH = '/content/drive/MyDrive/Colab Notebooks/data/ML_in_business/Course_work/course_project_test.csv'

In [9]:
df = pd.read_csv(TRAIN_PATH)

In [10]:
df.head()

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,,0.0,11.0,26.3,685960.0,1.0,,1.0,debt consolidation,Short Term,99999999.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,8 years,0.0,11.0,35.0,1182434.0,0.0,,0.0,debt consolidation,Short Term,99999999.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,6 years,0.0,8.0,22.5,147400.0,1.0,,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,8 years,0.0,13.0,13.6,385836.0,1.0,,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [11]:
TARGET_NAME = 'Credit Default'

In [12]:
df.loc[23]

Home Ownership                            Own Home
Annual Income                                  NaN
Years in current job                     10+ years
Tax Liens                                        0
Number of Open Accounts                         17
Years of Credit History                       14.8
Maximum Open Credit                         700040
Number of Credit Problems                        0
Months since last delinquent                    17
Bankruptcies                                     0
Purpose                         debt consolidation
Term                                    Short Term
Current Loan Amount                         174878
Current Credit Balance                      168074
Monthly Debt                                 17132
Credit Score                                   NaN
Credit Default                                   1
Name: 23, dtype: object

In [13]:
def transform(df):
    """Clean and impute the credit data set before it is passed to the model.

    The frame is modified IN PLACE and the very same object is returned.

    Steps, applied in this order:
      1. Credit Score values above 850 are divided by 10.
      2. Missing "Months since last delinquent" becomes 0.
      3. Rows with a missing Credit Score are imputed:
         - job tenure known and fewer than 1 month since last delinquency:
           Annual Income = Monthly Debt * 18, Credit Score = 729.0;
         - every other row whose score is still missing:
           Annual Income = 0, Credit Score = 585.0.
      4. Missing "Years in current job" becomes '< 1 year'.
      5. Current Loan Amount above 1,000,000 is replaced by 100000.
      6. Several categories of "Years in current job", "Purpose" and
         "Home Ownership" are merged.

    Args:
        df: DataFrame containing at least the columns 'Credit Score',
            'Months since last delinquent', 'Years in current job',
            'Annual Income', 'Monthly Debt', 'Current Loan Amount',
            'Purpose' and 'Home Ownership'.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    # --- Numeric features ---------------------------------------------------
    # Scores above 850 are treated as having one extra trailing zero.
    score_is_inflated = df['Credit Score'] > 850
    df.loc[score_is_inflated, 'Credit Score'] = df['Credit Score'] / 10

    # A missing delinquency counter is taken to mean "no delinquency recorded".
    delinquency_missing = df['Months since last delinquent'].isnull()
    df.loc[delinquency_missing, 'Months since last delinquent'] = 0

    # Per the original author's note, Credit Score and Annual Income are missing
    # together in the training data, so both are imputed from the score mask.
    # Case 1: job tenure is known and there is no recorded delinquency.
    no_score_clean_history = (
        df['Credit Score'].isnull()
        & df['Years in current job'].notnull()
        & (df['Months since last delinquent'] < 1)
    )
    df.loc[no_score_clean_history, 'Annual Income'] = df['Monthly Debt'] * 18
    df.loc[no_score_clean_history, 'Credit Score'] = 729.0

    # Case 2: whatever is still missing after case 1 (must be re-evaluated here).
    score_still_missing = df['Credit Score'].isnull()
    df.loc[score_still_missing, ['Annual Income', 'Credit Score']] = [0, 585.0]

    # Unknown job tenure is mapped to the smallest tenure bucket.
    tenure_missing = df['Years in current job'].isnull()
    df.loc[tenure_missing, 'Years in current job'] = '< 1 year'

    # Placeholder-like loan amounts (e.g. 99999999.0) are overridden with 100000;
    # per the original author's analysis this fits the target distribution better.
    loan_is_outlier = df['Current Loan Amount'] > 1000000
    df.loc[loan_is_outlier, 'Current Loan Amount'] = 100000

    # --- Categorical features -----------------------------------------------
    # (column, values to merge, bucket they are merged into); order matters
    # because each mask is computed after the previous assignment.
    category_merges = (
        ('Years in current job',
         ['1 year', '4 years', '3 years'],
         '2 years'),
        ('Years in current job',
         ['6 years', '7 years', '8 years', '9 years'],
         '5 years'),
        ('Purpose',
         ['take a trip', 'buy a car', 'small business', 'business loan', 'wedding',
          'educational expenses', 'buy house', 'medical bills', 'moving',
          'major purchase', 'vacation', 'renewable energy'],
         'other'),
    )
    for column, merged_values, bucket in category_merges:
        df.loc[df[column].isin(merged_values), column] = bucket

    # 'Have Mortgage' is folded into 'Home Mortgage'.
    has_mortgage = df['Home Ownership'] == 'Have Mortgage'
    df.loc[has_mortgage, 'Home Ownership'] = 'Home Mortgage'
    return df

In [14]:
df = transform(df)

In [15]:
df.head(5)

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Tax Liens,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Number of Credit Problems,Months since last delinquent,Bankruptcies,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score,Credit Default
0,Own Home,482087.0,< 1 year,0.0,11.0,26.3,685960.0,1.0,0.0,1.0,debt consolidation,Short Term,100000.0,47386.0,7914.0,749.0,0
1,Own Home,1025487.0,10+ years,0.0,15.0,15.3,1181730.0,0.0,0.0,0.0,debt consolidation,Long Term,264968.0,394972.0,18373.0,737.0,1
2,Home Mortgage,751412.0,5 years,0.0,11.0,35.0,1182434.0,0.0,0.0,0.0,debt consolidation,Short Term,100000.0,308389.0,13651.0,742.0,0
3,Own Home,805068.0,5 years,0.0,8.0,22.5,147400.0,1.0,0.0,1.0,debt consolidation,Short Term,121396.0,95855.0,11338.0,694.0,0
4,Rent,776264.0,5 years,0.0,13.0,13.6,385836.0,1.0,0.0,0.0,debt consolidation,Short Term,125840.0,93309.0,7180.0,719.0,0


In [16]:
# Columns fed to the model, in this order. Compared with the raw CSV header,
# 'Tax Liens', 'Number of Credit Problems' and 'Bankruptcies' are left out.
SEL_FEATURES = [
    'Home Ownership',
    'Annual Income',
    'Years in current job',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Months since last delinquent',
    'Purpose',
    'Term',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
]

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Home Ownership                7500 non-null   object 
 1   Annual Income                 7500 non-null   float64
 2   Years in current job          7500 non-null   object 
 3   Tax Liens                     7500 non-null   float64
 4   Number of Open Accounts       7500 non-null   float64
 5   Years of Credit History       7500 non-null   float64
 6   Maximum Open Credit           7500 non-null   float64
 7   Number of Credit Problems     7500 non-null   float64
 8   Months since last delinquent  7500 non-null   float64
 9   Bankruptcies                  7486 non-null   float64
 10  Purpose                       7500 non-null   object 
 11  Term                          7500 non-null   object 
 12  Current Loan Amount           7500 non-null   float64
 13  Cur

In [18]:
# Feature matrix and target vector taken from the transformed training frame.
Xc = df[SEL_FEATURES]
yc = df[TARGET_NAME]

# Object-dtype columns are the ones later passed to CatBoost as categorical.
CAT_FEATURES = list(Xc.select_dtypes(include='object').columns)

In [19]:
# Fit CatBoost on the whole training frame (no hold-out split is made here).
# class_weights is presumably ordered by class label, i.e. class 1 gets weight 3.0.
model_catb_c = catb.CatBoostClassifier(
    class_weights=[1, 3.0],
    silent=True,
    n_estimators=1500,
    max_depth=3,
    random_state=42,
)
# Pass the categorical columns by keyword so the third argument is self-describing.
model_catb_c.fit(Xc, yc, cat_features=CAT_FEATURES)

# Labels produced by the model's own predict() decision rule.
y_train_pred = model_catb_c.predict(Xc)

# Labels at a custom cut-off on the predicted probability of class 1.
y_train_pred_proba = model_catb_c.predict_proba(Xc)
threshold_prob = 0.53
predict_train = (y_train_pred_proba[:, 1] >= threshold_prob).astype(int)

# NOTE: both label sets are scored against the training target `yc`, so the
# section printed under "TEST" is an in-sample result at the 0.53 cut-off,
# not a hold-out estimate.
get_classification_report(yc, y_train_pred, yc, predict_train)

TRAIN

              precision    recall  f1-score   support

           0       0.91      0.54      0.68      5387
           1       0.42      0.87      0.57      2113

    accuracy                           0.63      7500
   macro avg       0.67      0.70      0.62      7500
weighted avg       0.77      0.63      0.65      7500

TEST

              precision    recall  f1-score   support

           0       0.89      0.60      0.72      5387
           1       0.44      0.81      0.57      2113

    accuracy                           0.66      7500
   macro avg       0.67      0.70      0.64      7500
weighted avg       0.76      0.66      0.68      7500

F1-score = 0.5723860589812332

CONFUSION MATRIX

col_0              0     1
Credit Default            
0               3240  2147
1                405  1708


In [20]:
# Persist the fitted classifier to 'model.pkl' in the current working directory.
with open('model.pkl', 'wb') as file:
    pickle.dump(model_catb_c, file)

In [21]:
# Reload the pickled classifier. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

In [22]:
# Sample request payloads for manual testing of the preprocessing and the model.

# jn: carries a value for every column in SEL_FEATURES.
jn = {'ID':23, "Home Ownership":"Rent","Annual Income":164562.0,"Years in current job":"2 years","Tax Liens":0.0,
      "Number of Open Accounts":9.0,"Years of Credit History":12.5,"Maximum Open Credit":220968.0,
      "Number of Credit Problems":0.0,"Months since last delinquent":70.0,"Purpose":"debt consolidation","Term":"Short Term",
      "Current Loan Amount":162470.0,"Current Credit Balance":105906.0,"Monthly Debt":6813.0,"Credit Score":585.0}

# jn1: has no "Credit Score" key and uses the 'Have Mortgage' category.
# With 70 months since the last delinquency, transform() imputes Credit Score = 585.0
# and overwrites Annual Income with 0 for such a row.
jn1 = {'ID':2,"Home Ownership":"Have Mortgage", "Annual Income":113186.0,"Years in current job":"10+ years","Tax Liens":0.0,
       "Number of Open Accounts":16.0,"Years of Credit History":17.0,"Maximum Open Credit":456302.0,
       "Number of Credit Problems":0.0,"Months since last delinquent":70.0,"Purpose":"debt consolidation",
       "Term":"Short Term","Current Loan Amount":217382.0,"Current Credit Balance":213199.0,"Monthly Debt":27559.0}

# jn2: Credit Score is 7260.0, i.e. above 850, so transform() divides it by 10.
jn2 = {'ID':3,"Home Ownership":"Home Mortgage","Annual Income":1152540.0,"Years in current job":"2 years","Tax Liens":0.0,
       "Number of Open Accounts":10.0,"Years of Credit History":13.7,"Maximum Open Credit":204600.0,
       "Number of Credit Problems":0.0,"Months since last delinquent":0.0,"Purpose":"debt consolidation","Term":"Short Term",
       "Current Loan Amount":200178.0,"Current Credit Balance":146490.0,"Monthly Debt":18729.0,"Credit Score":7260.0}


In [23]:
ddf = pd.DataFrame(columns = SEL_FEATURES)
ddf.loc[jn['ID']] = jn

ddf

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Months since last delinquent,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
23,Rent,164562.0,2 years,9.0,12.5,220968.0,70.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,585.0


In [24]:
ddf = transform(ddf)
ddf

Unnamed: 0,Home Ownership,Annual Income,Years in current job,Number of Open Accounts,Years of Credit History,Maximum Open Credit,Months since last delinquent,Purpose,Term,Current Loan Amount,Current Credit Balance,Monthly Debt,Credit Score
23,Rent,164562.0,2 years,9.0,12.5,220968.0,70.0,debt consolidation,Short Term,162470.0,105906.0,6813.0,585.0


In [25]:
y_pred = loaded_model.predict(ddf)
y_pred_proba = loaded_model.predict_proba(ddf)
y_pred, y_pred_proba

(array([1]), array([[0.31862452, 0.68137548]]))

## Запуск Flask

In [26]:
app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run

@app.route("/a")
def hello():
  """Smoke-test endpoint: write a greeting to stdout and answer with a fixed string."""
  greeting = 'Hello, Anatoly'
  print(greeting)
  return "Hello World!"

if __name__ == '__main__':
  app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://19db-35-230-164-122.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [05/Sep/2021 19:51:36] "[33mGET / HTTP/1.1[0m" 404 -
127.0.0.1 - - [05/Sep/2021 19:51:36] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [05/Sep/2021 19:51:37] "[33mGET / HTTP/1.1[0m" 404 -
127.0.0.1 - - [05/Sep/2021 19:51:42] "[37mGET /a HTTP/1.1[0m" 200 -


Hello, Anatoly


127.0.0.1 - - [05/Sep/2021 19:51:43] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [05/Sep/2021 19:51:44] "[37mGET /a HTTP/1.1[0m" 200 -


Hello, Anatoly


In [49]:
# Обработчики и запуск Flask
app = Flask(__name__)
run_with_ngrok(app)  # Start ngrok when app is run

def transform(df):
    """Clean and impute one or more credit records before scoring.

    NOTE: this re-definition duplicates the `transform` defined in an earlier
    cell of this notebook and shadows it; keep the two in sync or drop one.

    The frame is modified IN PLACE and the very same object is returned.

    Steps, applied in this order:
      1. Credit Score values above 850 are divided by 10.
      2. Missing "Months since last delinquent" becomes 0.
      3. Rows with a missing Credit Score are imputed:
         - job tenure known and fewer than 1 month since last delinquency:
           Annual Income = Monthly Debt * 18, Credit Score = 729.0;
         - every other row whose score is still missing:
           Annual Income = 0, Credit Score = 585.0.
      4. Missing "Years in current job" becomes '< 1 year'.
      5. Current Loan Amount above 1,000,000 is replaced by 100000.
      6. Several categories of "Years in current job", "Purpose" and
         "Home Ownership" are merged.

    Args:
        df: DataFrame containing at least the columns 'Credit Score',
            'Months since last delinquent', 'Years in current job',
            'Annual Income', 'Monthly Debt', 'Current Loan Amount',
            'Purpose' and 'Home Ownership'.

    Returns:
        The same DataFrame object that was passed in, after modification.
    """
    # Scores above 850 are treated as having one extra trailing zero.
    inflated_score = df['Credit Score'] > 850
    df.loc[inflated_score, 'Credit Score'] = df['Credit Score'] / 10

    # A missing delinquency counter is taken to mean "no delinquency recorded".
    no_delinquency_value = df['Months since last delinquent'].isnull()
    df.loc[no_delinquency_value, 'Months since last delinquent'] = 0

    # Per the original author's note, Credit Score and Annual Income are missing
    # together in the training data, so both are imputed from the score mask.
    # Case 1: job tenure is known and there is no recorded delinquency.
    impute_from_debt = (
        df['Credit Score'].isnull()
        & df['Years in current job'].notnull()
        & (df['Months since last delinquent'] < 1)
    )
    df.loc[impute_from_debt, 'Annual Income'] = df['Monthly Debt'] * 18
    df.loc[impute_from_debt, 'Credit Score'] = 729.0

    # Case 2: whatever is still missing after case 1 (must be re-evaluated here).
    remaining_without_score = df['Credit Score'].isnull()
    df.loc[remaining_without_score, ['Annual Income', 'Credit Score']] = [0, 585.0]

    # Unknown job tenure is mapped to the smallest tenure bucket.
    unknown_tenure = df['Years in current job'].isnull()
    df.loc[unknown_tenure, 'Years in current job'] = '< 1 year'

    # Placeholder-like loan amounts (e.g. 99999999.0) are overridden with 100000;
    # per the original author's analysis this fits the target distribution better.
    oversized_loan = df['Current Loan Amount'] > 1000000
    df.loc[oversized_loan, 'Current Loan Amount'] = 100000

    # (column, values to merge, bucket they are merged into); order matters
    # because each mask is computed after the previous assignment.
    merge_rules = (
        ('Years in current job',
         ['1 year', '4 years', '3 years'],
         '2 years'),
        ('Years in current job',
         ['6 years', '7 years', '8 years', '9 years'],
         '5 years'),
        ('Purpose',
         ['take a trip', 'buy a car', 'small business', 'business loan', 'wedding',
          'educational expenses', 'buy house', 'medical bills', 'moving',
          'major purchase', 'vacation', 'renewable energy'],
         'other'),
    )
    for column, merged_values, bucket in merge_rules:
        df.loc[df[column].isin(merged_values), column] = bucket

    # 'Have Mortgage' is folded into 'Home Mortgage'.
    df.loc[df['Home Ownership'] == 'Have Mortgage', 'Home Ownership'] = 'Home Mortgage'
    return df

# Columns fed to the model, in this order.
# NOTE: this repeats the SEL_FEATURES list defined in an earlier cell; the two
# lists must stay identical for the served model to see the training layout.
SEL_FEATURES = [
    'Home Ownership',
    'Annual Income',
    'Years in current job',
    'Number of Open Accounts',
    'Years of Credit History',
    'Maximum Open Credit',
    'Months since last delinquent',
    'Purpose',
    'Term',
    'Current Loan Amount',
    'Current Credit Balance',
    'Monthly Debt',
    'Credit Score',
]

@app.route('/predict', methods=['GET', 'POST'])
def predict3():
    """Score a single loan application sent as a JSON object.

    The request body must be a JSON object with an "ID" key plus feature
    values. Keys are matched by name against SEL_FEATURES when the one-row
    frame is built; the row is then passed through transform() and scored
    with `loaded_model`.

    Returns:
        On success, HTTP 200 with a JSON object holding 'ID',
        'Credit Default' (predicted label as int) and
        'Credit Default_probability_1' (predicted probability of class 1).
        HTTP 400 with {'error': ...} when the body is not a JSON object or
        has no "ID" key.
        HTTP 500 with {'ID': ..., 'error': ...} when preprocessing or
        prediction raises; the traceback is written to the Flask app logger.
    """
    # silent=True yields None instead of raising when the body is not valid JSON.
    json_input = request.get_json(silent=True)
    if not isinstance(json_input, dict) or "ID" not in json_input:
        return jsonify({'error': 'Request body must be a JSON object with an "ID" field'}), 400

    print(json_input)
    ID = json_input["ID"]

    try:
        # One-row frame indexed by the request ID; assigning the dict fills the
        # columns by key name.
        ddf = pd.DataFrame(columns = SEL_FEATURES)
        ddf.loc[ID] = json_input
        ddf = transform(ddf)

        y_pred = loaded_model.predict(ddf)
        y_pred_proba = loaded_model.predict_proba(ddf)
        print(y_pred, y_pred_proba)

        result = {'ID': ID,
                  'Credit Default': int(y_pred[0]),
                  'Credit Default_probability_1': float(y_pred_proba[0,1])}
    except Exception:
        # Keep the server alive, but surface the failure instead of hiding it
        # behind a 200 response.
        app.logger.exception('Prediction failed for ID %r', ID)
        return jsonify({'ID': ID, 'error': 'Prediction failed'}), 500

    return jsonify(result)



if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://56e0-35-230-164-122.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [05/Sep/2021 21:00:38] "[37mPOST /predict HTTP/1.1[0m" 200 -


{'ID': 23, 'Home Ownership': 'Rent', 'Annual Income': 0.0, 'Years in current job': '2 years', 'Tax Liens': 0.0, 'Number of Open Accounts': 9.0, 'Years of Credit History': 12.5, 'Maximum Open Credit': 220968.0, 'Number of Credit Problems': 0.0, 'Months since last delinquent': 70.0, 'Bankruptcies': 0.0, 'Purpose': 'debt consolidation', 'Term': 'Short Term', 'Current Loan Amount': 162470.0, 'Current Credit Balance': 105906.0, 'Monthly Debt': 6813.0, 'Credit Score': 585.0}
[1] [[0.31882937 0.68117063]]


127.0.0.1 - - [05/Sep/2021 21:00:58] "[37mPOST /predict HTTP/1.1[0m" 200 -


{'ID': 23, 'Home Ownership': 'Rent', 'Annual Income': 0.0, 'Years in current job': '2 years', 'Tax Liens': 0.0, 'Number of Open Accounts': 9.0, 'Years of Credit History': 12.5, 'Maximum Open Credit': 220968.0, 'Number of Credit Problems': 0.0, 'Months since last delinquent': 70.0, 'Bankruptcies': 0.0, 'Purpose': 'debt consolidation', 'Term': 'Short Term', 'Current Loan Amount': 162470.0, 'Current Credit Balance': 105906.0, 'Monthly Debt': 6813.0, 'Credit Score': 585.0}
[1] [[0.31882937 0.68117063]]


127.0.0.1 - - [05/Sep/2021 21:01:02] "[37mPOST /predict HTTP/1.1[0m" 200 -


{'ID': 2, 'Home Ownership': 'Home Mortgage', 'Years in current job': '10+ years', 'Tax Liens': 0.0, 'Number of Open Accounts': 16.0, 'Years of Credit History': 17.0, 'Maximum Open Credit': 456302.0, 'Number of Credit Problems': 0.0, 'Months since last delinquent': 70.0, 'Purpose': 'debt consolidation', 'Term': 'Short Term', 'Current Loan Amount': 217382.0, 'Current Credit Balance': 213199.0, 'Monthly Debt': 27559.0}
[1] [[0.44816659 0.55183341]]


127.0.0.1 - - [05/Sep/2021 21:01:08] "[37mPOST /predict HTTP/1.1[0m" 200 -


{'ID': 3, 'Home Ownership': 'Home Mortgage', 'Annual Income': 1152540.0, 'Years in current job': '2 years', 'Tax Liens': 0.0, 'Number of Open Accounts': 10.0, 'Years of Credit History': 13.7, 'Maximum Open Credit': 204600.0, 'Number of Credit Problems': 0.0, 'Months since last delinquent': 0.0, 'Bankruptcies': 0.0, 'Purpose': 'debt consolidation', 'Term': 'Short Term', 'Current Loan Amount': 200178.0, 'Current Credit Balance': 146490.0, 'Monthly Debt': 18729.0, 'Credit Score': 7260.0}
[0] [[0.50262446 0.49737554]]
