In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,roc_auc_score,f1_score,classification_report

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder, FunctionTransformer

import mlflow
import pickle

In [2]:
df=pd.read_csv('data/train.csv')

num_cols=['person_age','person_income','person_emp_length','loan_amnt','loan_int_rate','loan_percent_income','cb_person_cred_hist_length','person_home_ownership','loan_grade','cb_person_default_on_file']
cat_cols=['loan_intent']
logtrans_feats=['person_age','person_income','person_emp_length','loan_amnt','loan_percent_income','cb_person_cred_hist_length']

X=df.drop(columns=['id','loan_status'])
y=df['loan_status']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2)


In [3]:

loangrade_map={'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7}
persondefault_map={'N':0,'Y':1}
homeownership_map={'OTHER':0,'RENT':1,'MORTGAGE':2,'OWN':3}

def custom_ordinalencoding(df):
    df['loan_grade']=df['loan_grade'].replace(loangrade_map)
    df['cb_person_default_on_file']=df['cb_person_default_on_file'].replace(persondefault_map)
    df['person_home_ownership']=df['person_home_ownership'].replace(homeownership_map)

    return df

X_train=custom_ordinalencoding(X_train)

coltrans1=ColumnTransformer(transformers=[
    ('LogTransform',FunctionTransformer(np.log1p,feature_names_out='one-to-one'),logtrans_feats),
],verbose_feature_names_out=False,remainder='passthrough')
# coltrans1.set_output(transform='pandas')

num_pipeline=Pipeline(steps=[
    ('logtrans',coltrans1),
    ('impute',SimpleImputer(strategy='mean')),
    ('scale',StandardScaler())
])

cat_pipeline=Pipeline(steps=[
    ('onehot',OneHotEncoder(handle_unknown='ignore',drop='first')),
    ('impute',SimpleImputer(strategy='most_frequent'))
])

coltrans2=ColumnTransformer(transformers=[
    ('NumericalPipeline',num_pipeline,num_cols),
    ('CategoricalPipeline',cat_pipeline,cat_cols)
],verbose_feature_names_out=False)

X_train=coltrans2.fit_transform(X_train)
X_train=pd.DataFrame(X_train,columns=coltrans2.get_feature_names_out())

X_test=custom_ordinalencoding(X_test)
X_test=coltrans2.transform(X_test)

X_train.head()


Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_percent_income,cb_person_cred_hist_length,loan_int_rate,person_home_ownership,loan_grade,cb_person_default_on_file,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,-0.612037,1.418711,-1.897461,0.071337,-0.999638,-1.289053,0.495079,0.786615,-0.063668,-0.417503,0.0,0.0,1.0,0.0,0.0
1,0.872295,1.317042,0.171148,0.699836,-0.522438,0.330025,-1.044898,0.786615,-1.019836,-0.417503,0.0,0.0,0.0,0.0,1.0
2,-0.612037,-0.218734,-1.897461,0.938787,1.228404,-0.739329,0.399449,-0.882909,-0.063668,-0.417503,1.0,0.0,0.0,0.0,0.0
3,-0.830288,1.317042,0.775243,0.823911,-0.405825,-1.289053,-1.338383,0.786615,-1.019836,-0.417503,0.0,0.0,1.0,0.0,0.0
4,-1.295486,0.399456,0.405485,0.071337,-0.522438,-1.289053,2.27578,0.786615,1.848667,-0.417503,0.0,1.0,0.0,0.0,0.0


In [4]:
from imblearn.over_sampling import SMOTE

sm=SMOTE(random_state=6)

X_train_sm,y_train_sm=sm.fit_resample(X_train,y_train)
y_train_sm.value_counts()

loan_status
0    40248
1    40248
Name: count, dtype: int64

In [5]:

mlflow.set_experiment('LoanApp')

with mlflow.start_run():
    logreg=LogisticRegression().fit(X_train_sm,y_train_sm)

    tr_pred=logreg.predict(X_train)
    te_pred=logreg.predict(X_test)

    clfrep_train=classification_report(y_train,tr_pred)
    clfrep_test=classification_report(y_test,te_pred)
    
    tr_acc=accuracy_score(y_train,tr_pred)
    tr_prec=precision_score(y_train,tr_pred)
    tr_f1=f1_score(y_train,tr_pred)
    tr_aucroc=roc_auc_score(y_train,tr_pred)

    te_acc=accuracy_score(y_test,te_pred)
    te_prec=precision_score(y_test,te_pred)
    te_f1=f1_score(y_test,te_pred)
    te_aucroc=roc_auc_score(y_test,te_pred)

    print('Train report')
    print(clfrep_train)

    print('Test rep')
    print(clfrep_test)

    metric_names=['f1','auc roc']
    tr_values=[tr_f1,tr_aucroc]
    te_values=[te_f1,te_aucroc]

    for inx,metname in enumerate(metric_names):
        mlflow.log_metric(' '.join(['tr',metname]),tr_values[inx])
        mlflow.log_metric(' '.join(['te',metname]),te_values[inx])

    print(te_values)

    mlflow.set_tag('model prop','Base logreg with SMOTE')


# model_pkl_file='outputs/loanapp_logreg.pkl'
# with open(model_pkl_file, 'wb') as file:  
#     pickle.dump(logreg, file)

# coltrans_pkl_file = "outputs/loanapp_coltransformer.pkl"  

# with open(coltrans_pkl_file, 'wb') as file:  
#     pickle.dump(coltrans2, file)

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet



Train report
              precision    recall  f1-score   support

           0       0.96      0.82      0.89     40248
           1       0.43      0.82      0.56      6668

    accuracy                           0.82     46916
   macro avg       0.70      0.82      0.72     46916
weighted avg       0.89      0.82      0.84     46916

Test rep
              precision    recall  f1-score   support

           0       0.97      0.82      0.88     10047
           1       0.43      0.83      0.57      1682

    accuracy                           0.82     11729
   macro avg       0.70      0.82      0.73     11729
weighted avg       0.89      0.82      0.84     11729

[np.float64(0.5656237301909792), np.float64(0.8218253518806437)]


In [15]:
logreg.coef_

array([[ 0.01309878,  0.25910834, -0.0196385 , -1.06607527,  1.80771539,
        -0.01183921, -0.1733223 , -0.61464601,  1.35590709, -0.07014974,
        -1.03307427,  0.10265845, -0.38111192, -0.65760755, -1.41986016]])

In [19]:
logreg.feature_names_in_

array(['person_age', 'person_income', 'person_emp_length', 'loan_amnt',
       'loan_percent_income', 'cb_person_cred_hist_length',
       'loan_int_rate', 'person_home_ownership', 'loan_grade',
       'cb_person_default_on_file', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE'], dtype=object)

In [20]:

def plotparams(featnames,modelcoef):
    sortedinx=np.argsort(modelcoef)
    plt.bar(featnames[sortedinx],modelcoef[sortedinx])
    plt.show()

plotparams(logreg.feature_names_in_,logreg.coef_)

IndexError: index 14 is out of bounds for axis 0 with size 1

In [6]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import RandomizedSearchCV

# # rfparameters={'max_depth':np.arange(5,80,10),'n_estimators':np.arange(10,400,50)}

# # rf=RandomForestClassifier(random_state=3)
# # hypersearch=RandomizedSearchCV(rf,rfparameters,scoring='roc_auc',cv=5,n_iter=30)
# # hypersearch.fit(X_train_sm,y_train_sm)

# # plt.plot(np.arange(len(hypersearch.cv_results_['mean_test_score'])),hypersearch.cv_results_['mean_test_score'])
# # plt.ylabel('Mean ROC AUC score')
  

In [7]:
# hypersearch.best_estimator_

In [8]:


# # model_pkl_file = "outputs/loanapp_rf_sm.pkl"  

# # with open(model_pkl_file, 'wb') as file:  
# #     pickle.dump(hypersearch.best_estimator_, file)

# # save the iris classification model as a pickle file
# coltrans_pkl_file = "outputs/loanapp_coltransformer_rf_sm.pkl"  

# with open(coltrans_pkl_file, 'wb') as file:  
#     pickle.dump(coltrans2, file)

In [9]:
# model_pkl_file = "outputs/loanapp_rf_sm.pkl"  

# rfmodel_sm=pickle.load(open(model_pkl_file,'rb'))

In [10]:
# # hypersearch_bestmodel=hypersearch.best_estimator_
# mlflow.set_experiment('LoanApp')

# with mlflow.start_run():
#     tr_pred=rfmodel_sm.predict(X_train)
#     te_pred=rfmodel_sm.predict(X_test)

#     clfrep_train=classification_report(y_train,tr_pred)
#     clfrep_test=classification_report(y_test,te_pred)
    
#     tr_acc=accuracy_score(y_train,tr_pred)
#     tr_prec=precision_score(y_train,tr_pred)
#     tr_f1=f1_score(y_train,tr_pred)
#     tr_aucroc=roc_auc_score(y_train,tr_pred)

#     te_acc=accuracy_score(y_test,te_pred)
#     te_prec=precision_score(y_test,te_pred)
#     te_f1=f1_score(y_test,te_pred)
#     te_aucroc=roc_auc_score(y_test,te_pred)

#     print('Train report')
#     print(clfrep_train)

#     print('Test rep')
#     print(clfrep_test)

#     metric_names=['f1','auc roc']
#     tr_values=[tr_f1,tr_aucroc]
#     te_values=[te_f1,te_aucroc]

#     for inx,metname in enumerate(metric_names):
#         mlflow.log_metric(' '.join(['tr',metname]),tr_values[inx])
#         mlflow.log_metric(' '.join(['te',metname]),te_values[inx])

#     mlflow.sklearn.log_model(rfmodel_sm,'RF best estimator')

#     mlflow.set_tag('model prop','Random Forest Classifier with SMOTE')

In [11]:
# rfmodel_sm

In [12]:
# rf_alldata=RandomForestClassifier(random_state=3,max_depth=75,n_estimators=360)

# X_all_trans=custom_ordinalencoding(X)
# X_all_trans=coltrans2.transform(X_all_trans)

# rf_alldata.fit(X_all_trans,y)

# mlflow.set_experiment('LoanApp')

# with mlflow.start_run():
#     tr_pred=rf_alldata.predict(X_train)
#     te_pred=rf_alldata.predict(X_test)

#     clfrep_train=classification_report(y_train,tr_pred)
#     clfrep_test=classification_report(y_test,te_pred)
    
#     tr_acc=accuracy_score(y_train,tr_pred)
#     tr_prec=precision_score(y_train,tr_pred)
#     tr_f1=f1_score(y_train,tr_pred)
#     tr_aucroc=roc_auc_score(y_train,tr_pred)

#     te_acc=accuracy_score(y_test,te_pred)
#     te_prec=precision_score(y_test,te_pred)
#     te_f1=f1_score(y_test,te_pred)
#     te_aucroc=roc_auc_score(y_test,te_pred)

#     print('Train report')
#     print(clfrep_train)

#     print('Test rep')
#     print(clfrep_test)

#     metric_names=['f1','auc roc']
#     tr_values=[tr_f1,tr_aucroc]
#     te_values=[te_f1,te_aucroc]

#     for inx,metname in enumerate(metric_names):
#         mlflow.log_metric(' '.join(['tr',metname]),tr_values[inx])
#         mlflow.log_metric(' '.join(['te',metname]),te_values[inx])

#     mlflow.sklearn.log_model(rfmodel_sm,'RF best estimator')

#     mlflow.set_tag('model prop','Random Forest Classifier_SMOTE_fulldata')

In [13]:
# from xgboost import XGBClassifier

# xgb=XGBClassifier().fit(X_train,y_train)

# mlflow.set_experiment('LoanApp')

# with mlflow.start_run():
#     tr_pred=xgb.predict(X_train)
#     te_pred=xgb.predict(X_test)

#     clfrep_train=classification_report(y_train,tr_pred)
#     clfrep_test=classification_report(y_test,te_pred)
    
#     tr_acc=accuracy_score(y_train,tr_pred)
#     tr_prec=precision_score(y_train,tr_pred)
#     tr_f1=f1_score(y_train,tr_pred)
#     tr_aucroc=roc_auc_score(y_train,tr_pred)

#     te_acc=accuracy_score(y_test,te_pred)
#     te_prec=precision_score(y_test,te_pred)
#     te_f1=f1_score(y_test,te_pred)
#     te_aucroc=roc_auc_score(y_test,te_pred)

#     print('Train report')
#     print(clfrep_train)

#     print('Test rep')
#     print(clfrep_test)

#     metric_names=['f1','auc roc']
#     tr_values=[tr_f1,tr_aucroc]
#     te_values=[te_f1,te_aucroc]

#     for inx,metname in enumerate(metric_names):
#         mlflow.log_metric(' '.join(['tr',metname]),tr_values[inx])
#         mlflow.log_metric(' '.join(['te',metname]),te_values[inx])

#     mlflow.sklearn.log_model(xgb,'XGB without SMOTE')

#     mlflow.set_tag('model prop','XGB')

In [14]:
# from sklearn.metrics import PrecisionRecallDisplay
# fig,ax=plt.subplots(nrows=1,ncols=2)
# fig.set_figwidth(15)

# PrecisionRecallDisplay.from_estimator(rfmodel_sm,X_train,y_train,ax=ax[0])
# PrecisionRecallDisplay.from_estimator(xgb,X_train,y_train,ax=ax[0])
# ax[0].set_title('Training set')

# PrecisionRecallDisplay.from_estimator(rfmodel_sm,X_test,y_test,ax=ax[1])
# PrecisionRecallDisplay.from_estimator(xgb,X_test,y_test,ax=ax[1])
# ax[1].set_title('Testing set')


# plt.tight_layout()
# plt.show()
