### Import Libraries

In [2]:
# Import Libraries
import numpy as np
import pandas as pd
# Model libraries
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier # Use XGBClassifier as we have binary classification problem

# Preprocessing and Pipeline libraries
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder
import pickle

## model accuracy, score and cross validation libraries
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Do not print warning
import warnings
warnings.filterwarnings('ignore')

### Take a different approach - Fun with data wrangling

*Note- Since there hand selected features (10) are available, skipping feature engineering part*

* Combine and shuffle training and test data
* create a 80-20 train and test split
* Run a cross validation on the training set - This will help to avoid overfitting and underfitting
* Build a model using XGBoost for credit risk prediction
* Deploy the model

### Import Data

In [3]:
data_path = r'mle-project-challenge\data\combined_data.csv'  # Check with your folder path coreect them as needed

In [4]:
print("\nLoading data...")
# load data
data = pd.read_csv(data_path, engine='python', header=0)
print("\nThere are ", len(data), "records in dataset")


Loading data...

There are  117911 records in dataset


### Separate X, y and cretae Train Test Split (80:20)

In [5]:
# Separate out X and y
X = data.loc[:, data.columns != 'is_late']
y = data.iloc[:,-1:]

In [6]:
## From Sklearn train_test_split function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [7]:
len(X_train)

94328

### Data Presprocessing (borrow from what has been already done/given)
###### We have mixed types, therefore, it has a nice pipeline to impute missing data and categorical transformers with onehot encoding

In [8]:
# Assign numeric features from dataset and impute with median for missing data.
numeric_features = ['loan_amnt', 
                    'int_rate', 'annual_inc', 'revol_util', 
                    'dti', 'delinq_2yrs'
                   ]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ])
# Assign values from dataset
categorical_features = ['purpose','grade', 'emp_length', 'home_ownership']

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

## Main pre-preprocess variable
preprocess = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
        ]
    )

### Create a Pipeline and Fit XGBoost Model into Data

In [9]:
## create a pipeline
# model = make_pipeline(preprocess, XGBClassifier())
model = make_pipeline(preprocess, RandomForestClassifier(n_estimators = 100, random_state=42))

In [24]:
### Train the model
print("\nTraining model ...")
model.fit(X_train, y_train)


Training model ...


Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                        

### Test Model with Test Data

In [25]:
print("Accuracy:\n%s" % accuracy_score(y_test, model.predict(X_test)))

Accuracy:
0.9777382012466608


In [27]:
print("\nSaving model ...")
file = open('mle-project-challenge/models/credit_risk_model.pkl', 'wb')
pickle.dump(model, file)
file.close()


Saving model ...


### A Function to Create Different Versions of the Model

In [87]:
def create_new_model(number_of_estimators = 10):
    import numpy as np
    import pandas as pd
    # Model libraries
    from sklearn.ensemble import RandomForestClassifier
    from xgboost import XGBClassifier # Use XGBClassifier as we have binary classification problem

    # Preprocessing and Pipeline libraries
    import sklearn
    from sklearn.model_selection import train_test_split
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline, make_pipeline
    from sklearn.preprocessing import OneHotEncoder
    import pickle

    ## model accuracy, score and cross validation libraries
    from sklearn.metrics import accuracy_score
    from sklearn.model_selection import KFold
    from sklearn.model_selection import cross_val_score

    # Do not print warning
    import warnings
    warnings.filterwarnings('ignore')
    
    cur_dir = os.getcwd() + "\\" + "mle-project-challenge"+ "\\"+ "data"
    file_name = 'combined_data.csv'

    for dirs, fol, files in os.walk(cur_dir):
        for file in files:  
            if file == file_name:
                file_path = os.path.join(dirs, file_name)
                print(file_path)
    
    # Load data into the python memory
    data = pd.read_csv(file_path, engine='python', header=0)
    
    # Split data into X, y
    X = data.loc[:, data.columns != 'is_late']
    y = data.iloc[:,-1:]
    
    ## From Sklearn train_test_split function
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
    
    ## Data Pre-Processing
    # Assign numeric features from dataset and impute with median for missing data.
    numeric_features = ['loan_amnt', 
                        'int_rate', 'annual_inc', 'revol_util', 
                        'dti', 'delinq_2yrs'
                       ]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ])
    # Assign values from dataset
    categorical_features = ['purpose','grade', 'emp_length', 'home_ownership']

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

    ## Main pre-preprocess variable
    preprocess = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
            ]
        )
    
    
    ## create a model pipeline with pre-processing
    
    print("deafault number of estimators of the model is : 100")    
    user_estimators = input("Do you want to change the estimators? if yes, enter a an integer number:")
    
    if user_estimators != '':
        user_estimators_int = int(user_estimators)
        model = make_pipeline(preprocess, XGBClassifier(n_estimators=user_estimators_int))
        
    else:
        model = make_pipeline(preprocess, XGBClassifier(n_estimators=number_of_estimators))
        
    
    
    ### Train the model
    print("\nTraining model ...")
    model.fit(X_train, y_train)
    
    # Print model accuracy
    print("Accuracy:\n%s" % accuracy_score(y_test, model.predict(X_test)))
    
    save_path = os.getcwd() + "\\" + "mle-project-challenge"+ "\\"+ "models"
    
    version = int(input("please enter a model version to save the model:"))
    
    if version != '':
        int_version = int(version)
        # Save model
        print("\nSaving model ...")
        file = open(save_path +"//"+'credit_risk_model_{}.pkl'.format(int_version), 'wb')
        pickle.dump(model, file)
        file.close()
        print("Done writing file")
    
    else:
        # Save model
        print("\nSaving model ...")
        file = open(save_path +"//"+'credit_risk_model.pkl', 'wb')
        pickle.dump(model, file)
        file.close()
        print("Done writing file")

In [88]:
create_new_model()

C:\Users\NajaMohamed\Documents\MLE_Challenge_Project\mle-project-challenge\data\combined_data.csv
deafault number of estimators of the model is : 100
Do you want to change the estimators? if yes, enter a an integer number:

Training model ...
Accuracy:
0.9777382012466608
please enter a model version to save the model:1

Saving model ...
Done writing file
