## Logistic Regression Model

In [153]:
# Imports

import pandas as pd
import numpy as np
import xgboost as xgb
import joblib

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import log_loss, roc_auc_score, recall_score, precision_score, average_precision_score, f1_score, classification_report, accuracy_score
from scikitplot.metrics import plot_roc_curve, plot_precision_recall_curve, plot_confusion_matrix

In [160]:
# Importing our dataset

df = pd.read_csv("cleaned_data.csv")

df.head()

Unnamed: 0,age_upon_outcome_days,intake_year,time_in_shelter_days,Outcome,Mixed_breed,Purebred,Bird,Cat,Dog,Other,...,Spayed_female,sex_Unknown,Multi_Colour,Solid_Colour,intake_cond_Aged,intake_cond_Feral,intake_cond_Normal,intake_cond_Other,intake_cond_Pregnant_nursing,intake_cond_Unhealthy
0,3650,2017,0.588194,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
1,2555,2014,1.259722,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
2,2190,2014,1.113889,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
3,3650,2014,4.970139,0,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
4,5840,2013,0.119444,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1


In [161]:
# Seperating numerical and categorical data

numeric_cols = ['age_upon_outcome_days', 'intake_year', 'time_in_shelter_days']
cat_cols = list(set(df.columns) - set(numeric_cols) - {'Outcome'})
cat_cols.sort()

print(numeric_cols)
print(cat_cols)

['age_upon_outcome_days', 'intake_year', 'time_in_shelter_days']
['Bird', 'Cat', 'Dog', 'Intact_female', 'Intact_male', 'Mixed_breed', 'Multi_Colour', 'Neutered_male', 'Other', 'Purebred', 'Solid_Colour', 'Spayed_female', 'intake_cond_Aged', 'intake_cond_Feral', 'intake_cond_Normal', 'intake_cond_Other', 'intake_cond_Pregnant_nursing', 'intake_cond_Unhealthy', 'sex_Unknown']


In [4]:
# Splitting data into test and train with a test size of 20%

random_seed = np.random.RandomState()

df_train, df_test = train_test_split(df, test_size=0.2, random_state=random_seed, stratify=df['Outcome'])

print(df_train.shape)
print(df_test.shape)
print()
print(df_train['Outcome'].value_counts(normalize=True))
print()
print(df_test['Outcome'].value_counts(normalize=True))

(63728, 23)
(15933, 23)

Outcome
0    0.578286
1    0.421714
Name: proportion, dtype: float64

Outcome
0    0.578297
1    0.421703
Name: proportion, dtype: float64


In [5]:
# Scaling numerical columns

scaler = StandardScaler()
scaler.fit(df_train[numeric_cols])

def get_features_and_target_arrays(df, numeric_cols, cat_cols, scaler):
    X_numeric_scaled = scaler.transform(df[numeric_cols])
    X_categorical = df[cat_cols].to_numpy()
    X = np.hstack((X_categorical, X_numeric_scaled))
    y = df['Outcome']
    return X, y

X, y = get_features_and_target_arrays(df_train, numeric_cols, cat_cols, scaler)

In [6]:
# Fitting model

model = LogisticRegression()

model.fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
X_test, y_test = get_features_and_target_arrays(df_test, numeric_cols, cat_cols, scaler)

test_prob = model.predict_proba(X_test)[:, 1]
test_pred = model.predict(X_test)

In [8]:
# Evaluating model results

print('Log loss = {:.3f}'.format(log_loss(y_test, test_prob)))
print('AUC = {:.3f}'.format(roc_auc_score(y_test, test_prob)))
print('Average Precision = {:.3f}'.format(average_precision_score(y_test, test_prob)))
print('\nUsing 0.5 as threshold:')
print('Accuracy = {:.3f}'.format(accuracy_score(y_test, test_pred)))
print('Precision = {:.3f}'.format(precision_score(y_test, test_pred)))
print('Recall = {:.3f}'.format(recall_score(y_test, test_pred)))
print('F1 score = {:.3f}'.format(f1_score(y_test, test_pred)))

print('\nClassification Report')
print(classification_report(y_test, test_pred))

Log loss = 0.451
AUC = 0.871
Average Precision = 0.804

Using 0.5 as threshold:
Accuracy = 0.782
Precision = 0.702
Recall = 0.837
F1 score = 0.764

Classification Report
              precision    recall  f1-score   support

           0       0.86      0.74      0.80      9214
           1       0.70      0.84      0.76      6719

    accuracy                           0.78     15933
   macro avg       0.78      0.79      0.78     15933
weighted avg       0.79      0.78      0.78     15933



In [9]:
# Hyper-parameter tuning

param_grid = [    
    {
    'C' : 0.0006951927961775605,
    'solver' : 'liblinear',
    'max_iter' : [100, 1000,2500, 5000]
    }
]

In [10]:
clf = GridSearchCV(model, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1)

In [11]:
best_clf = clf.fit(X,y)

Fitting 3 folds for each of 400 candidates, totalling 1200 fits


In [12]:
best_clf.best_estimator_

In [13]:
print (f'Accuracy - : {best_clf.score(X,y):.3f}')

Accuracy - : 0.793


In [28]:
model = LogisticRegression(C=0.0006951927961775605, solver='liblinear', penalty = 'l1')

model.fit(X, y)

In [30]:
X_test, y_test = get_features_and_target_arrays(df_test, numeric_cols, cat_cols, scaler)

test_prob = model.predict_proba(X_test)[:, 1]
test_pred = model.predict(X_test)

In [31]:
print('Accuracy = {:.3f}'.format(accuracy_score(y_test, test_pred)))

Accuracy = 0.794


In [32]:
model = LogisticRegression(C=0.0006951927961775605, solver='liblinear', penalty = 'l2')

model.fit(X, y)

In [33]:
X_test, y_test = get_features_and_target_arrays(df_test, numeric_cols, cat_cols, scaler)

test_prob = model.predict_proba(X_test)[:, 1]
test_pred = model.predict(X_test)

In [34]:
print('Accuracy = {:.3f}'.format(accuracy_score(y_test, test_pred)))

Accuracy = 0.790


## XGBoost Model

In [38]:
# fit model on training data
model_xgb = XGBClassifier()
model_xgb.fit(X, y)

In [39]:
X_test, y_test = get_features_and_target_arrays(df_test, numeric_cols, cat_cols, scaler)

test_prob = model_xgb.predict_proba(X_test)[:, 1]
test_pred = model_xgb.predict(X_test)

In [40]:
# evaluate predictions
accuracy = accuracy_score(y_test, test_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 83.47%


In [91]:
# Creating function to find the best parameters to use for model
def random_search_tuning():
    
    params = {'max_depth': [3, 6, 10, 15],
              'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
              'subsample': np.arange(0.5, 1.0, 0.1),
              'colsample_bytree': np.arange(0.5, 1.0, 0.1),
              'colsample_bylevel': np.arange(0.5, 1.0, 0.1),
              'n_estimators': [100, 250, 500, 750],
              'num_class': [10],
              'gamma': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
              }
    
    xgbclf = xgb.XGBClassifier(objective="multi:softmax", tree_method='hist')
    clf = RandomizedSearchCV(estimator=xgbclf,
                             param_distributions=params,
                             scoring='accuracy',
                             n_iter=25,
                             n_jobs=4,
                             verbose=1)

    clf.fit(X, y)

    best_combination = clf.best_params_

    return best_combination

if __name__ == '__main__':
    
    best_params = random_search_tuning()


    print("Best hyperparameter combination: ", best_params)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best hyperparameter combination:  {'subsample': 0.7, 'num_class': 10, 'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.3, 'gamma': 0, 'colsample_bytree': 0.8999999999999999, 'colsample_bylevel': 0.5}


In [147]:
# fit model with parameters on training data
model_xgb = XGBClassifier(subsample=0.7, n_estimators=500, max_depth=10, learning_rate=0.01, colsample_bytree=0.7999999999999999, colsample_bylevel=0.7999999999999999, gamma=0, reg_lambda=0, min_child_weight=0 )
model_xgb.fit(X, y)

In [148]:
X_test, y_test = get_features_and_target_arrays(df_test, numeric_cols, cat_cols, scaler)

test_prob = model_xgb.predict_proba(X_test)[:, 1]
test_pred = model_xgb.predict(X_test)

In [149]:
# evaluate predictions
accuracy = accuracy_score(y_test, test_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 83.51%


**We have a slightly higher accuracy score**

In [162]:
# Saving model

pickle_out = open("model_xgb.pkl", "wb") 
pickle.dump(model_xgb, pickle_out) 
pickle_out.close()

In [163]:
df.columns

Index(['age_upon_outcome_days', 'intake_year', 'time_in_shelter_days',
       'Outcome', 'Mixed_breed', 'Purebred', 'Bird', 'Cat', 'Dog', 'Other',
       'Intact_female', 'Intact_male', 'Neutered_male', 'Spayed_female',
       'sex_Unknown', 'Multi_Colour', 'Solid_Colour', 'intake_cond_Aged',
       'intake_cond_Feral', 'intake_cond_Normal', 'intake_cond_Other',
       'intake_cond_Pregnant_nursing', 'intake_cond_Unhealthy'],
      dtype='object')

Unnamed: 0,age_upon_outcome_days,intake_year,time_in_shelter_days,Outcome,Mixed_breed,Purebred,Bird,Cat,Dog,Other,...,Spayed_female,sex_Unknown,Multi_Colour,Solid_Colour,intake_cond_Aged,intake_cond_Feral,intake_cond_Normal,intake_cond_Other,intake_cond_Pregnant_nursing,intake_cond_Unhealthy
0,3650,2017,0.588194,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
1,2555,2014,1.259722,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
2,2190,2014,1.113889,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
3,3650,2014,4.970139,0,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
4,5840,2013,0.119444,0,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79656,14,2018,0.077083,0,1,0,0,1,0,0,...,0,1,1,0,0,0,1,0,0,0
79657,730,2018,0.053472,0,1,0,0,0,0,1,...,0,1,0,1,0,0,1,0,0,0
79658,365,2018,0.047917,0,1,0,0,0,0,1,...,0,1,0,1,0,0,1,0,0,0
79659,300,2018,1.762500,0,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0


In [165]:
results = df.loc[df["Outcome"] == 1]


In [166]:
results

Unnamed: 0,age_upon_outcome_days,intake_year,time_in_shelter_days,Outcome,Mixed_breed,Purebred,Bird,Cat,Dog,Other,...,Spayed_female,sex_Unknown,Multi_Colour,Solid_Colour,intake_cond_Aged,intake_cond_Feral,intake_cond_Normal,intake_cond_Other,intake_cond_Pregnant_nursing,intake_cond_Unhealthy
12,5840,2013,49.747917,1,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
14,6205,2016,9.020139,1,0,1,0,0,1,0,...,0,0,1,0,1,0,0,0,0,0
22,5475,2015,23.104861,1,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
32,5475,2016,89.855556,1,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
48,4015,2013,2.755556,1,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79638,60,2018,5.039583,1,1,0,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
79642,60,2018,3.941667,1,0,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,0
79643,365,2018,5.152083,1,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
79644,365,2018,2.279167,1,1,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
