In [2]:
# Load all helper functions
%run -i '../util/helper.py'

In [20]:
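# NOTE: the commented-out lines below look like a standalone copy of the setup
# (imports, random_state, data load) that ../util/helper.py presumably provides — TODO confirm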
# import pandas as pd
# import numpy as np
# from sklearn.pipeline import make_pipeline, Pipeline
# from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
# random_state = 4995
# import matplotlib.pyplot as plt
# import time
# from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, PrecisionRecallDisplay, roc_curve, RocCurveDisplay
# from sklearn.linear_model import LogisticRegression
# from imblearn.under_sampling import RandomUnderSampler
# df = pd.read_csv('brazilian_ecommerce_encoded.csv')

# y = df["is_delayed"]
# X = df.drop(["is_delayed"], axis=1, inplace=False)

## Load and Split Data

In [3]:
# Read the encoded e-commerce dataset from disk
df = pd.read_csv('../Dataset/brazilian_ecommerce_encoded.csv')

# The "is_delayed" column is the target; every other column is a feature
target_col = "is_delayed"
y = df[target_col]
X = df.drop(columns=[target_col])

In [4]:
# Hold out 20% of the rows as the test set; stratifying on y keeps the class
# proportions of the target the same in both partitions
X_dev, X_test, y_dev, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=random_state,
)

In [5]:
# Report the shape of the development and test feature matrices, in that order
for split_features in (X_dev, X_test):
    print(split_features.shape)

(92506, 30)
(23127, 30)


# Hyperparameter Tuning

## Stratified Sampling

### GridSearch

In [11]:
# Hyperparameter search space for logistic regression.
# NOTE(review): newer scikit-learn releases replace the string 'none' with
# None for `penalty` — confirm against the installed version.
penalty = ['l2', 'none']
C = np.logspace(-4, 4, 5)
solver = ['newton-cg', 'lbfgs', 'sag', 'saga']
start3 = time()

# LogisticRegression ignores C when no penalty is applied, so crossing 'none'
# with every C value only repeats identical fits. Two sub-grids keep the
# search space the same while dropping those redundant candidates.
regularized_penalties = [p for p in penalty if p != 'none']
parameter_grid = [
    {'classifier__penalty': regularized_penalties,
     'classifier__C': C,
     'classifier__solver': solver},
    {'classifier__penalty': ['none'],
     'classifier__solver': solver},
]

# Single-step pipeline so the grid keys can address the model as 'classifier__*'
pipe_logistic_tune = Pipeline([('classifier', LogisticRegression(random_state=random_state))])

# 10-fold stratified CV scored on recall and F1; the candidate with the best
# mean F1 is refit on the whole development set.
pipe_logistic = make_pipeline(GridSearchCV(pipe_logistic_tune,
                                      param_grid=parameter_grid,
                                      return_train_score=True,
                                      cv=StratifiedKFold(n_splits=10, shuffle=False),
                                      n_jobs=-1,
                                      scoring=['recall', 'f1'],
                                      refit='f1',
                                      verbose=5))
pipe_logistic.fit(X_dev, np.ravel(y_dev))
end3 = time()

# Wall-clock duration of the whole search (start3 is taken before the grid is built)
print("Logistic Regression model takes " + str(end3-start3) + " seconds")
# With multi-metric scoring, GridSearchCV.score reports the refit metric (F1 here)
print("The performance of the Logistic Regression model on test data is ", pipe_logistic.score(X_test, y_test))
# make_pipeline names the step after the lowercased class name
grid_search_logistic = pipe_logistic.named_steps["gridsearchcv"]
print("The Logistic Regression model Best hyperparameters are {}".format(grid_search_logistic.best_params_))



Fitting 10 folds for each of 80 candidates, totalling 800 fits


KeyboardInterrupt: 

## Evaluate on the Test Set

In [None]:
# Predict test-set labels with the fitted stratified-sampling search pipeline
logistic_prediction = pipe_logistic.predict(X_test)

# Compute each metric once; recall and precision stay available as variables
# for the operating-point marker drawn on the precision-recall plot
accuracy = accuracy_score(y_test, logistic_prediction)
recall = recall_score(y_test, logistic_prediction)
precision = precision_score(y_test, logistic_prediction)
f1 = f1_score(y_test, logistic_prediction)

print('Accuracy: {}'.format(accuracy))
print('Recall: {}'.format(recall))
print('Precision: {}'.format(precision))
print('F-1 Score: {}'.format(f1))

In [None]:
# Column 1 of predict_proba — presumably the positive ("delayed") class; verify via classes_
class_probabilities = pipe_logistic.predict_proba(X_test)
probs = class_probabilities[:, 1]

# Draw the precision-recall curve, then mark the (recall, precision) point of
# the hard predictions with a red dot on the same axes
PrecisionRecallDisplay.from_predictions(y_test, probs)
plt.plot(recall, precision, 'ro');

NameError: name 'pipe_logistic' is not defined

In [None]:
# ROC curve from the predicted probabilities, treating label 1 as positive
roc_points = roc_curve(y_test, probs, pos_label=1)
fpr, tpr, thresholds = roc_points

# Plot true-positive rate against false-positive rate
disp = RocCurveDisplay(fpr=fpr, tpr=tpr)
disp.plot()

NameError: name 'probs' is not defined

In [None]:
# Persist the fitted stratified-sampling search pipeline to disk.
# NOTE(review): this path uses '../model/' while the downsampling model is
# saved under '../Model/' — confirm which directory name is intended.
filename = '../model/logistic_stratified.sav'
# The context manager guarantees the file is flushed and closed after the dump
with open(filename, 'wb') as model_file:
    pickle.dump(pipe_logistic, model_file)

## Downsampling

In [None]:
# Undersample the development set without replacement, seeded for
# reproducibility; the test set is left untouched
rus = RandomUnderSampler(
    random_state=random_state,
    replacement=False,
)
X_dev_us, y_dev_us = rus.fit_resample(X_dev, y_dev)

### GridSearch

In [None]:
# Hyperparameter search space for logistic regression on the undersampled data.
# NOTE(review): newer scikit-learn releases replace the string 'none' with
# None for `penalty` — confirm against the installed version.
penalty = ['l2', 'none']
# Must be a flat sequence of candidate values: wrapping the array in a list
# would make the whole 20-element array a single (invalid) candidate for C.
C = np.logspace(-4, 4, 20)
solver = ['newton-cg', 'lbfgs', 'sag', 'saga']
start3 = time()

# LogisticRegression ignores C when no penalty is applied, so crossing 'none'
# with every C value only repeats identical fits. Two sub-grids keep the
# search space the same while dropping those redundant candidates.
regularized_penalties = [p for p in penalty if p != 'none']
parameter_grid = [
    {'classifier__penalty': regularized_penalties,
     'classifier__C': C,
     'classifier__solver': solver},
    {'classifier__penalty': ['none'],
     'classifier__solver': solver},
]

# Single-step pipeline so the grid keys can address the model as 'classifier__*'
pipe_logistic_tune = Pipeline([('classifier', LogisticRegression(random_state=random_state))])

# 10-fold stratified CV scored on recall and F1; the candidate with the best
# mean F1 is refit on the whole undersampled development set.
pipe_logistic_rus = make_pipeline(GridSearchCV(pipe_logistic_tune,
                                      param_grid=parameter_grid,
                                      return_train_score=True,
                                      cv=StratifiedKFold(n_splits=10, shuffle=False),
                                      n_jobs=-1,
                                      scoring=['recall', 'f1'],
                                      refit='f1'))
pipe_logistic_rus.fit(X_dev_us, np.ravel(y_dev_us))
end3 = time()

# Wall-clock duration of the whole search (start3 is taken before the grid is built)
print("Logistic Regression model takes " + str(end3-start3) + " seconds")
# With multi-metric scoring, GridSearchCV.score reports the refit metric (F1 here)
print("The performance of the Logistic Regression model on test data is ", pipe_logistic_rus.score(X_test, y_test))
# make_pipeline names the step after the lowercased class name
grid_search_logistic = pipe_logistic_rus.named_steps["gridsearchcv"]
print("The Logistic Regression model Best hyperparameters are {}".format(grid_search_logistic.best_params_))


## Evaluate on the Test Set

In [None]:
# Predict test-set labels with the search pipeline fitted on undersampled data
logistic_prediction = pipe_logistic_rus.predict(X_test)

# Compute each metric once; recall and precision stay available as variables
# for the operating-point marker drawn on the precision-recall plot
accuracy = accuracy_score(y_test, logistic_prediction)
recall = recall_score(y_test, logistic_prediction)
precision = precision_score(y_test, logistic_prediction)
f1 = f1_score(y_test, logistic_prediction)

print('Accuracy: {}'.format(accuracy))
print('Recall: {}'.format(recall))
print('Precision: {}'.format(precision))
print('F-1 Score: {}'.format(f1))

NameError: name 'pipe_logistic' is not defined

In [None]:
# Score the test set with the downsampling model (pipe_logistic_rus) so the
# curve matches the recall/precision marker, which was computed from that same
# model's predictions. Column 1 of predict_proba is presumably the positive
# ("delayed") class — verify via classes_.
probs = pipe_logistic_rus.predict_proba(X_test)[:, 1]

# Draw the precision-recall curve, then mark the (recall, precision) point of
# the hard predictions with a red dot on the same axes
PrecisionRecallDisplay.from_predictions(y_test, probs)
plt.plot(recall, precision, 'ro');

In [None]:
# ROC curve from the predicted probabilities, treating label 1 as positive
roc_points = roc_curve(y_test, probs, pos_label=1)
fpr, tpr, thresholds = roc_points

# Plot true-positive rate against false-positive rate
disp = RocCurveDisplay(fpr=fpr, tpr=tpr)
disp.plot()

In [None]:
# Persist the search pipeline fitted on the undersampled data to disk.
# NOTE(review): this path uses '../Model/' while the stratified model is
# saved under '../model/' — confirm which directory name is intended.
filename = '../Model/logistic_downsampling.sav'
# The context manager guarantees the file is flushed and closed after the dump
with open(filename, 'wb') as model_file:
    pickle.dump(pipe_logistic_rus, model_file)