In [3]:
# timeit

# Student Name : Sophie Briques
# Cohort       : Castro - 3

################################################################################
# Import Packages
################################################################################

# importing libraries
import pandas            as pd                       # data science essentials
import numpy             as np
from sklearn.model_selection import train_test_split # train-test split
from sklearn.metrics import roc_auc_score            # auc score

# CART model packages
from sklearn.tree import DecisionTreeClassifier      # classification trees

#Grid Search
from sklearn.model_selection import GridSearchCV     # hyperparameter tuning
from sklearn.metrics import make_scorer              # customizable scorer


################################################################################
# Load Data
################################################################################

# specifying file name
file = "Apprentice_Chef_Dataset.xlsx"

# reading the file into Python
original_df = pd.read_excel(file)
chef_org = original_df.copy()

################################################################################
# Feature Engineering and (optional) Dataset Standardization
################################################################################

#################################################
##########  User-Defined Functions    ###########
#################################################

##########  Defining function to flag high outliers in variables
def outlier_flag_hi(variable, threshold, data):
    """
    This function is used to flag high outliers in a dataframe the variables' 
    outliers by creating a new column that is preceded by 'out_'.

    PARAMETERS
    ----------
    variable  : str, continuous variable.
    threshold : float, value that will identify where outliers would be.
    data      : dataframe, where the variables are located.
    
    """
    # creating a new column
    data['out_' + variable + '_hi'] = 0
        
    # defining outlier condition
    high = data.loc[0:,'out_' + variable + '_hi'][data[variable] > threshold]
        
    # imputing 1 inside flag column
    data['out_' + variable + '_hi'].replace(to_replace = high,
                                    value   = 1,
                                    inplace = True)

    


##########  Defining function to flag high outliers in variables
def outlier_flag_lo(variable, threshold, data):
    """
    This function is used to flag low outliers in a dataframe the variables' 
    outliers by creating a new column that is preceded by 'out_'.

    PARAMETERS
    ----------
    variable  : str, continuous variable.
    threshold : float, value that will identify where outliers would be.
    data      : dataframe, where the variables are located.
    
    """
    # creating a new column
    data['out_' + variable + '_lo'] = 0
        
    # defining outlier condition
    low = data.loc[0:,'out_' + variable + '_lo'][data[variable] < threshold]
        
    # imputing 1 inside flag column
    data['out_' + variable + '_lo'].replace(to_replace = low,
                                    value   = 1,
                                    inplace = True)
    # Defining function to flag higher variables
def success_flag(variable, threshold, data):
    """
    This function is used to flag in a dataframe the variables' trend changes 
    above a threshold by creating a new column that is preceded by 'success_'.

    PARAMETERS
    ----------
    variable  : str, continuous variable.
    threshold : float, value that will identify after which the trend on variable y changes
    data      : dataframe, where the variables are located.
    
    """
    new_column = 'success_' + variable
    
    # creating a new column
    data[new_column] = 0
        
    # defining outlier condition
    high = data.loc[0:,new_column][data[variable] > threshold]
        
    # imputing 1 inside flag column
    data[new_column].replace(to_replace = high,
                             value   = 1,
                             inplace = True)


#################################################
#############  Feature Engineering    ###########
#################################################

# Flagging missing variables for FAMILY_NAME
# creating a copy of dataframe for safety measures
chef_m = chef_org.copy()

# creating a new column where 1 indicates that observation has a missing family name
chef_m['m_FAMILY_NAME'] = chef_m['FAMILY_NAME'].isnull().astype(int)

# imputing missing values
chef_m['FAMILY_NAME'] = chef_m['FAMILY_NAME'].fillna('Unknown')

# Establishing outliers thresholds for analysis
# Continous
avg_time_per_site_visit_hi = 200
avg_prep_vid_time_hi       = 250
followed_rec_hi            = 75
followed_rec_lo            = 10 
largest_order_size_hi      = 5
avg_clicks_per_visit_hi    = 17
avg_clicks_per_visit_lo    = 11
median_meal_hi             = 3

# Counts:
total_meals_ordered_hi            = 320
unique_meals_purchased_hi         = 8
unique_meals_purchased_lo         = 2
contacts_with_customer_service_hi = 13
cancellations_before_noon_hi      = 8
late_deliveries_hi                = 17
total_photos_viewed_hi            = 800
products_viewed_hi                = 9 
products_viewed_lo                = 2 
median_meal_lo                    = 2

# Target Variable
revenue_hi  =  5500


# Creating Dictionary to link variables with outlier thresholds
lst_thresholds_hi = {
    'AVG_TIME_PER_SITE_VISIT'      : avg_time_per_site_visit_hi,
    'AVG_PREP_VID_TIME'            : avg_prep_vid_time_hi,
    'TOTAL_MEALS_ORDERED'          : total_meals_ordered_hi,
    'UNIQUE_MEALS_PURCH'           : unique_meals_purchased_hi,
    'CONTACTS_W_CUSTOMER_SERVICE'  : contacts_with_customer_service_hi,
    'CANCELLATIONS_BEFORE_NOON'    : cancellations_before_noon_hi,
    'LATE_DELIVERIES'              : late_deliveries_hi,
    'TOTAL_PHOTOS_VIEWED'          : total_photos_viewed_hi,
    'REVENUE'                      : revenue_hi,
    'FOLLOWED_RECOMMENDATIONS_PCT' : followed_rec_hi,
    'LARGEST_ORDER_SIZE'           : largest_order_size_hi,
    'PRODUCT_CATEGORIES_VIEWED'    : products_viewed_hi,
    'AVG_CLICKS_PER_VISIT'         : avg_clicks_per_visit_hi,
    'PRODUCT_CATEGORIES_VIEWED'    : products_viewed_hi,
    'MEDIAN_MEAL_RATING'           : median_meal_hi
    }

lst_thresholds_lo = {
    'AVG_CLICKS_PER_VISIT'          : avg_clicks_per_visit_lo,
    'PRODUCT_CATEGORIES_VIEWED'     : products_viewed_lo,
    'FOLLOWED_RECOMMENDATIONS_PCT'  : followed_rec_lo,
    'UNIQUE_MEALS_PURCH'            : unique_meals_purchased_lo,
    'MEDIAN_MEAL_RATING'            : median_meal_lo
     }

# creating a copy of dataframe for safety measures
chef_o = chef_m.copy()

# Looping over variables to create outlier flags:
for key in lst_thresholds_hi.keys():
    outlier_flag_hi(key,lst_thresholds_hi[key],chef_o)
    
for key in lst_thresholds_lo.keys():
    outlier_flag_lo(key,lst_thresholds_lo[key],chef_o)
    
#merging avg clicks per visit hi and lo
chef_o['out_AVG_CLICKS_PER_VISIT'] = chef_o['out_AVG_CLICKS_PER_VISIT_hi'] + chef_o['out_AVG_CLICKS_PER_VISIT_lo'] 

# STEP 1: splitting emails
# placeholder list
placeholder_lst = []

# looping over each email address
for index, col in chef_o.iterrows():
    
    # splitting email domain at '@'
    split_email = chef_o.loc[index, 'EMAIL'].split(sep = '@')

    # appending placeholder_lst with the results
    placeholder_lst.append(split_email)
    
# converting placeholder_lst into a DataFrame
email_df = pd.DataFrame(placeholder_lst)

# STEP 2: concatenating with original DataFrame
# Creating a copy of chef for features and safety measure
chef_v = chef_o.copy()

# renaming column to concatenate
email_df.columns = ['name' , 'EMAIL_DOMAIN'] 

# concatenating personal_email_domain with chef DataFrame
chef_v = pd.concat([chef_v, email_df.loc[:, 'EMAIL_DOMAIN']], 
                   axis = 1)

# printing value counts of personal_email_domain
chef_v.loc[: ,'EMAIL_DOMAIN'].value_counts()

# email domain types
professional_email_domains = ['@mmm.com',         '@amex.com',
                              '@apple.com',       '@boeing.com',
                              '@caterpillar.com', '@chevron.com',
                              '@cisco.com',       '@cocacola.com',
                              '@disney.com',      '@dupont.com',
                              '@exxon.com',       '@ge.org',
                              '@goldmansacs.com', '@homedepot.com',
                              '@ibm.com',         '@intel.com',
                              '@jnj.com',         '@jpmorgan.com',
                              '@mcdonalds.com',   '@merck.com',
                              '@microsoft.com',   '@nike.com',
                              '@pfizer.com',      '@pg.com',
                              '@travelers.com',   '@unitedtech.com',
                              '@unitedhealth.com','@verizon.com',
                              '@visa.com',        '@walmart.com']
personal_email_domains     = ['@gmail.com',       '@yahoo.com',    
                              '@protonmail.com']
junk_email_domains         = ['@me.com',          '@aol.com',
                              '@hotmail.com',     '@live.com', 
                              '@msn.com',         '@passport.com']

# placeholder list
placeholder_lst = []  


# looping to group observations by domain type
for domain in chef_v['EMAIL_DOMAIN']:
        if "@" + domain in professional_email_domains:
            placeholder_lst.append('professional')
            
        elif "@" + domain in personal_email_domains:
            placeholder_lst.append('personal')
            
        elif "@" + domain in junk_email_domains:
            placeholder_lst.append('junk')
            
        else:
            print('Unknown')


# concatenating with original DataFrame
chef_v['email_domain_group'] = pd.Series(placeholder_lst)

# checking results and sample size
#print(chef['email_domain_group'].value_counts())

# Step 3: One-Hot encoding
one_hot_email_domain = pd.get_dummies(chef_v['email_domain_group'])

# dropping orginal columns to keep only encoded ones
chef_e               = chef_v.drop(['email_domain_group','EMAIL','EMAIL_DOMAIN'], axis = 1)

# joining encoded columns to dataset
chef_e               = chef_e.join(one_hot_email_domain)

# including new categorical variables to list
domains              = ['professional','personal','junk']

# creating a copy of dataframe for safety measures
chef_n = chef_e.copy()

# placeholder for 'rev_per_meal' feature
chef_n['rev_per_meal'] = 0

# replacing values based on calculation
for index, col in chef_n.iterrows():
    revenue      = chef_n.loc[index, 'REVENUE']
    total_orders = chef_n.loc[index, 'TOTAL_MEALS_ORDERED']
    chef_n.loc[index, 'rev_per_meal'] = (revenue / total_orders).round(2)
    
# Determining Outliers in new variable
#distributions('rev_per_meal', chef_n)

# Establishing Outlier Flags
rev_per_meal_hi = 70
rev_per_meal_lo = 15
outlier_flag_hi('rev_per_meal', rev_per_meal_hi, chef_n)
outlier_flag_lo('rev_per_meal', rev_per_meal_lo, chef_n)

# creating a copy of dataframe for safety measures
chef_n = chef_n.copy()

# new column for 'rev_per_login' feature
chef_n['rev_per_pclogin']     = 0
chef_n['rev_per_mobilelogin'] = 0

# replacing values based on calculation
for index, col in chef_n.iterrows():
    revenue       = chef_n.loc[index, 'REVENUE']
    PC_LOGINS     = chef_n.loc[index, 'PC_LOGINS']
    if PC_LOGINS   == 0:
        chef_n.loc[index, 'rev_per_pclogin'] = 0
    elif PC_LOGINS >= 0:
        chef_n.loc[index, 'rev_per_pclogin'] = (revenue / PC_LOGINS).round(2)
    else:
        print('Something went wrong.')

for index, col in chef_n.iterrows():
    revenue       = chef_n.loc[index, 'REVENUE']
    MOBILE_LOGINS = chef_n.loc[index, 'MOBILE_LOGINS']    
    if MOBILE_LOGINS   == 0:
        chef_n.loc[index, 'rev_per_mobilelogin'] = 0
    elif MOBILE_LOGINS >= 0:
        chef_n.loc[index, 'rev_per_mobilelogin'] = (revenue / MOBILE_LOGINS).round(2)
    else:
        print('Something went wrong.')
        
# flagging outliers
rev_per_pclogin_hi = 800
rev_per_pclogin_lo = 150
outlier_flag_hi('rev_per_pclogin', rev_per_pclogin_hi, chef_n)
outlier_flag_lo('rev_per_pclogin', rev_per_pclogin_lo, chef_n)

# flagging outliers
rev_per_mobilelogin_hi = 2500
rev_per_mobilelogin_lo = 200
outlier_flag_hi('rev_per_mobilelogin', rev_per_mobilelogin_hi, chef_n)
outlier_flag_lo('rev_per_mobilelogin', rev_per_mobilelogin_lo, chef_n)

# Establishing trend thresholds for analysis
# above this threshold its a succes
followed_recommendations_pct_1 = 20 #(or 30 for certainty)
cancellations_before_noon_1    = 2 #(or 1 for mean)
median_ratings_1               = 3
median_ratings_2               = 2

# Creating Dictionary to link variables with outlier thresholds
success_trend = {
    'FOLLOWED_RECOMMENDATIONS_PCT' : followed_recommendations_pct_1,
    'CANCELLATIONS_BEFORE_NOON'    : cancellations_before_noon_1,
    'MEDIAN_MEAL_RATING'           : median_ratings_1,
    'MEDIAN_MEAL_RATING'           : median_ratings_2
     }

# creating a copy of dataframe for safety measures
chef_t = chef_n.copy()

# Looping over variables to create trend flags:
for key in success_trend.keys():
    success_flag(key,success_trend[key],chef_t)
    
# creating a copy for safety measures
chef = chef_t.copy()

# dropping discrete variables (only run once!)
chef = chef.drop(['NAME', 'FIRST_NAME', 'FAMILY_NAME'], axis = 1)

# Defining a dictionary with explanatory variables names 
variables_dict = {
    "target"     : [    # target variable
        'CROSS_SELL_SUCCESS'
    ],
    'Best Model' : [
        'EARLY_DELIVERIES',
          'MOBILE_NUMBER','CANCELLATIONS_BEFORE_NOON',
          'CANCELLATIONS_AFTER_NOON','TASTES_AND_PREFERENCES',
          'REFRIGERATED_LOCKER','FOLLOWED_RECOMMENDATIONS_PCT',
          'personal','professional','junk',
          'out_FOLLOWED_RECOMMENDATIONS_PCT_hi',
          'out_FOLLOWED_RECOMMENDATIONS_PCT_lo',
          'out_PRODUCT_CATEGORIES_VIEWED_hi',
          'out_PRODUCT_CATEGORIES_VIEWED_lo',
          'out_MEDIAN_MEAL_RATING_hi',
          'out_MEDIAN_MEAL_RATING_lo',
          'rev_per_mobilelogin',
          'out_rev_per_pclogin_hi',
          'out_rev_per_pclogin_lo',
          'out_rev_per_mobilelogin_hi',
          'out_rev_per_mobilelogin_lo',
          'success_FOLLOWED_RECOMMENDATIONS_PCT'
    ]
}


# setting random state
seed = 222

# Defining target variable
chef_target = chef.loc[: , variables_dict['target']]

###### Non Standardized Preparation 
# Defining explanatory variables (add according to new feature selections)
chef_best = chef.loc[: , variables_dict['Best Model']]

# train-test split with stratification
X_train, X_test, y_train, y_test = train_test_split(
            chef_best,  # change
            chef_target,
            test_size = 0.25,
            random_state = seed,
            stratify = chef_target) # stratifying target variable to ensure balance

# merging training data for statsmodels
chef_train = pd.concat([X_train, y_train], axis = 1) # contains target variable!

################################################################################
# Final Model (instantiate, fit, and predict)
################################################################################

# declaring a hyperparameter space
#criterion_space = ['gini', 'entropy']
#splitter_space = ['best', 'random']
#depth_space    = pd.np.arange(1, 25)
#leaf_space     = pd.np.arange(1, 100)
#
#
## creating a hyperparameter grid
#param_grid = {'criterion'        : criterion_space,
#              'splitter'         : splitter_space,
#              'max_depth'        : depth_space,
#              'min_samples_leaf' : leaf_space}
#
#
## INSTANTIATING the model object without hyperparameters
#tuned_tree = DecisionTreeClassifier(random_state = seed)
#

# GridSearchCV object
#tuned_tree_cv = GridSearchCV(estimator  = tuned_tree,
#                             param_grid = param_grid,
#                             cv         = 3,
#                             scoring    = make_scorer(roc_auc_score,
#                                                      needs_threshold = False))
#

# INSTANTIATING a logistic regression model with tuned values
tree_tuned = DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                                    max_depth=18, max_features=None, max_leaf_nodes=None,
                                    min_impurity_decrease=0.0, min_impurity_split=None,
                                    min_samples_leaf=2, min_samples_split=2,
                                    min_weight_fraction_leaf=0.0, presort='deprecated',
                                    random_state=seed, splitter='random')


# FITTING to the FULL DATASET (due to cross-validation from GridSearch)
tree_tuned.fit(chef_best, chef_target)

# PREDICTING based on the testing set
tree_tuned_pred = tree_tuned.predict(X_test)

## printing the optimal parameters and best score
#print("Tuned Parameters  :", tuned_tree_cv.best_params_)
#print("Tuned Training AUC:", tuned_tree_cv.best_score_.round(4))

################################################################################
# Final Model Score (score)
################################################################################

test_score       = roc_auc_score(y_true  = y_test,
                               y_score = tree_tuned_pred).round(3)
test_score

0.906