In [10]:
import pandas as pd
import pickle

In [11]:
# import customers for scoring
to_be_scored = pickle.load(open('data/abc_regression_scoring.p', 'rb'))
to_be_scored.head()

Unnamed: 0,customer_id,distance_from_store,gender,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value
6,1,4.78,F,0.66,3980.49,424,51,5,78.048824
7,120,3.49,F,0.38,2887.2,253,45,5,64.16
8,52,14.91,F,0.68,3342.75,335,47,5,71.12234
10,435,0.25,M,0.62,2326.71,267,48,5,48.473125
12,679,4.74,F,0.58,3448.59,370,49,5,70.379388


In [12]:
# import model and model objects
regressor = pickle.load(open('data/random_forest_regression_model.p', 'rb'))
one_hot_encoder = pickle.load(open('data/random_forest_regression_model_ohe.p', 'rb'))

In [13]:
# drop unused column (customer_id)
to_be_scored.drop(['customer_id'], axis = 1, inplace = True)
to_be_scored.head()

Unnamed: 0,distance_from_store,gender,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value
6,4.78,F,0.66,3980.49,424,51,5,78.048824
7,3.49,F,0.38,2887.2,253,45,5,64.16
8,14.91,F,0.68,3342.75,335,47,5,71.12234
10,0.25,M,0.62,2326.71,267,48,5,48.473125
12,4.74,F,0.58,3448.59,370,49,5,70.379388


In [14]:
# drop missing values
to_be_scored.dropna(how = 'any', inplace = True)
to_be_scored.isna().sum()

distance_from_store    0
gender                 0
credit_score           0
total_sales            0
total_items            0
transaction_count      0
product_area_count     0
avg_basket_value       0
dtype: int64

In [15]:
# add columns to apply one hot encoding into a list
categorical_vars = ['gender']

In [17]:
# apply one hot encoder object
encoder_vars_array = one_hot_encoder.transform(to_be_scored[categorical_vars])

In [19]:
# create object to get feature names
encoder_feature_names = one_hot_encoder.get_feature_names(categorical_vars)

In [21]:
# create data frame with encoder variables and feature names
encoder_vars_df = pd.DataFrame(encoder_vars_array, columns = encoder_feature_names)
encoder_vars_df.tail()

Unnamed: 0,gender_M
458,0.0
459,1.0
460,1.0
461,0.0
462,1.0


In [22]:
# concat new dataframe to original resetting index (ensures no rows are misaligned)
to_be_scored = pd.concat([
    to_be_scored.reset_index(drop = True), 
    encoder_vars_df.reset_index(drop = True)], 
    axis = 1) # axis 1 concats columns and not rows
    
to_be_scored.tail()

Unnamed: 0,distance_from_store,gender,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value,gender_M
458,400.97,F,0.54,4072.86,346,45,5,90.508,0.0
459,0.32,M,0.38,3097.99,281,51,5,60.744902,1.0
460,1.62,M,0.63,3736.02,250,46,5,81.217826,1.0
461,4.36,F,0.54,4407.12,354,47,5,93.768511,0.0
462,1.87,M,0.59,3067.83,254,53,5,57.883585,1.0


In [23]:
# drop original categorical columns
to_be_scored.drop(categorical_vars, axis = 1, inplace = True) # axis 1 drops columns and not rows
to_be_scored.tail()

Unnamed: 0,distance_from_store,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value,gender_M
458,400.97,0.54,4072.86,346,45,5,90.508,0.0
459,0.32,0.38,3097.99,281,51,5,60.744902,1.0
460,1.62,0.63,3736.02,250,46,5,81.217826,1.0
461,4.36,0.54,4407.12,354,47,5,93.768511,0.0
462,1.87,0.59,3067.83,254,53,5,57.883585,1.0


In [25]:
# make our predictions
loyalty_predictions = regressor.predict(to_be_scored)
loyalty_predictions

array([0.42345, 0.35396, 0.34514, 0.95676, 0.38008, 0.9627 , 0.4302 ,
       0.69317, 0.31698, 0.73104, 0.44781, 0.54854, 0.43214, 0.45483,
       0.5508 , 0.37755, 0.74004, 0.2978 , 0.89329, 0.41708, 0.34697,
       0.55372, 0.5739 , 0.38557, 0.62913, 0.46992, 0.58406, 0.91735,
       0.53525, 0.3648 , 0.40272, 0.48865, 0.50685, 0.61025, 0.66266,
       0.41728, 0.43023, 0.92005, 0.45955, 0.48236, 0.84564, 0.70169,
       0.47268, 0.94093, 0.96867, 0.31728, 0.39394, 0.97276, 0.55982,
       0.47096, 0.8516 , 0.86386, 0.72613, 0.26382, 0.55749, 0.83494,
       0.24437, 0.26814, 0.24091, 0.22272, 0.25976, 0.87039, 0.84182,
       0.57464, 0.39745, 0.59339, 0.95581, 0.36904, 0.959  , 0.24945,
       0.58987, 0.42502, 0.65168, 0.48208, 0.6271 , 0.34694, 0.18675,
       0.62316, 0.80632, 0.50647, 0.24854, 0.27154, 0.32635, 0.71318,
       0.52275, 0.24928, 0.37679, 0.28283, 0.411  , 0.36569, 0.35441,
       0.90386, 0.92051, 0.26036, 0.40409, 0.3655 , 0.87016, 0.56444,
       0.29612, 0.95

In [36]:
loyalty_predictions_df = pd.DataFrame(loyalty_predictions)
loyalty_predictions_df.columns = ['loyalty_prediction']

scored = pd.concat([
    to_be_scored.reset_index (drop = True),
    loyalty_predictions_df.reset_index(drop = True)],
    axis = 1
)

scored

Unnamed: 0,distance_from_store,credit_score,total_sales,total_items,transaction_count,product_area_count,avg_basket_value,gender_M,loyalty_prediction
0,4.78,0.66,3980.49,424,51,5,78.048824,0.0,0.42345
1,3.49,0.38,2887.20,253,45,5,64.160000,0.0,0.35396
2,14.91,0.68,3342.75,335,47,5,71.122340,0.0,0.34514
3,0.25,0.62,2326.71,267,48,5,48.473125,1.0,0.95676
4,4.74,0.58,3448.59,370,49,5,70.379388,0.0,0.38008
...,...,...,...,...,...,...,...,...,...
458,400.97,0.54,4072.86,346,45,5,90.508000,0.0,0.35990
459,0.32,0.38,3097.99,281,51,5,60.744902,1.0,0.93349
460,1.62,0.63,3736.02,250,46,5,81.217826,1.0,0.67269
461,4.36,0.54,4407.12,354,47,5,93.768511,0.0,0.37293
