# The data

## Loading the data

In [154]:
import pandas as pd

# Relative path: resolved against the notebook's working directory.
DATA_PATH = "data/in-vehicle-coupon-recommendation.csv"

data = pd.read_csv(DATA_PATH)

## Pre-processing

As a very first step we are going to drop 2 of the features from our dataset, one of them being the feature car, because our dataset includes values of this feature for only 109 records, and the second one will be direction_opp, because it's the complete opposite of direction_same and together they would be redundant. 

In [155]:
# Remove the two columns discussed above; `columns=` is the keyword form of axis=1.
data = data.drop(columns=['car', 'direction_opp'])

### Feature Engineering

One of our features, age, mixes numeric and text values. Its possible values are: "21, 46, 26, 31, 41, 50plus, 36, below21". For that reason we will engineer two new columns from it, a numeric version and a categorical version, and see which one works better for our models.

In [156]:

def convert_age_categorical(value):
    """Map a raw ``age`` entry to a coarse age-group label.

    Parameters
    ----------
    value : object
        A raw age entry: a number, a digit string such as ``"26"``, or one of
        the text codes ``"below21"`` / ``"50plus"``.

    Returns
    -------
    str
        One of ``"<21"``, ``"21-30"``, ``"31-40"``, ``"41-50"``, ``"51+"``,
        or ``"Unknown"`` when the entry cannot be interpreted (including
        missing values such as ``None`` or NaN).
    """
    # Text codes are matched case-insensitively and with surrounding
    # whitespace ignored, so this agrees with convert_age_numeric on padded
    # input.
    text_codes = {"below21": "<21", "50plus": "51+"}
    normalized = str(value).strip().lower()
    if normalized in text_codes:
        return text_codes[normalized]

    try:
        age = int(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a usable age"; anything else
        # (e.g. KeyboardInterrupt) must propagate.
        return "Unknown"

    if age < 21:
        return "<21"
    if age <= 30:
        return "21-30"
    if age <= 40:
        return "31-40"
    if age <= 50:
        return "41-50"
    return "51+"
        

def convert_age_numeric(value):
    """Convert a raw ``age`` entry to a number.

    Parameters
    ----------
    value : object
        A raw age entry: a number, a digit string such as ``"26"``, or one of
        the text codes ``"below21"`` / ``"50plus"``.

    Returns
    -------
    int or None
        The integer age for numeric entries; 20 for ``"below21"`` and 55 for
        ``"50plus"`` (stand-in values for the open-ended brackets); ``None``
        for anything that cannot be interpreted, including missing values.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # Not directly numeric: fall through to the text codes below.
        pass

    # Missing values (None / NaN) are not strings and have no .strip();
    # report them as unknown instead of raising AttributeError.
    if not isinstance(value, str):
        return None

    mapping = {
        "below21": 20,
        "50plus": 55
    }
    # Lower-casing keeps this consistent with convert_age_categorical.
    return mapping.get(value.strip().lower(), None)


# Build both encodings of `age` side by side so they can be compared later.
for new_column, converter in (
    ('age_numeric', convert_age_numeric),
    ('age_group', convert_age_categorical),
):
    data[new_column] = data['age'].apply(converter)


We will apply similar engineering to the income feature, which has the following values: "$37500 - $49999, $62500 - $74999, $12500 - $24999, $75000 - $87499, $50000 - $62499, $25000 - $37499, $100000 or More, $87500 - $99999, Less than $12500". 
As we can see, these brackets have an obvious numeric order, so we will again try two encodings: categorical and numeric.

We are going to drop former age and income columns after engineering new ones.

In [157]:
def convert_income_numeric(value):
    """Convert an ``income`` bracket label to a representative number.

    Parameters
    ----------
    value : object
        A bracket label such as ``"$25000 - $37499"``; surrounding whitespace
        is ignored.

    Returns
    -------
    int, float or None
        The midpoint of a closed bracket, 6250 for ``"Less than $12500"``,
        110000 for ``"$100000 or More"``, or ``None`` when the label is not
        recognised or the value is missing / not a string.
    """
    # Missing values (None / NaN) are not strings; treat them like any other
    # unrecognised label instead of failing on .strip().
    if not isinstance(value, str):
        return None

    bracket_values = {
        "Less than $12500": 6250,
        "$12500 - $24999": (12500 + 24999) / 2,
        "$25000 - $37499": (25000 + 37499) / 2,
        "$37500 - $49999": (37500 + 49999) / 2,
        "$50000 - $62499": (50000 + 62499) / 2,
        "$62500 - $74999": (62500 + 74999) / 2,
        "$75000 - $87499": (75000 + 87499) / 2,
        "$87500 - $99999": (87500 + 99999) / 2,
        "$100000 or More": 110000,
    }
    return bracket_values.get(value.strip())
    

def convert_income_categorical(value):
    """Convert an ``income`` bracket label to a short group label.

    Parameters
    ----------
    value : object
        A bracket label such as ``"$25000 - $37499"``; surrounding whitespace
        is ignored.

    Returns
    -------
    str or None
        The short label (e.g. ``"25k-37k"``), or ``None`` when the bracket is
        not recognised or the value is missing / not a string.
    """
    # Missing values (None / NaN) have no .strip(); return None for them just
    # like for any unrecognised label.
    if not isinstance(value, str):
        return None

    mapping = {
        "Less than $12500": "Under 12.5k",
        "$12500 - $24999": "12.5k-25k",
        "$25000 - $37499": "25k-37k",
        "$37500 - $49999": "37k-49k",
        "$50000 - $62499": "50k-62k",
        "$62500 - $74999": "62k-74k",
        "$75000 - $87499": "75k-87k",
        "$87500 - $99999": "87k-99k",
        "$100000 or More": "100k+"
    }
    return mapping.get(value.strip(), None)

# Derive both income encodings from the raw column.
income_raw = data['income']
data['income_numeric'] = income_raw.apply(convert_income_numeric)
data['income_group'] = income_raw.apply(convert_income_categorical)

# The raw `age` and `income` columns are replaced by the engineered ones.
data = data.drop(columns=['age', 'income'])

### Cyclical encoding

For the time feature, we will use cyclical encoding because it has a circular structure.

In [158]:
import numpy as np
import pandas as pd

# NOTE(review): presumably the distinct labels found in the `time` column; this
# list is not referenced anywhere in the visible part of the notebook.
times = ["2PM", "10AM", "6PM", "7AM", "10PM"]

def convert_time_to_hour(time):
    """Return the 24-hour clock hour for a 12-hour label such as ``"2PM"``."""
    parsed = pd.to_datetime(time, format='%I%p')
    return parsed.hour

data['hour'] = data['time'].apply(convert_time_to_hour)

# Place each hour on a 24-hour circle and keep its sine/cosine coordinates.
hour_angle = 2 * np.pi * data['hour'] / 24
data['hour_sin'] = np.sin(hour_angle)
data['hour_cos'] = np.cos(hour_angle)

# The text column is no longer needed once `hour` has been derived.
data = data.drop(columns='time')

### Bias, Skew, Correlation

In [None]:
skew_features = ['temperature', 'age_numeric', 'income_numeric']
corr_features = ['temperature', 'income_numeric', 'age_numeric', 'hour_sin', 'hour_cos']

# Skew: one value per numeric feature.
print("Skew")
feature_skew = data[skew_features].skew()
print(feature_skew)

# Correlation: pairwise correlation matrix of the numeric features.
print("Correlation")
corr_matrix = data[corr_features].corr()
print(corr_matrix)


Modifications on features based on skew and correlation. 

We have a moderate negative (left) skew for temperature and a moderate positive (right) skew for age_numeric, so we will use the Box-Cox transformer, which automatically handles both left and right skewness. Box-Cox works only on strictly positive values, which is the case for both of our features. 

And based on the correlation matrix our features look good, no strong correlation between variables so we can keep all of them for now.

In [160]:
from sklearn.preprocessing import PowerTransformer

skewed_features_to_transform = ['temperature', 'age_numeric']

# NOTE(review): the transformer is fitted on every row of `data`, i.e. before
# the train/test split further down — confirm this is acceptable.
pt = PowerTransformer(method='box-cox')
box_cox_values = pt.fit_transform(data[skewed_features_to_transform])

# Work on a copy so that `data` keeps the untransformed values.
data_transformed = data.copy()
data_transformed[skewed_features_to_transform] = box_cox_values


Now let's look at the bias.

In [None]:
# Bias
# Bias: categorical / binary features whose class balance we want to inspect.
bias_features = [
    'gender', 'maritalStatus', 'occupation', 'passanger', 'destination',
    'coupon', 'weather', 'education', 'income_group', 'age_group',
    'has_children', 'toCoupon_GEQ5min', 'toCoupon_GEQ15min',
    'toCoupon_GEQ25min', 'direction_same',
]

print("Bias")
for col in bias_features:
    # Relative frequency of each category, shown as a one-column table.
    shares = data[col].value_counts(normalize=True)
    share_table = shares.rename("proportion").to_frame()

    print(f"\n--- Value Distribution for '{col}' ---")
    print(share_table)



One of the imbalanced features is marital status. We will therefore merge some of the less represented groups into one, so that they carry more statistical weight and add less noise to the data; this will help to avoid overfitting in our models.

We can say the same about education and occupation, so we will do the same re-grouping for them. Occupation also has high cardinality, so re-grouping will help with that issue as well. 

We can also see that two of our features, weather and direction_same, have very high bias. We decided to keep these variables as they are, because in our opinion they may hold important information and we don't want to lose it at an early stage. 

Lastly, we will drop toCoupon_GEQ25min, because 88% of its values are 0, and toCoupon_GEQ5min, because all of its values are the same. These features would be redundant in our analysis, and we still have toCoupon_GEQ15min, which is balanced and can tell us whether the driving distance affects the decision or not.

In [162]:
# Remove the two distance flags discussed above; `columns=` is the keyword form of axis=1.
data_transformed = data_transformed.drop(columns=['toCoupon_GEQ25min', 'toCoupon_GEQ5min'])


# marital status
def group_marital_status(x):
    """Merge "Divorced" and "Widowed" into "Previously Married"; return any other value unchanged."""
    return "Previously Married" if x in ("Divorced", "Widowed") else x


# Keep the raw column and store the merged categories in a new one.
marital_status_raw = data_transformed['maritalStatus']
data_transformed['marital_status_group'] = marital_status_raw.apply(group_marital_status)



# education
def group_education(x):
    """Collapse the education levels below a bachelor's degree into two groups.

    The two high-school levels become "High School or Less", the associate's
    and unfinished-college levels become "Some College", and any other value
    is returned unchanged.
    """
    high_school_levels = ("Some High School", "High School Graduate")
    some_college_levels = ("Associates degree", "Some college - no degree")

    if x in high_school_levels:
        return "High School or Less"
    if x in some_college_levels:
        return "Some College"
    return x


data_transformed['education_group'] = data_transformed['education'].apply(group_education)

# Show the share of each merged education group.
education_shares = data_transformed['education_group'].value_counts(normalize=True)
print(education_shares.rename("proportion").to_frame())

                                        proportion
education_group                                   
Some College                              0.433933
Bachelors degree                          0.341769
Graduate degree (Masters or Doctorate)    0.146011
High School or Less                       0.078288


### Encoding ( One-hot and Ordinal )

For categorical data like destination, passanger, weather, coupon, gender, marital status, occupation we will use one-hot encoding, which is good for nominal data, when there is no order/ranking between the categories. 

In [107]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Nominal (unordered) features to one-hot encode. `marital_status_group` is the
# merged version of `maritalStatus` built during pre-processing; the raw column
# was previously encoded here, which left the merged one unused.
nominal_cat_features = ['destination', 'passanger', 'weather', 'coupon', 'gender', 'marital_status_group', 'occupation']

seed = 7

# Split the pre-processed frame rather than the raw `data`: `data_transformed`
# carries the Box-Cox transformed columns, the dropped toCoupon_* flags and the
# regrouped categories, all of which were discarded when splitting `data`.
X = data_transformed.drop('Y', axis=1)
y = data_transformed['Y']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)


# handle_unknown='ignore' keeps transform() from failing on categories that
# were not present when the encoder was fitted.
one_hot_encoder = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])


For the rest of the categorical data we will use a different method of encoding because these are considered ordinal data, where categories have order/ranking. Some of these features also have missing values, so we will replace them with the most frequent value of the category.

We also give the order of the categories for the processor to know the correct order and give correct importance.


In [108]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Shared low-to-high ordering of the five visit-frequency features.
visit_frequency_order = ['never', 'less1', '1~3', '4~8', 'gt8']

# Ordinal feature -> its categories listed from lowest to highest rank.
ordinal_spec = {
    'expiration': ['2h', '1d'],
    'education': [
        'Some High School',
        'High School Graduate',
        'Some college - no degree',
        'Associates degree',
        'Bachelors degree',
        'Graduate degree (Masters or Doctorate)',
    ],
    'Bar': visit_frequency_order,
    'CoffeeHouse': visit_frequency_order,
    'CarryAway': visit_frequency_order,
    'RestaurantLessThan20': visit_frequency_order,
    'Restaurant20To50': visit_frequency_order,
}

ordinal_cat_features = list(ordinal_spec)
# One independent list per feature, in the same order as the feature names.
ordinal_categories = [list(levels) for levels in ordinal_spec.values()]

# Fill missing entries with the most frequent category, then encode by rank.
ordinal_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
]
ordinal_encoder = Pipeline(steps=ordinal_steps)

# NOTE(review): only the nominal and ordinal columns are listed here; with
# ColumnTransformer's default `remainder` setting the other columns are not
# passed through — confirm that is intended at this stage.
preprocessor = ColumnTransformer(transformers=[
    ('nom', one_hot_encoder, nominal_cat_features),
    ('ord', ordinal_encoder, ordinal_cat_features),
])

X_processed = preprocessor.fit_transform(X_train)
