# Loading the data

In [35]:
import pandas as pd

# Location of the raw coupon-recommendation CSV, relative to the notebook.
DATA_PATH = "data/in-vehicle-coupon-recommendation.csv"

# Read the whole file into a DataFrame; later cells derive features from it.
data = pd.read_csv(DATA_PATH)

# Preparing the data

One of our features, `age`, has mixed values: "21, 46, 26, 31, 41, 50plus, 36, below21". For that reason we will engineer two new columns from it: one converts the values to numbers and the other bins them into categories. We will then see which of the two works better for our models.

In [None]:
import matplotlib.pyplot as plt

def map_age(value):
    """Bucket a raw ``age`` value into a coarse age-group label.

    Parameters
    ----------
    value : Any
        Raw cell value, e.g. ``"21"``, ``46``, ``"50plus"`` or ``"below21"``.

    Returns
    -------
    str
        One of ``"<21"``, ``"21-30"``, ``"31-40"``, ``"41-50"``, ``"51+"``,
        or ``"Unknown"`` when the value is neither convertible to ``int`` nor
        one of the two recognised text labels (matched case-insensitively).
    """
    try:
        age = int(value)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are handled (non-numeric text, None, NaN,
        # infinity). A bare ``except`` would also swallow KeyboardInterrupt
        # and SystemExit, hiding interrupts during a long ``apply``.
        label = str(value).lower()
        if label == "below21":
            return "<21"
        if label == "50plus":
            return "51+"
        return "Unknown"

    # Upper bounds are inclusive: 30 -> "21-30", 40 -> "31-40", 50 -> "41-50".
    if age < 21:
        return "<21"
    if age <= 30:
        return "21-30"
    if age <= 40:
        return "31-40"
    if age <= 50:
        return "41-50"
    return "51+"
        

def convert_age_numeric(value):
    """Convert a raw ``age`` value to an integer.

    Parameters
    ----------
    value : Any
        Raw cell value, e.g. ``"21"``, ``46``, ``"50plus"`` or ``"below21"``.

    Returns
    -------
    int or None
        ``int(value)`` when the conversion succeeds. The open-ended text
        labels are mapped to fixed stand-in ages: ``"below21"`` -> 20 and
        ``"50plus"`` -> 55 (matched case-insensitively). Anything else
        yields ``None``.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are handled (non-numeric text, None, NaN,
        # infinity). A bare ``except`` would also swallow KeyboardInterrupt
        # and SystemExit.
        label = str(value).lower()

    if label == "below21":
        return 20
    if label == "50plus":
        return 55
    return None


# Derive both age encodings from the raw `age` column, then discard the raw
# column together with `car`. The result is rebound to `data`, so this cell
# expects `data` to still contain `age` and `car` when it runs.
data = data.assign(
    age_numeric=lambda frame: frame['age'].apply(convert_age_numeric),
    age_group=lambda frame: frame['age'].apply(map_age),
).drop(columns=['car', 'age'])



For nominal categorical features such as `destination`, `passanger`, `weather`, `coupon`, `gender`, `maritalStatus` and `occupation` we will use one-hot encoding, which is well suited to nominal data, i.e. data where there is no order/ranking between the categories.

We drop the `car` feature (see the cell above), because our dataset contains a value for it in only 109 records.

In [37]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Columns treated as unordered categories; these go through one-hot encoding.
nominal_cat_features = [
    'destination',
    'passanger',
    'weather',
    'coupon',
    'gender',
    'maritalStatus',
    'occupation',
]

# Fixed seed so the train/test split is reproducible across runs.
seed = 7

# Separate the target column `Y` from the predictors.
y = data['Y']
X = data.drop(columns='Y')

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

# handle_unknown='ignore' keeps transform() from raising on categories that
# were not present when the encoder was fitted.
one_hot_encoder = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])


For the rest of the categorical features we will use ordinal encoding instead, because these are ordinal data, where the categories have an order/ranking. Some of these features also have missing values, so we will replace each missing value with the most frequent value of that feature.

We also pass the explicit order of the categories to the encoder, so that the integer codes it assigns follow the intended ranking (lowest category first).


In [38]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Level ordering shared by Bar, CoffeeHouse, CarryAway, RestaurantLessThan20
# and Restaurant20To50, listed from lowest rank to highest.
shared_levels = ['never', 'less1', '1~3', '4~8', 'gt8']

# Feature -> ordered levels (lowest first). Keeping the two together in one
# mapping guarantees the feature list and the category list stay aligned.
ordinal_levels = {
    'expiration': ['2h', '1d'],
    'education': [
        'Some High School',
        'High School Graduate',
        'Some college - no degree',
        'Associates degree',
        'Bachelors degree',
        'Graduate degree (Masters or Doctorate)',
    ],
    'Bar': list(shared_levels),
    'CoffeeHouse': list(shared_levels),
    'CarryAway': list(shared_levels),
    'RestaurantLessThan20': list(shared_levels),
    'Restaurant20To50': list(shared_levels),
}

ordinal_cat_features = list(ordinal_levels)
ordinal_categories = list(ordinal_levels.values())

# Fill missing values with the per-column mode first, then map each level to
# its position in the explicit ordering above.
ordinal_encoder = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
])

# NOTE(review): ColumnTransformer defaults to remainder='drop', so every
# column not listed in the two feature lists (including the engineered
# `age_numeric` / `age_group`) is discarded here. Confirm that is intended.
preprocessor = ColumnTransformer([
    ('nom', one_hot_encoder, nominal_cat_features),
    ('ord', ordinal_encoder, ordinal_cat_features),
])

# Fit on the training split only; the fitted preprocessor is reused later.
X_processed = preprocessor.fit_transform(X_train)
