# Loading the data

In [None]:
import pandas as pd

# Read the in-vehicle coupon recommendation CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/in-vehicle-coupon-recommendation.csv")

# Print the first five rows as a quick check that the file loaded.
print(data.head(n=5))

# Preparing the data


For nominal categorical features such as destination, passanger, weather, coupon, gender and marital status, we will use one-hot encoding. It is well suited to nominal data, where there is no order or ranking between the categories.

We are going to drop the feature car, because our dataset includes values of this feature for only 109 records. 


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split


# Columns treated as nominal categories and passed to the one-hot encoder.
nominal_cat_features = [
    'destination',
    'passanger',
    'weather',
    'coupon',
    'gender',
    'maritalStatus',
]

# Seed passed to train_test_split as random_state.
seed = 7

# 'Y' is the target; the feature matrix excludes both 'Y' and 'car'.
y = data['Y']
X = data.drop(columns=['car', 'Y'])

# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed,
)

# Single-step pipeline; categories not seen during fit are ignored at
# transform time instead of raising (handle_unknown='ignore').
one_hot_encoder = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))],
)

# Apply the one-hot pipeline to the nominal columns only.
preprocessor = ColumnTransformer(
    transformers=[('nom', one_hot_encoder, nominal_cat_features)],
)



For the rest of the categorical features we will use a different encoding method, because they are ordinal: their categories have an order/ranking. Some of these features also have missing values, which we will replace with the most frequent value of the corresponding feature.

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder

# Columns treated as ordinal categories.
ordinal_cat_features = [
    'expiration',
    'education',
    'Bar',
    'CoffeeHouse',
    'CarryAway',
    'RestaurantLessThan20',
    'Restaurant20To50',
]

# Impute first, then encode: missing entries are replaced with each column's
# most frequent value before the categories are mapped to integer codes.
# NOTE(review): OrdinalEncoder() with default settings derives the category
# order from the sorted unique values of each column, which is not
# necessarily the semantic ranking -- consider passing an explicit
# `categories=` list per feature once the intended order is confirmed.
ordinal_encoder = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder()),
])

# Combine both encoders in one ColumnTransformer. Rebuilding the transformer
# with only the ordinal step would drop the one-hot encoded nominal features
# from the processed matrix.
preprocessor = ColumnTransformer(transformers=[
    ('nom', one_hot_encoder, nominal_cat_features),
    ('ord', ordinal_encoder, ordinal_cat_features),
])

# Fit on the training split only, so no statistics are learned from X_test.
X_processed = preprocessor.fit_transform(X_train)
