In [16]:
%store -r __importData
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from feature_extraction import AirlineFeature, GroupFeature, OrderFeature, \
                               MixingFeature, MeanEncoder

In [17]:
def categorical_feature(table, key_words):
    """Return the columns of ``table`` whose name contains any key word.

    Matching is plain substring containment. The result keeps the column
    order of ``table.columns`` and lists each column name at most once.

    Args:
        table: object exposing a ``columns`` iterable of column names.
        key_words: iterable of substrings to look for, e.g.
            ``["year", "month", "DoW", "DoM", "sub_line", "area", "unit",
            "source_1", "source_2", "hour", "airport", "consistent",
            "quarter", "part_of_day"]``.

    Returns:
        list of matching column names.
    """
    def _has_key_word(column_name):
        return any(word in column_name for word in key_words)

    # dict.fromkeys removes repeated names while preserving first-seen order.
    return list(dict.fromkeys(name for name in table.columns if _has_key_word(name)))


def feature_transform(input_table, features):
    """One-hot encode the given columns and drop the originals.

    Each column in ``features`` is expanded with ``pd.get_dummies``. The new
    columns are named ``<column>_<level>``; the level is rendered through
    ``int()`` unless the column name contains ``"part"``, in which case the
    raw level is used as-is.

    Args:
        input_table: source DataFrame (left untouched, a copy is modified).
        features: list of column names to encode.

    Returns:
        A new DataFrame with the indicator columns appended and the
        original ``features`` columns removed.
    """
    encoded = input_table.copy()
    for name in features:
        dummies = pd.get_dummies(encoded[name])
        keep_raw_level = "part" in name
        dummies.columns = [
            name + "_" + str(level if keep_raw_level else int(level))
            for level in dummies.columns
        ]
        encoded = pd.concat([encoded, dummies], axis=1)
    return encoded.drop(columns=features)

def feature_transform2(input_table, features):
    """Label-encode the given columns on a copy of ``input_table``.

    Every column in ``features`` is cast to ``str`` and replaced by the
    integer codes produced by a freshly fitted ``LabelEncoder``.

    Args:
        input_table: source DataFrame (not modified).
        features: iterable of column names to encode.

    Returns:
        A copy of ``input_table`` with the listed columns replaced by codes.
    """
    encoded = input_table.copy()
    for name in features:
        as_text = encoded[name].astype(str)
        encoded[name] = LabelEncoder().fit_transform(as_text)
    return encoded

def feature_transform3(input_table, features):
    """Frequency-encode the given columns on a copy of ``input_table``.

    Each value in a listed column is replaced by the number of times that
    value occurs in the column. Missing values are not counted by
    ``value_counts`` and therefore stay missing after the mapping.

    Args:
        input_table: source DataFrame (not modified).
        features: iterable of column names to encode.

    Returns:
        A copy of ``input_table`` with the listed columns replaced by counts.
    """
    table = input_table.copy()
    for feature in features:
        # value_counts() is already a value -> count lookup, so map it
        # directly instead of rebuilding a dict through positional
        # ``df.iloc[i][0]`` access (integer keys on a label-indexed Series
        # are deprecated as positional lookups).
        counts = table[feature].value_counts()
        table[feature] = table[feature].map(counts)
    return table

def impact_coding(data, feature, target='y', n_folds=5, n_inner_folds=2,
                  random_state=None):
    '''
    Out-of-fold impact (target-mean) encoding of one categorical column.

    The rows are split into ``n_folds`` outer folds. For each outer fold the
    in-fold rows are split again into ``n_inner_folds`` inner folds, the mean
    of ``target`` per category is computed on every inner training part, and
    the out-of-fold rows are encoded with the average of those inner means.
    Categories missing from an inner training part fall back to the in-fold
    target mean; categories missing from a whole outer fold fall back to the
    global target mean.

    Args:
        data: DataFrame containing ``feature`` and ``target``.
        feature: name of the categorical column to encode.
        target: name of the target column.
        n_folds: number of outer folds.
        n_inner_folds: number of inner folds per outer fold.
        random_state: forwarded to both shuffled ``KFold`` splitters; leave
            as ``None`` for a different split on every call, or pass an int
            for reproducible encodings.

    Returns:
        (impact_coded, mapping, default) where ``impact_coded`` is a Series
        of encoded values indexed like the rows of ``data`` (in fold order),
        ``mapping`` is a Series category -> mean over all folds, and
        ``default`` is the global target mean.
    '''
    oof_default_mean = data[target].mean()  # Global mean, the last-resort fallback
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    impact_coded_parts = []  # encoded values of each outer out-of-fold slice
    fold_mappings = []       # category -> inner means table of each outer fold

    for infold, oof in kf.split(data[feature]):
        infold_data = data.iloc[infold]
        oof_default_inner_mean = infold_data[target].mean()
        kf_inner = KFold(n_splits=n_inner_folds, shuffle=True,
                         random_state=random_state)

        # The inner split yields positions relative to ``infold_data``, so
        # they must be applied to ``infold_data`` and not to ``data``;
        # indexing ``data`` with them would pull in held-out rows and leak
        # their targets into the encoding.
        inner_means = [
            infold_data.iloc[infold_inner].groupby(by=feature)[target].mean()
            for infold_inner, _ in kf_inner.split(infold_data)
        ]

        # One column per inner fold; categories unseen in an inner training
        # part get the in-fold mean.
        inner_oof_mean_cv = pd.concat(
            inner_means, axis=1, keys=range(len(inner_means))
        ).fillna(oof_default_inner_mean)
        fold_mappings.append(inner_oof_mean_cv)

        # Encode the held-out rows with the average of the inner means;
        # categories never seen in this outer fold get the global mean.
        category_mean = inner_oof_mean_cv.mean(axis=1)
        impact_coded_parts.append(
            data.iloc[oof][feature].map(category_mean).fillna(oof_default_mean)
        )

    # Series.append was removed in pandas 2.0; gather the parts and
    # concatenate once instead.
    impact_coded = pd.concat(impact_coded_parts)
    oof_mean_cv = pd.concat(
        fold_mappings, axis=1, keys=range(len(fold_mappings))
    ).fillna(oof_default_mean)

    return impact_coded, oof_mean_cv.mean(axis=1), oof_default_mean

def train_validation_split(train_set, train_y, split_index=146620):
    """Split features and targets into training and validation parts by position.

    Rows before ``split_index`` form the training part and the remaining
    rows form the validation part; no shuffling is performed.

    Args:
        train_set: DataFrame of features (sliced with ``.iloc``).
        train_y: targets aligned with ``train_set`` (sliced with ``[:]``).
        split_index: position of the first validation row. Defaults to
            146620, the boundary previously hard-coded here.

    Returns:
        (train_data, validation_data, train_target, validation_target)
    """
    train_data = train_set.iloc[:split_index]
    train_target = train_y[:split_index]
    validation_data = train_set.iloc[split_index:]
    validation_target = train_y[split_index:]
    return (train_data, validation_data, train_target, validation_target)


def mean_encoding(train, test, enc_feature, target="deal_or_not"):
    """Add impact-encoded versions of the given columns to train and test.

    For every column ``f`` in ``enc_feature`` a new column
    ``impact_encoded_<f>`` is added. Training rows receive the out-of-fold
    values returned by ``impact_coding``; test rows are encoded with the
    category -> mean mapping returned by the same call, falling back to the
    returned default for categories absent from the mapping.

    Args:
        train: training DataFrame containing ``target`` (not modified).
        test: test DataFrame (not modified).
        enc_feature: iterable of column names to encode.
        target: name of the target column in ``train``.

    Returns:
        (x_train, x_test): copies of the inputs with the encoded columns added.
    """
    x_train = train.copy()
    x_test = test.copy()
    for f in enc_feature:
        print("Impact coding for {}".format(f))
        encoded_col = "impact_encoded_{}".format(f)
        x_train[encoded_col], mapping, default_mean = impact_coding(x_train, f, target)
        # Vectorised lookup: categories missing from the mapping come back
        # as NaN from map() and are then replaced by the default mean.
        x_test[encoded_col] = x_test[f].map(mapping).fillna(default_mean)
    return (x_train, x_test)

In [23]:
__importData

import group_table
import airline_table


  if self.run_code(code, result):


import order_table
import train_set
import test_set


In [19]:
# Build the per-order feature table `main_table` from the three raw tables.
# NOTE(review): group_table / order_table / airline_table are overwritten
# with their processed versions, so re-running this cell feeds already
# processed tables back into the feature helpers.
# The same list object is handed to every helper and finally used as the
# set of columns to drop — presumably the helpers append to it; TODO confirm.
drop_feature = []
group_table = GroupFeature.group_feature(group_table, drop_feature)
order_table = OrderFeature.order_feature(order_table, drop_feature)
airline_table = AirlineFeature.airline_feature(airline_table, drop_feature)
# Left joins keep every row of order_table; group and airline features are
# attached through the shared "group_id" key.
main_table = pd.merge(order_table, group_table, on="group_id", how="left")
main_table = pd.merge(main_table, airline_table, on="group_id", how="left")
main_table = MixingFeature.mixing_feature(main_table, drop_feature)
# Remove whatever columns ended up in drop_feature.
main_table.drop(columns=drop_feature, inplace=True)

In [11]:
# Attach the engineered features to the labelled orders (left join on
# "order_id" keeps every row of train_set).
train_set2 = pd.merge(train_set, main_table, on="order_id", how="left")
# Target column; it is not removed from train_set2, so it is still present
# in x_train / x_val below.
train_y = train_set2["deal_or_not"]
# Positional split into training and validation parts (no shuffling).
x_train, x_val, y_train, y_val = train_validation_split(train_set2, train_y)
# Same feature join for the test orders.
test_set2 = pd.merge(test_set, main_table, on="order_id", how="left")

In [15]:
# Write the training, validation and testing frames to CSV (UTF-8, no index
# column), in that order.
_split_outputs = {
    "dataset/train_val/training_set.csv": x_train,
    "dataset/train_val/validation_set.csv": x_val,
    "dataset/train_val/testing_set.csv": test_set2,
}
for _csv_path, _split_frame in _split_outputs.items():
    _split_frame.to_csv(_csv_path, encoding="utf-8", index=False)