In [62]:
import pandas as pd
from tools.add_datepart import add_datepart
from functools import reduce # to join multiple tables 

In [53]:
def discard_feature(table, feature_keyword):
    """Collect the column names of ``table`` that contain any given keyword.

    Args:
        table: Object exposing a ``columns`` iterable of column names
            (a pandas DataFrame everywhere it is called in this notebook).
        feature_keyword: Iterable of substrings to search for in each
            column name.

    Returns:
        list: The matching column names in column order, each listed once.
    """
    matched = []
    for name in table.columns:
        # A repeated column name is reported only once.
        if name in matched:
            continue
        if any(keyword in name for keyword in feature_keyword):
            matched.append(name)
    return matched

def feature_transform(input_table, features=None):
    """One-hot encode categorical columns and drop the original columns.

    Each encoded column ``c`` is replaced by indicator columns named
    ``"<c>_<value>"``, appended after the remaining columns in the order the
    features are listed. ``input_table`` itself is left unmodified.

    Args:
        input_table: pandas DataFrame containing every column in ``features``.
        features: Optional iterable of column names to encode. Defaults to
            ``["source_1", "source_2", "unit", "area", "sub_line"]``, the
            set this function originally hard-coded.

    Returns:
        pandas.DataFrame: A new frame with the encoded columns expanded.

    Raises:
        KeyError: If a requested column is missing from ``input_table``.
    """
    if features is None:
        features = ["source_1", "source_2", "unit", "area", "sub_line"]
    else:
        features = list(features)

    # prefix=col makes get_dummies name every indicator "<col>_<value>",
    # which is what the manual renaming used to do.
    encoded = [pd.get_dummies(input_table[col], prefix=col) for col in features]

    # drop() returns a new frame, so the caller's table is never touched.
    remaining = input_table.drop(columns=features)
    return pd.concat([remaining] + encoded, axis=1)

In [54]:
# Directory holding the raw lookup tables.
data_dir = "dataset/"

# Lookup tables that later cells merge onto the train/test rows.
group_table, order_table, airline_table = (
    pd.read_csv(data_dir + file_name)
    for file_name in ("group.csv", "order.csv", "new_airline.csv")
)

# The train/test order lists are read from the working directory,
# not from data_dir.
train_set, test_set = (
    pd.read_csv(file_name)
    for file_name in ("training-set.csv", "testing-set.csv")
)

In [74]:
# Re-read the raw train/test order lists from the working directory,
# replacing whatever `train_set` / `test_set` currently hold.
train_set, test_set = map(pd.read_csv, ("training-set.csv", "testing-set.csv"))

In [55]:
# Derive the departure quarter and a per-day price before the raw date is expanded.
group_table["begin_date"] = pd.to_datetime(group_table["begin_date"])
group_table["begin_quarter"] = group_table["begin_date"].dt.quarter
group_table["day_price"] = group_table["price"] / group_table["days"]
# Project-local helper; it expands begin_date into the begin_* date-part columns.
add_datepart(group_table, 'begin_date')

# Convert the string codes to integer codes using the WHOLE trailing number.
# Taking only the last character (int(x[-1])) would map a multi-digit code
# such as "..._12" onto 2 and silently merge distinct categories.
# NOTE(review): assumes every code ends in its numeric id - confirm against the raw CSV.
for col in ["sub_line", "area"]:
    group_table[col] = (
        group_table[col].str.extract(r"(\d+)$", expand=False).astype("int64")
    )
group_table.drop(columns=discard_feature(group_table, ["Year", "Is"]),
                 inplace=True) # drop begin_Year, because they are all 2017
group_table.head()

Unnamed: 0,group_id,sub_line,area,days,price,product_name,promotion_prog,begin_quarter,day_price,begin_Month,begin_Week,begin_Day,begin_Dayofweek,begin_Dayofyear,begin_Elapsed
0,63695,1,1,12,106900,最高省8000》大美西～夏威夷、優勝美地、西峽天空步道、聖地牙哥、環球影城(哈利波特)12日,&lt;B&gt;[GG]《行銷活動》[//]&lt;/B&gt;&lt;font face...,1,8908.333333,3,12,22,2,81,1490140800
1,53481,1,1,12,112900,《玩樂369》旗艦大美西12日～升等五星、優勝美地、西峽天空步道、卡利哥鬼鎮、環球影城大美西12日,&lt;B&gt;[GG]《行銷活動》[//]&lt;/B&gt;&lt;font face...,2,9408.333333,5,20,17,2,137,1494979200
2,54305,1,1,12,115900,《玩樂369》旗艦大美西12日～升等五星、優勝美地、西峽天空步道、卡利哥鬼鎮、環球影城大美西12日,&lt;B&gt;[GG]《行銷活動》[//]&lt;/B&gt;&lt;font face...,2,9658.333333,6,25,21,2,172,1498003200
3,41766,1,1,12,126900,《玩樂369》旗艦大美西12日～升等五星、優勝美地、西峽天空步道、卡利哥鬼鎮、環球影城大美西12日,&lt;B&gt;[GG]《行銷活動》[//]&lt;/B&gt;&lt;font face...,3,10575.0,7,28,12,2,193,1499817600
4,32196,1,1,12,126900,《玩樂369》旗艦大美西12日～升等五星、優勝美地、西峽天空步道、卡利哥鬼鎮、環球影城大美西12日,&lt;B&gt;[GG]《行銷活動》[//]&lt;/B&gt;&lt;font face...,3,10575.0,8,32,9,2,221,1502236800


In [56]:
# Parse the order date once; the quarter is taken before add_datepart
# expands the column into order_* date parts.
order_table["order_date"] = pd.to_datetime(order_table["order_date"])
order_table["order_quarter"] = order_table["order_date"].dt.quarter

# Convert the string codes to integer codes using the WHOLE trailing number.
# Taking only the last character (int(x[-1])) would map a multi-digit code
# such as "..._12" onto 2 and silently merge distinct categories.
# NOTE(review): assumes every code ends in its numeric id - confirm against the raw CSV.
for col in ["source_1", "source_2", "unit"]:
    order_table[col] = (
        order_table[col].str.extract(r"(\d+)$", expand=False).astype("int64")
    )

# process the datetime feature (order_date is already datetime at this point,
# so the second pd.to_datetime call was redundant and has been removed)
add_datepart(order_table, "order_date")
order_table.drop(columns=discard_feature(order_table, ["Is"]),
                 inplace=True)
# show examples
order_table.head()

Unnamed: 0,order_id,group_id,source_1,source_2,unit,people_amount,order_quarter,order_Year,order_Month,order_Week,order_Day,order_Dayofweek,order_Dayofyear,order_Elapsed
0,136100,63695,1,1,1,2,1,2017,1,1,5,3,5,1483574400
1,140370,63695,1,1,2,1,4,2016,11,45,9,2,314,1478649600
2,211009,63695,1,1,3,2,1,2017,1,3,20,4,20,1484870400
3,41571,53481,1,1,4,4,1,2017,3,11,14,1,73,1489449600
4,64125,53481,2,1,5,1,1,2017,3,13,27,0,86,1490572800


In [57]:
# Every column after the first holds a "<date> <hour>:<minute>" string.
# Break each one into a date string plus numeric hour and minute columns,
# named after the first two "_"-separated tokens of the source column,
# and remove the raw column afterwards.
for raw_col in airline_table.columns[1:]:
    name_parts = raw_col.split("_")
    prefix = name_parts[0] + "_" + name_parts[1]
    date_and_time = airline_table[raw_col].str.split(" ", expand=True)
    clock = date_and_time[1].str.split(":", expand=True)
    airline_table[prefix + "_" + "date"] = date_and_time[0]
    airline_table[prefix + "_" + "hour"] = pd.to_numeric(clock[0])
    airline_table[prefix + "_" + "minute"] = pd.to_numeric(clock[1])
    airline_table.drop(columns=[raw_col], inplace=True)

# The date strings must be real datetimes before add_datepart
# (the fastai-style helper) can expand them into date-part columns.
date_columns = [name for name in airline_table.columns[1:] if "date" in name]
for date_col in date_columns:
    airline_table[date_col] = pd.to_datetime(airline_table[date_col])
    add_datepart(airline_table, date_col)

# Remove the *Year* and *Is* columns; the original author notes that
# every flight year is 2017.
unused_columns = discard_feature(airline_table, ["Year", "Is"])
airline_table.drop(columns=unused_columns, inplace=True)
# Preview the result.
airline_table.head()

Unnamed: 0,group_id,abroad_fly_hour,abroad_fly_minute,abroad_arrive_hour,abroad_arrive_minute,home_fly_hour,home_fly_minute,home_arrive_hour,home_arrive_minute,abroad_fly_Month,...,home_fly_Day,home_fly_Dayofweek,home_fly_Dayofyear,home_fly_Elapsed,home_arrive_Month,home_arrive_Week,home_arrive_Day,home_arrive_Dayofweek,home_arrive_Dayofyear,home_arrive_Elapsed
0,2,17,0,20,25,18,25,20,45,11,...,27,0,331,1511740800,11,48,27,0,331,1511740800
1,3,17,50,20,30,11,55,13,55,1,...,22,6,22,1485043200,1,3,22,6,22,1485043200
2,4,14,40,18,30,19,30,21,50,7,...,1,1,213,1501545600,8,31,1,1,213,1501545600
3,7,10,55,14,40,15,50,19,20,5,...,3,5,154,1496448000,6,22,3,5,154,1496448000
4,9,18,10,21,35,21,5,22,55,1,...,16,0,16,1484524800,1,3,16,0,16,1484524800


In [60]:
# Folder that receives the processed CSV files.
processed_datadir = data_dir + "new_data/"

# Each processed table is written as "new_<name>" without the index column.
tables_to_export = (
    ("group.csv", group_table),
    ("order.csv", order_table),
    ("airline.csv", airline_table),
)
for table_name, table in tables_to_export:
    output_path = processed_datadir + "new_" + table_name
    table.to_csv(output_path, encoding="utf-8", index=False)

In [75]:
# Start from the raw training file and attach the order features by order_id.
train_set = pd.read_csv("training-set.csv").merge(order_table, on="order_id", how="left")

# Left-join the group and airline tables in turn, both on group_id.
dfs = [train_set, group_table, airline_table]
train_set = dfs[0]
for right_table in dfs[1:]:
    train_set = train_set.merge(right_table, on="group_id", how="left")

# Remove the two text columns, then save the merged frame.
train_set = train_set.drop(columns=["product_name", "promotion_prog"])
train_set.to_csv(processed_datadir + "training_set.csv", encoding="utf-8", index=False)

In [77]:
# Display the column index of the merged training frame.
train_set.columns

Index(['order_id', 'deal_or_not', 'group_id', 'source_1', 'source_2', 'unit',
       'people_amount', 'order_quarter', 'order_Year', 'order_Month',
       'order_Week', 'order_Day', 'order_Dayofweek', 'order_Dayofyear',
       'order_Elapsed', 'sub_line', 'area', 'days', 'price', 'begin_quarter',
       'day_price', 'begin_Month', 'begin_Week', 'begin_Day',
       'begin_Dayofweek', 'begin_Dayofyear', 'begin_Elapsed',
       'abroad_fly_hour', 'abroad_fly_minute', 'abroad_arrive_hour',
       'abroad_arrive_minute', 'home_fly_hour', 'home_fly_minute',
       'home_arrive_hour', 'home_arrive_minute', 'abroad_fly_Month',
       'abroad_fly_Week', 'abroad_fly_Day', 'abroad_fly_Dayofweek',
       'abroad_fly_Dayofyear', 'abroad_fly_Elapsed', 'abroad_arrive_Month',
       'abroad_arrive_Week', 'abroad_arrive_Day', 'abroad_arrive_Dayofweek',
       'abroad_arrive_Dayofyear', 'abroad_arrive_Elapsed', 'home_fly_Month',
       'home_fly_Week', 'home_fly_Day', 'home_fly_Dayofweek',
       'home_

In [84]:
# Columns that are categorical by name, plus every training column whose
# name contains one of the keywords below.
categorical_feature = ["source_1", "source_2", "unit", "area", "sub_line"]
categorical_keywords = ["quarter", "Dayofweek"]
for column_name in train_set.columns:
    has_keyword = any(keyword in column_name for keyword in categorical_keywords)
    if has_keyword and column_name not in categorical_feature:
        categorical_feature.append(column_name)

# newline="" disables newline translation, so the explicit "\r\n" is
# written exactly as given: one feature name per CRLF-terminated line.
with open(processed_datadir + "categorical_feature.txt", "w", encoding="utf-8", newline="") as f:
    f.writelines(feature + "\r\n" for feature in categorical_feature)

In [76]:
# Start from the raw testing file and attach the order features by order_id.
test_set = pd.read_csv("testing-set.csv").merge(order_table, on="order_id", how="left")

# Left-join the group and airline tables in turn, both on group_id.
dfs = [test_set, group_table, airline_table]
test_set = dfs[0]
for right_table in dfs[1:]:
    test_set = test_set.merge(right_table, on="group_id", how="left")

# NOTE(review): product_name / promotion_prog are kept here, while the
# training cell drops them before saving - confirm the mismatch is intended.
test_set.to_csv(processed_datadir + "testing_set.csv", encoding="utf-8", index=False)