In [1]:
import calendar # to turn month abbrev into integer
import os # to create the processed-data directory before writing CSVs
from functools import reduce # to join multiple tables 

import pandas as pd
from fastai.imports import *

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
def _week_of_year(fld):
    """Return the ISO week number of every timestamp in the datetime Series `fld`."""
    if hasattr(fld.dt, 'isocalendar'):
        # `Series.dt.week` was removed in pandas 2.0; isocalendar() is its replacement.
        week = fld.dt.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; convert back to the plain
        # numpy dtypes that the legacy accessor produced (float only when values are missing).
        return week.astype(np.float64) if week.isna().any() else week.astype(np.int64)
    # Older pandas releases without isocalendar() still expose the legacy accessor.
    return fld.dt.week


def add_datepart(df, fldname, drop=True, time=False):
    """add_datepart converts a column of df from a datetime64 to many columns containing
    the information from the date. This applies changes inplace and returns None.
    Parameters:
    -----------
    df: A pandas data frame. df gain several new columns.
    fldname: A string that is the name of the date column you wish to expand.
        If it is not a datetime64 series, it will be converted to one with pd.to_datetime.
        A trailing 'date'/'Date' is stripped from the name to build the new column prefix.
    drop: If true then the original date column will be removed.
    time: If true time features: Hour, Minute, Second will be added.
    Notes:
    ------
    The `Elapsed` column holds whole seconds since 1970-01-01 (UTC for tz-aware input).
    It is computed by subtraction, so it does not depend on the column's datetime
    resolution, and missing timestamps (NaT) become NaN instead of a bogus integer.
    Examples:
    ---------
    >>> df = pd.DataFrame({ 'A' : pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']) })
    >>> df
        A
    0   2000-03-11
    1   2000-03-12
    2   2000-03-13
    >>> add_datepart(df, 'A')
    >>> df
        AYear AMonth AWeek ADay ADayofweek ADayofyear AIs_month_end AIs_month_start AIs_quarter_end AIs_quarter_start AIs_year_end AIs_year_start AElapsed
    0   2000  3      10    11   5          71         False         False           False           False             False        False          952732800
    1   2000  3      10    12   6          72         False         False           False           False             False        False          952819200
    2   2000  3      11    13   0          73         False         False           False           False             False        False          952905600
    """
    fld = df[fldname]
    # Public dtype check that accepts both naive and timezone-aware datetime columns.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated; pandas infers the format on its own.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr:
        if n == 'Week':
            df[targ_pre + n] = _week_of_year(fld)
        else:
            df[targ_pre + n] = getattr(fld.dt, n.lower())

    # Seconds since the Unix epoch. The epoch is moved into the column's own timezone
    # so the subtraction is accepted by every pandas version for tz-aware data.
    epoch = pd.Timestamp('1970-01-01')
    tz = fld.dt.tz
    if tz is not None:
        epoch = epoch.tz_localize('UTC').tz_convert(tz)
    df[targ_pre + 'Elapsed'] = (fld - epoch) // pd.Timedelta(seconds=1)
    if drop: df.drop(fldname, axis=1, inplace=True)

In [7]:
# Folder that holds the raw lookup tables.
data_dir = "dataset/"


def _read_raw_table(csv_name):
    """Read one CSV file located directly under `data_dir`."""
    return pd.read_csv(data_dir + csv_name)


group_table = _read_raw_table("group.csv")
airline_table = _read_raw_table("airline.csv")
order_table = _read_raw_table("order.csv")
dayschedule_table = _read_raw_table("day_schedule.csv")

# The train/test files are read from the working directory, not from data_dir.
train_set = pd.read_csv(filepath_or_buffer="training-set.csv")
test_set = pd.read_csv(filepath_or_buffer="testing-set.csv")

In [8]:
# convert them to standard datetime type so that I can use the tool fastai.add_datepart()
# Parse begin_date into datetimes, then expand it into date-part columns
# (add_datepart drops the original column by default).
group_table["begin_date"] = pd.to_datetime(group_table["begin_date"])
add_datepart(group_table, "begin_date")

# Replace each label with the integer value of its final character.
# NOTE(review): only the last character is kept, so this assumes single-digit
# suffixes -- confirm against the raw data.
for col in ("sub_line", "area"):
    group_table[col] = [int(label[-1]) for label in group_table[col]]

In [9]:
# convert them to integer code
# Turn the label columns into integer codes taken from the last character of each label.
# NOTE(review): assumes every label ends in a single digit -- confirm against the raw data.
for col in ("source_1", "source_2", "unit"):
    order_table[col] = [int(raw_label[-1]) for raw_label in order_table[col]]

# Parse order_date, then expand it into date-part columns.
order_table["order_date"] = pd.to_datetime(order_table["order_date"])
add_datepart(order_table, "order_date")

In [53]:
"""
# convert airline_table to a table contains "abroad_fly_time", "abroad_arrive_time", "home_fly_time", "home_arrive_time"
group_id_set = set(airline_table.group_id)
new_col_name = ["group_id", "abroad_fly_time", "abroad_arrive_time","home_fly_time", "home_arrive_time"]
#airline_table.drop(columns=["go_back", "src_airport", "dst_airport"], inplace=True)
tmp_data = list(map(lambda x: airline_table[airline_table.group_id == x].iloc[0], group_id_set)) # go abroad
new_airline_table = reduce(lambda x, y: pd.concat([x, y.to_frame().T]), tmp_data[1:], tmp_data[0].to_frame().T)
tmp_data = list(map(lambda x: airline_table[airline_table.group_id == x].iloc[-1], group_id_set)) # go home
tmp_table = reduce(lambda x, y: pd.concat([x, y.to_frame().T]), tmp_data[1:], tmp_data[0].to_frame().T)
new_airline_table = pd.DataFrame.merge(new_airline_table, tmp_table, on="group_id")
new_airline_table.columns = new_col_name
new_airline_table.to_csv("dataset/new_data/new_airline.csv", encoding="utf-8", index=False)
"""

KeyboardInterrupt: 

In [55]:
# split the [date time] into date and time
# then split the time into hour and minute
# Every column after the first is split on a space into a date part and a time part;
# the time part is split on ":" into numeric hour and minute columns, and the
# original column is then removed.
# NOTE(review): the new names reuse the first two "_"-separated tokens of each column
# name, so this presumably expects names like "abroad_fly_time" -- confirm that the
# table loaded into airline_table has that layout.
for col in airline_table.columns[1:]:
    name_tokens = col.split("_")
    prefix = f"{name_tokens[0]}_{name_tokens[1]}"
    date_and_clock = airline_table[col].str.split(" ", expand=True)
    airline_table[f"{prefix}_date"] = date_and_clock[0]
    clock_fields = date_and_clock[1].str.split(":", expand=True)
    airline_table[f"{prefix}_hour"] = pd.to_numeric(clock_fields[0])
    airline_table[f"{prefix}_minute"] = pd.to_numeric(clock_fields[1])
    airline_table.drop(col, axis=1, inplace=True)

In [57]:
# convert them to standard datetime type so that I can use the tool fastai.add_datepart()
# Pick the date columns up front (the first column is skipped), then parse each one
# and expand it into date-part columns with add_datepart().
date_columns = [name for name in airline_table.columns[1:] if "date" in name]
for date_col in date_columns:
    airline_table[date_col] = pd.to_datetime(airline_table[date_col])
    add_datepart(airline_table, date_col)

# show examples
airline_table.head()

Unnamed: 0,group_id,abroad_fly_hour,abroad_fly_minute,abroad_arrive_hour,abroad_arrive_minute,home_fly_hour,home_fly_minute,home_arrive_hour,home_arrive_minute,abroad_fly_Year,...,home_arrive_Day,home_arrive_Dayofweek,home_arrive_Dayofyear,home_arrive_Is_month_end,home_arrive_Is_month_start,home_arrive_Is_quarter_end,home_arrive_Is_quarter_start,home_arrive_Is_year_end,home_arrive_Is_year_start,home_arrive_Elapsed
0,2,17,0,20,25,18,25,20,45,2017,...,27,0,331,False,False,False,False,False,False,1511740800
1,3,17,50,20,30,11,55,13,55,2017,...,22,6,22,False,False,False,False,False,False,1485043200
2,4,14,40,18,30,19,30,21,50,2017,...,1,1,213,False,True,False,False,False,False,1501545600
3,7,10,55,14,40,15,50,19,20,2017,...,3,5,154,False,False,False,False,False,False,1496448000
4,9,18,10,21,35,21,5,22,55,2017,...,16,0,16,False,False,False,False,False,False,1484524800


In [58]:
# Define the output folder here: in notebook order `processed_datadir` is first assigned
# in a later cell, so a fresh "Restart & Run All" would raise NameError on this line.
processed_datadir = data_dir + "new_data" + "/"
# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(processed_datadir, exist_ok=True)
airline_table.to_csv(processed_datadir + "new_airline.csv", encoding="utf-8", index=False)

In [16]:
# Folder that receives the preprocessed tables.
processed_datadir = data_dir + "new_data" + "/"
# to_csv does not create missing folders, so make sure the target exists before writing.
os.makedirs(processed_datadir, exist_ok=True)
# Save each preprocessed table as "new_<original file name>".
for table_name, table in zip(["group.csv", "order.csv", "airline.csv"], [group_table, order_table, airline_table]):
    table.to_csv(processed_datadir + "new_" + table_name, encoding="utf-8", index=False)

In [12]:
# Reload the train/test files (the same two files are also read in the data-loading cell).
train_set = pd.read_csv(filepath_or_buffer="training-set.csv")
test_set = pd.read_csv(filepath_or_buffer="testing-set.csv")

In [59]:
# Attach the order features to every training row, then left-join the group and
# airline features on group_id, one table after the other.
main_table = train_set.merge(order_table, how="left", on="order_id")
dfs = [main_table, group_table, airline_table]
for lookup_table in dfs[1:]:
    main_table = main_table.merge(lookup_table, how="left", on="group_id")
# These two columns are removed from the merged table.
main_table = main_table.drop(columns=["product_name", "promotion_prog"])

In [61]:
# Same joins as for the training rows: order features first, then the group and
# airline features via left joins on group_id.
test_table = test_set.merge(order_table, how="left", on="order_id")
dfs = [test_table, group_table, airline_table]
for lookup_table in dfs[1:]:
    test_table = test_table.merge(lookup_table, how="left", on="group_id")
# These two columns are removed from the merged table.
test_table = test_table.drop(columns=["product_name", "promotion_prog"])

In [62]:
# Write the merged training and testing tables into the processed-data folder.
for merged_frame, csv_name in ((main_table, "training_set.csv"), (test_table, "testing_set.csv")):
    merged_frame.to_csv(processed_datadir + csv_name, index=False, encoding="utf-8")