In [1]:
from catboost import CatBoostClassifier, Pool, cv
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from xgboost import XGBClassifier
from train_val_split import train_validation_split
import ggplot as gplt
from tools.mean_encoder import *
from keras.layers import Input, Dense, Dropout
from keras.models import Model
from autoencoders_keras.vanilla_autoencoder import VanillaAutoencoder

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [28]:
# NOTE(review): this cell repeats the load done at the top of the next cell.
# Its execution count (In [28]) shows it was re-run late in the session,
# presumably to get `order_id` back after the preprocessing cell dropped it.
# On a top-to-bottom run the next cell simply overwrites both frames.
train_set = pd.read_csv("dataset/train_val/training_set.csv").assign(
    order_id=lambda frame: frame["order_id"].astype(str)
)
test_set = pd.read_csv("dataset/train_val/testing_set.csv")

In [2]:
# Load the train/test splits.
# `order_id` is cast to str on BOTH frames so that the join key has the same
# dtype as `airport_me["order_id"]` below. Casting only the training frame
# would leave the test merge joining a non-str key against a str key, which
# either matches nothing or raises, depending on the pandas version.
train_set = pd.read_csv("dataset/train_val/training_set.csv")
test_set = pd.read_csv("dataset/train_val/testing_set.csv")
train_set["order_id"] = train_set["order_id"].astype(str)
test_set["order_id"] = test_set["order_id"].astype(str)


# Left-join the columns of airport_mean_encoding.csv onto both splits by
# order_id; rows without a match keep NaN in the joined columns.
airport_me = pd.read_csv("./feature_extraction/airport_mean_encoding.csv")
airport_me["order_id"] = airport_me["order_id"].astype(str)
train_set = pd.merge(train_set, airport_me, on="order_id", how="left")
test_set = pd.merge(test_set, airport_me, on="order_id", how="left")


# Every column whose name contains "time_dist" is discarded below.
drop_col = [name for name in train_set.columns if "time_dist" in name]

# Keep the target aside before it is removed from the feature frame.
y_train = train_set["deal_or_not"]

# Remove the target/id columns plus the "time_dist" columns, then fill the
# remaining NaNs with the column means of the same frame.
# NOTE(review): the test frame is imputed with its own means, not the
# training means - confirm this is intended.
train_set = train_set.drop(columns=["deal_or_not", "group_id", "order_id"] + drop_col)
train_set = train_set.fillna(train_set.mean())
test_set = test_set.drop(columns=["deal_or_not", "group_id", "order_id"] + drop_col)
test_set = test_set.fillna(test_set.mean())
    

# Stratified, shuffled 5-fold splitter with a fixed seed; passed to
# `mean_encoder` below.
# NOTE(review): this rebinds `cv`, shadowing the `cv` imported from catboost
# in the first cell.
cv = StratifiedKFold(5, shuffle=True, random_state=851206)
#mean_enc_col = ["source1_unit", "area", 'source1_source2', 'source1_source2_unit',
#                'source2_unit'] # this column is the feature with the highest catboost feature importance
mean_enc_col = ["source1_unit"]
#mean_enc_col = list(train_set.columns[cat_feature])
# `mean_encoder` and `test_set_encoder` are not defined in this notebook;
# presumably they come from the `tools.mean_encoder` star import. Their exact
# behaviour (e.g. which columns they add) cannot be verified from here.
train_set = mean_encoder(train_set, y_train, mean_enc_col, "deal_or_not", cv)
test_set = test_set_encoder(train_set, y_train, test_set, mean_enc_col, "deal_or_not")

# Columns removed from both frames at this point (rebinds `drop_col`).
drop_col = [
    'source1_source2', 'source1_unit', 'source2_unit', 'source1_source2_unit',
    'order_month', 'order_quarter', 'subline_area',
    'begin_month', 'begin_quarter',
    'abroad_airport', 'home_airport',
    'abroad_hour', 'abroad_part_of_day', 'abroad_DoY', 'abroad_DoW', 'abroad_DoM',
    'home_DoW', 'home_DoM', 'home_DoY', 'home_hour', 'home_part_of_day',
]
train_set = train_set.drop(columns=drop_col)
test_set = test_set.drop(columns=drop_col)

#and "DoY" not in col
# Positional indices (into train_set.columns) of the columns treated as
# categorical: the name must contain at least one keyword and none of the
# excluded substrings. Indices come out in ascending order, each at most once.
key_words = ["source", "unit", "sub", "area", "order", "begin", "_airport",
             "abroad", "home"]
cat_feature = [
    position
    for position, column in enumerate(train_set.columns)
    if any(kw in column for kw in key_words)
    and not any(word in column for word in ("duration", "DoY", "target", "accum"))
]

            
#train_x.drop(columns=mean_enc_col, inplace=True)
#val_x.drop(columns=mean_enc_col, inplace=True)

  interactivity=interactivity, compiler=compiler, result=result)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  x_val[colname] = means


In [3]:
# Stack train on top of test so both splits get the same dummy columns later.
# `ignore_index=True` renumbers the rows 0..n-1 directly, instead of building
# a Python list of every row number and assigning it as the index afterwards.
whole_set = pd.concat([train_set, test_set], ignore_index=True)

In [4]:
def dummy_rep(table, cat_feature, prefix=False):
    """Replace categorical columns of ``table`` with one-hot indicator columns.

    Parameters
    ----------
    table : pandas.DataFrame
        Source frame; it is not modified.
    cat_feature : sequence of column labels (list, pandas.Index, ...)
        Columns to one-hot encode. May be empty.
    prefix : bool, default False
        When False (the original behaviour) each indicator column is named
        after the category value only, so two features that share a value
        produce duplicate column labels. When True each indicator is named
        ``"<column>_<value>"``, which keeps the labels unique.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns of ``table`` followed by the indicator
        columns of each feature, in the order given by ``cat_feature``.
    """
    cat_cols = list(cat_feature)
    if not cat_cols:
        # Nothing to encode: the original indexed cat_feature[0] and crashed.
        return table.copy()

    dummies = [
        pd.get_dummies(table[col], prefix=str(col) if prefix else None)
        for col in cat_cols
    ]
    # One concat for all pieces instead of re-concatenating inside the loop.
    return pd.concat([table.drop(columns=cat_cols)] + dummies, axis=1)

In [5]:
# `cat_feature` holds positions computed against train_set.columns, so resolve
# them to column NAMES on train_set. Indexing whole_set.columns by position
# would pick the wrong columns if the concat changed the column order.
whole_dummy = dummy_rep(whole_set, train_set.columns[cat_feature])

# whole_set was built as train rows followed by test rows; split it back.
train_dummy = whole_dummy.iloc[:len(train_set)]
test_dummy = whole_dummy.iloc[len(train_set):]

In [6]:
# Learn per-column mean/std on the training rows only, then apply the same
# transformation to both splits.
scaler = StandardScaler().fit(train_dummy)
train_dummy_std = scaler.transform(train_dummy)
test_dummy_std = scaler.transform(test_dummy)

In [50]:
# Display (n_rows, n_columns) of the standardized training matrix; the column
# count is the input width the autoencoder cells below have to match.
train_dummy_std.shape

(297020, 382)

In [69]:
# Symmetric dense autoencoder: n_features -> 128 -> 64 -> 32 -> 64 -> 128 -> n_features.
# The width is read from the data rather than hard-coded, so the cell keeps
# working when the number of dummy columns changes.
n_features = train_dummy_std.shape[1]
input_layer = Input(shape=(n_features, ))

# encoder
encoded = Dense(128, activation="relu")(input_layer)
encoded = Dense(64, activation="relu")(encoded)
encoded = Dense(32, activation="relu")(encoded)

# decoder
# The first decoder layer must consume the bottleneck (`encoded`). The original
# applied it to `decoded`, which is not defined yet at this point and raises a
# NameError on a fresh kernel.
decoded = Dense(64, activation="relu")(encoded)
decoded = Dense(128, activation="relu")(decoded)

# Linear output: the targets are standardized features (negative values and
# values above 1), which a sigmoid output bounded to (0, 1) cannot reproduce.
output_layer = Dense(n_features, activation="linear")(decoded)

autoencoder = Model(input_layer, output_layer)
autoencoder.compile(optimizer="adam", loss="mse")

In [70]:
# Train the autoencoder to reproduce its own input. The standardized test
# matrix is used only to report a validation reconstruction loss per epoch.
autoencoder.fit(
    x=train_dummy_std,
    y=train_dummy_std,
    validation_data=(test_dummy_std, test_dummy_std),
    shuffle=True,
    batch_size=256,
    epochs=50,
)

Train on 297020 samples, validate on 99895 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50

KeyboardInterrupt: 

In [17]:
# Rebinds `autoencoder` to the project-local VanillaAutoencoder wrapper,
# replacing the Keras model built earlier in the notebook.
# NOTE(review): the hyperparameter meanings are only known from the argument
# names - confirm against autoencoders_keras.vanilla_autoencoder.
autoencoder = VanillaAutoencoder(
    n_feat=train_dummy_std.shape[1],
    encoding_dim=50,
    encoder_layers=3,
    decoder_layers=3,
    n_hidden_units=100,
    denoising=None,
    n_epoch=100,
    batch_size=250,
)

In [18]:
# Fit the VanillaAutoencoder on the standardized training matrix.
# NOTE(review): `y_train` is passed as the second argument; how the wrapper
# uses it is not visible here. The captured log below shows the wrapper
# splitting the 297020 rows into 207914 train / 89106 validation samples.
autoencoder.fit(train_dummy_std, y_train)

Train on 207914 samples, validate on 89106 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100


VanillaAutoencoder(batch_size=250, decoder_layers=3, denoising=None,
          encoder_layers=3, encoding_dim=50, n_epoch=100, n_feat=382,
          n_hidden_units=100)

In [19]:
# Encode both splits and wrap the result in DataFrames with columns named
# ae_feat_1..ae_feat_k. k is read from the encoder output rather than
# hard-coded to 50, so the cell keeps working when `encoding_dim` changes.
# Assumes `transform` returns a 2-D array (the original did too) - confirm.
train_ae_feature = pd.DataFrame(autoencoder.transform(train_dummy_std)).rename(
    columns=lambda i: "ae_feat_" + str(i + 1)
)
test_ae_feature = pd.DataFrame(autoencoder.transform(test_dummy_std)).rename(
    columns=lambda i: "ae_feat_" + str(i + 1)
)

In [32]:
def _with_order_id(ae_feature, csv_path):
    """Return ``ae_feature`` with the ``order_id`` column of ``csv_path`` first.

    The ids are re-read from the raw CSV because the preprocessing cell drops
    ``order_id`` from ``train_set``/``test_set``; reading them back from those
    frames only worked when the load cell had been re-run out of order.
    Rows are matched by position, so the CSV must have exactly one row per
    feature row. Calling this again on its own output is safe: an existing
    ``order_id`` column is replaced, not duplicated.

    Raises
    ------
    ValueError
        If the CSV and ``ae_feature`` have different numbers of rows.
    """
    order_id = pd.read_csv(csv_path, usecols=["order_id"])["order_id"].astype(str)
    if len(order_id) != len(ae_feature):
        raise ValueError(
            "row count mismatch between %s (%d) and the autoencoder features (%d)"
            % (csv_path, len(order_id), len(ae_feature))
        )
    features = ae_feature.drop(columns=["order_id"], errors="ignore")
    # .values bypasses index alignment: rows correspond by position.
    features.insert(0, "order_id", order_id.values)
    return features


train_ae_feature = _with_order_id(train_ae_feature, "dataset/train_val/training_set.csv")
test_ae_feature = _with_order_id(test_ae_feature, "dataset/train_val/testing_set.csv")

In [34]:
# Write both feature tables to the working directory as UTF-8 CSV files
# without the row index.
for frame, path in (
    (train_ae_feature, "train_ae_feature.csv"),
    (test_ae_feature, "test_ae_feature.csv"),
):
    frame.to_csv(path, encoding="utf-8", index=False)