In [1]:
import os
import numpy as np
import pandas as pd
from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

DATA_ROOT = "../../data/ml100marathon-02-01/"

In [2]:
df_train = pd.read_csv(os.path.join(DATA_ROOT,'train_offline.csv'))
df_test  = pd.read_csv(os.path.join(DATA_ROOT,'test_offline.csv'))
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date
0,1439408,2632,,,0.0,,20160217.0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,
2,1439408,2632,1078.0,20:1,0.0,20160319.0,
3,1832624,3381,7610.0,200:20,0.0,20160429.0,
4,2029232,3381,11951.0,200:20,1.0,20160129.0,


In [3]:
# create target label
"""
According to the definition, 
1) buy with coupon within (include) 15 days ==> 1
2) buy with coupon but out of 15 days ==> 0
3) buy without coupon ==> -1 (we don't care)
"""
def label(row):
    if np.isnan(row['Date_received']):
        return -1
    if not np.isnan(row['Date']):
        td = pd.to_datetime(row['Date'], format='%Y%m%d') -  pd.to_datetime(row['Date_received'], format='%Y%m%d')
        if td <= pd.Timedelta(15, 'D'):
            return 1
    return 0

df_train["label"] = df_train.apply(label, axis=1)
df_train["label"].value_counts()

 0    710665
-1    413773
 1     36304
Name: label, dtype: int64

In [4]:
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label
0,1439408,2632,,,0.0,,20160217.0,-1
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0


In [5]:
# Generate features - weekday acquired coupon
def getWeekday(row):
    if (np.isnan(row)) or (row==-1):
        return row
    else:
        # add one to make it from 0~6 -> 1~7
        return pd.to_datetime(row, format = "%Y%m%d").dayofweek + 1

df_train['weekday'] = df_train['Date_received'].apply(getWeekday)
df_test['weekday']  = df_test['Date_received'].apply(getWeekday)

# weekday_type (weekend = 1)
df_train['weekday_type'] = df_train['weekday'].astype('str').apply(
    lambda x : 1 if x in [6,7] else 0 )
df_test['weekday_type']  = df_test['weekday'].astype('str').apply(
    lambda x : 1 if x in [6,7] else 0 )

In [6]:
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0


In [7]:
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)

tmpdf = pd.get_dummies(df_train['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
df_train[weekdaycols] = tmpdf

tmpdf = pd.get_dummies(df_test['weekday'].replace(-1, np.nan))
tmpdf.columns = weekdaycols
df_test[weekdaycols] = tmpdf

['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']


In [8]:
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,1439408,2632,,,0.0,,20160217.0,-1,,0,0,0,0,0,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,0,0,1,0,0,0,0
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0,0,0,0,0,0,1,0
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,0,0,0,0,1,0,0
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,0,0,0,0,1,0,0


In [9]:
# Generate features - coupon discount and distance
def getDiscountType(row):
    if row == 'null':
        return 'null'
    elif ':' in row:
        return 1
    else:
        return 0

def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return 1.0 - float(rows[1])/float(rows[0])
    else:
        return float(row)

def getDiscountMan(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[0])
    else:
        return 0

def getDiscountJian(row):
    if ':' in row:
        rows = row.split(':')
        return int(rows[1])
    else:
        return 0

def processData(df):
    
    # convert discunt_rate
    df['discount_rate'] = df['Discount_rate'].astype('str').apply(convertRate)
    df['discount_man'] = df['Discount_rate'].astype('str').apply(getDiscountMan)
    df['discount_jian'] = df['Discount_rate'].astype('str').apply(getDiscountJian)
    df['discount_type'] = df['Discount_rate'].astype('str').apply(getDiscountType)
    
    # convert distance
    df.loc[df.Distance.isna(), "Distance"] = df["Distance"].mean() # 99
    return df

df_train = processData(df_train)
df_test = processData(df_test)

In [10]:
df_train.head()

Unnamed: 0,User_id,Merchant_id,Coupon_id,Discount_rate,Distance,Date_received,Date,label,weekday,weekday_type,...,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,discount_rate,discount_man,discount_jian,discount_type
0,1439408,2632,,,0.0,,20160217.0,-1,,0,...,0,0,0,0,0,0,,0,0,0
1,1439408,2632,8591.0,20:1,0.0,20160217.0,,0,3.0,0,...,0,1,0,0,0,0,0.95,20,1,1
2,1439408,2632,1078.0,20:1,0.0,20160319.0,,0,6.0,0,...,0,0,0,0,1,0,0.95,20,1,1
3,1832624,3381,7610.0,200:20,0.0,20160429.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1
4,2029232,3381,11951.0,200:20,1.0,20160129.0,,0,5.0,0,...,0,0,0,1,0,0,0.9,200,20,1


## My Split Dataset

In [672]:
df_train.columns

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'label', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type'],
      dtype='object')

In [707]:
features = [#'User_id',
            'User_id_Count',
            #'Merchant_id',
            'Merchant_id_Count',
            #'Coupon_id',
            'Coupon_id_Count',
            'discount_rate','discount_man','discount_jian',#'discount_type',
            'Distance',
            'weekday_1','weekday_2', 'weekday_3','weekday_4','weekday_5','weekday_6', 'weekday_7'
           ]
target = 'label'

#
df_train_mod = df_train[df_train['label'] != -1].copy()
#df_train_mod = df_train.copy()
df_test_mod  = df_test.copy()

#
us_count_df  = df_train_mod.groupby(['User_id'])['User_id'].agg({'User_id_Count':'size'}).reset_index()
df_train_mod = pd.merge(df_train_mod, us_count_df, on=['User_id'], how='left')
us_count_df  = df_test_mod.groupby(['User_id'])['User_id'].agg({'User_id_Count':'size'}).reset_index()
df_test_mod  = pd.merge(df_test_mod,  us_count_df, on=['User_id'], how='left')

mc_count_df  = df_train_mod.groupby(['Merchant_id'])['User_id'].agg({'Merchant_id_Count':'size'}).reset_index()
df_train_mod = pd.merge(df_train_mod, mc_count_df, on=['Merchant_id'], how='left')
mc_count_df  = df_test_mod.groupby(['Merchant_id'])['User_id'].agg({'Merchant_id_Count':'size'}).reset_index()
df_test_mod  = pd.merge(df_test_mod,  mc_count_df, on=['Merchant_id'], how='left')

cp_count_df  = df_train_mod.groupby(['Coupon_id'])['User_id'].agg({'Coupon_id_Count':'size'}).reset_index()
df_train_mod = pd.merge(df_train_mod, cp_count_df, on=['Coupon_id'], how='left')
cp_count_df  = df_test_mod.groupby(['Coupon_id'])['User_id'].agg({'Coupon_id_Count':'size'}).reset_index()
df_test_mod  = pd.merge(df_test_mod,  cp_count_df, on=['Coupon_id'], how='left')

#
df_test_mod.loc[df_test_mod.Coupon_id_Count.isna(),   "Coupon_id_Count"] = 1.0
df_test_mod.loc[df_test_mod.Merchant_id_Count.isna(), "Merchant_id_Count"] = 1.0
df_test_mod.loc[df_test_mod.User_id_Count.isna(),     "User_id_Count"] = 1.0
df_test_mod.loc[df_test_mod.discount_rate.isna(),     "discount_rate"] = 1.0

#df_train_mod.loc[df_train_mod.Coupon_id_Count.isna(),   "Coupon_id_Count"] = 1.0
#df_train_mod.loc[df_train_mod.discount_rate.isna(),     "discount_rate"]   = 1.0

x_train, x_valid, y_train, y_valid = train_test_split(
    df_train_mod[features], df_train_mod[target], test_size=0.01, random_state=4)


is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version
is deprecated and will be removed in a future version


## Teature Split Dataset

In [516]:
## Naive model
def split_train_valid(row, date_cut="20160416"):
    is_train = True if pd.to_datetime(row, format="%Y%m%d") < pd.to_datetime(date_cut, format="%Y%m%d") else False
    return is_train
    
df = df_train[df_train['label'] != -1].copy()
df["is_train"] = df["Date_received"].apply(split_train_valid)
train = df[df["is_train"]]
valid = df[~df["is_train"]]
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
print("Train size: {}, #positive: {}".format(len(train), train["label"].sum()))
print("Valid size: {}, #positive: {}".format(len(valid), valid["label"].sum())) 

KeyboardInterrupt: 

In [None]:
features = ['User_id','Merchant_id','Coupon_id',
            'discount_rate','discount_man','discount_jian','discount_type',
            'Distance',
            'weekday_type',
            'weekday_1','weekday_2', 'weekday_3','weekday_4','weekday_5','weekday_6', 'weekday_7'
           ]
target = 'label'

In [None]:
x_train = train[features]
y_train = train[target]
x_valid = valid[features]
y_valid = valid[target]

## Norm

In [708]:
def norm(x):
    stats = x.describe()
    stats = stats.transpose()
    return (x - stats['mean']) / stats['std']
x_train = norm(x_train)
x_valid = norm(x_valid)

## Training

In [709]:
import tensorflow as tf
print(tf.__version__)

regularizer = tf.keras.regularizers.l2(l=0.001)

def create_model():
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(64,
                              kernel_regularizer=regularizer, 
                              input_shape=[len(features)]),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(64,
                              kernel_regularizer=regularizer,),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.ReLU(),
        tf.keras.layers.Dense(1, activation='sigmoid',
                              kernel_regularizer=regularizer,),
    ])
    return model

model = create_model()

2.0.0-beta0


In [717]:
def loss(y_true, y_pred):
    pos_mask = y_true
    neg_mask = 1 - y_true
    error = tf.square(y_pred - y_true)
    #error = tf.abs(y_pred - y_true)
    #error = tf.nn.sigmoid_cross_entropy_with_logits(y_true, y_pred)
    #loss = error
    loss = (5 * pos_mask + 1 * neg_mask) * error
    return tf.reduce_mean(loss, axis=-1)

optimizer = tf.keras.optimizers.SGD(0.01, 0.9)
#optimizer = tf.keras.optimizers.Adam(0.001)
model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

In [718]:
#pred_valid = model.predict(x_valid)
#pred_valid.shape

In [719]:
x_train_ND = x_train.to_numpy(dtype=np.float32)
x_valid_ND = x_valid.to_numpy(dtype=np.float32)
y_train_ND = y_train.to_numpy(dtype=np.float32)
y_valid_ND = y_valid.to_numpy(dtype=np.float32)
y_train_ND = np.expand_dims(y_train_ND, axis=-1)
y_valid_ND = np.expand_dims(y_valid_ND, axis=-1)

In [720]:
model.fit(x_train_ND,
          y_train_ND,
          batch_size=128,
          epochs=20)

Train on 739499 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20

KeyboardInterrupt: 

In [721]:
from sklearn.metrics import roc_auc_score, accuracy_score

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

logic_valid = model.predict(x_valid)
pred_valid = (logic_valid > 0.5).astype(np.int32)
#pred_valid = (sigmoid(logic_valid) > 0.5).astype(np.int32)

auc_score = roc_auc_score(y_true=y_valid, 
                          y_score=logic_valid)
acc = accuracy_score(y_true=y_valid, 
                     y_pred=pred_valid)
print("Validation AUC: {:.3f}, Accuracy: {:.3f}".format(auc_score, acc))

Validation AUC: 0.865, Accuracy: 0.933


In [722]:
print("Pos:\n", logic_valid[y_valid==1][:10])
print("Neg:\n", logic_valid[y_valid==0][:10])
print("\nDis:", logic_valid[y_valid==1][:20].sum() - logic_valid[y_valid==0][:20].sum())

Pos:
 [[0.6499265 ]
 [0.438892  ]
 [0.19883513]
 [0.28425753]
 [0.49823853]
 [0.6650557 ]
 [0.39293414]
 [0.5714172 ]
 [0.36244816]
 [0.33512443]]
Neg:
 [[0.06955925]
 [0.01402494]
 [0.05722174]
 [0.35696065]
 [0.02008468]
 [0.07519424]
 [0.10755134]
 [0.03084174]
 [0.5232168 ]
 [0.5495423 ]]

Dis: 4.9582405


## output test result

In [724]:
df = df_train[df_train['label'] != -1].copy()

targetset = df_test_mod.copy()
targetset = targetset[~targetset.Coupon_id.isna()]
targetset.reset_index(drop=True, inplace=True)

targetset_norm = norm(targetset[features])

In [725]:
targetset_norm.head()

Unnamed: 0,User_id_Count,Merchant_id_Count,Coupon_id_Count,discount_rate,discount_man,discount_jian,Distance,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7
0,-0.099051,0.074336,0.269607,0.36988,2.329563,1.557172,-0.3159,-0.402746,-0.389069,-0.42278,-0.393357,-0.40052,2.20953,-0.395181
1,-0.099051,-0.836758,-0.626585,1.217609,-0.589057,-0.813586,-0.661431,2.482948,-0.389069,-0.42278,-0.393357,-0.40052,-0.452584,-0.395181
2,-0.099051,-0.836758,-0.626585,1.217609,-0.589057,-0.813586,-0.661431,2.482948,-0.389069,-0.42278,-0.393357,-0.40052,-0.452584,-0.395181
3,-0.099051,1.23588,0.915844,0.030788,-0.364548,-0.314479,-0.661431,2.482948,-0.389069,-0.42278,-0.393357,-0.40052,-0.452584,-0.395181
4,-0.099051,-0.835405,-0.625541,1.217609,-0.589057,-0.813586,-0.661431,-0.402746,-0.389069,-0.42278,2.542211,-0.40052,-0.452584,-0.395181


In [726]:
pred_targetset = model.predict(targetset_norm)
#pred_targetset = sigmoid(model.predict(targetset_norm))
targetset['pred_prob'] = pred_targetset

In [727]:
output = targetset[["User_id", "Coupon_id", "Date_received", "pred_prob"]]
output.loc[:, "User_id"] = output["User_id"].apply(lambda x:str(int(x)))
output.loc[:, "Coupon_id"] = output["Coupon_id"].apply(lambda x:str(int(x)))
output.loc[:, "Date_received"] = output["Date_received"].apply(lambda x:str(int(x)))
output["uid"] = output[["User_id", "Coupon_id", "Date_received"]].apply(lambda x: '_'.join(x.values), axis=1)
output.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [728]:
out = output.groupby("uid", as_index=False).mean()
out = out[["uid", "pred_prob"]]
out.columns = ["uid", "label"]
out.head(10)

Unnamed: 0,uid,label
0,1000020_2705_20160519,0.103716
1,1000020_8192_20160513,0.11065
2,1000065_1455_20160527,0.311873
3,1000085_8067_20160513,0.059564
4,1000086_2418_20160613,0.011214
5,1000140_8192_20160526,0.192312
6,1000169_2418_20160606,0.01056
7,1000297_13704_20160520,0.099015
8,1000324_13165_20160526,0.06749
9,1000338_10161_20160612,0.091258


In [729]:
out.to_csv("20190615.csv", header=["uid", "label"], index=False) # submission format