In [5]:
import pandas as pd
import numpy as np
from scipy.stats import mode, entropy
from datetime import datetime, timedelta
from collections import defaultdict
import time
import json
import os
from pprint import pprint

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler
from sklearn.utils import shuffle
from sklearn.svm import LinearSVC, SVC
from sklearn.metrics import confusion_matrix, fbeta_score, precision_score, recall_score, accuracy_score, roc_auc_score, average_precision_score

#from pyspark import SparkConf, SparkContext
#from pyspark.sql import SQLContext, Row, DataFrameWriter
# from pyspark.ml.classification import LogisticRegression

pd.set_option('display.max_columns', None)
os.chdir('/Users/meif/Desktop/SI 699')
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

% matplotlib inline 

## 0. Prepare Dataset

In [6]:
# continuous and categorical
mains = ["user_coupon", "user_id", "coupon_id", "start_time", "is_used"]

categorical = ['sex_1', 'sex_2', 
               'age_60', 'age_70', 'age_80', 'age_90', 'age_0', 
               'city1', 'city2', 'city3', 'city4', 'city5', 
               'AppVerLast_2.1', 'AppVerLast_2.2', 'AppVerLast_2.3', 'AppVerLast_2.4', 'AppVerLast_2.5', 'AppVerLast_2.7', 'AppVerLast_2.8',
               'covers_mon', 'covers_tue', 'covers_wed', 'covers_thu', 'covers_fri', 'covers_sat', 'covers_sun', 
               'type1', 'type6', 
               'Complaints', 'Eventsoperation', 'NewUserCouponPackageByBD', 'PreUserCouponCode', 'RecallUserDaily', 'home201603222253', 
               'home_dongbeiguan', 'home_jiangzhecai', 'home_muqinjie', 'home_xiangcaiguan', 'preuser', 'shareuser', 
               '商家拒单返券', '家厨发券', '活动赠券', '码兑券', '自运营赠券', '蒲公英受邀',
               'CoupUseLast']

conitnuous = ['kitchen_entropy', 
              'distance_median', 'distance_std',
              'user_longitude_median', 'user_longitude_std', 'user_latitude_median', 'user_latitude_std', 
              'coupon_effective_days', 'money', 'max_money', 
              'WeeklyCouponUsedCount', "BiWeeklyCouponUsedCount",
              'WeeklyOrderCount', 'BiWeeklyOrderCount',
              'coupon_usage_rate', 'order_coupon_usage_rate',
              'coupon_type1_usage_rate', 'coupon_type6_usage_rate',
              'coupon_used_weekend_perc', 'order_weekend_perc', 
              'worth_money_median', 'worth_money_std', 
              'InterCoup', 'InterOrder', 'Recency']

# take_log= ['Recency_log', 'InterCoup_log', 'InterOrder_log']

In [4]:
trainset = pd.read_csv("Dataset/trainset_180314.csv").iloc[:,1:]
testset = pd.read_csv("Dataset/testset_180314.csv").iloc[:,1:]
print(len(trainset) + len(testset))
print(trainset.columns[5:].tolist())

5484528
['sex_1', 'sex_2', 'age_60', 'age_70', 'age_80', 'age_90', 'age_0', 'city1', 'city2', 'city3', 'city4', 'city5', 'AppVerLast_2.1', 'AppVerLast_2.2', 'AppVerLast_2.3', 'AppVerLast_2.4', 'AppVerLast_2.5', 'AppVerLast_2.7', 'AppVerLast_2.8', 'kitchen_entropy', 'distance_median', 'distance_std', 'user_longitude_median', 'user_longitude_std', 'user_latitude_median', 'user_latitude_std', 'coupon_effective_days', 'money', 'max_money', 'covers_mon', 'covers_tue', 'covers_wed', 'covers_thu', 'covers_fri', 'covers_sat', 'covers_sun', 'type1', 'type6', 'Complaints', 'Eventsoperation', 'NewUserCouponPackageByBD', 'PreUserCouponCode', 'RecallUserDaily', 'home201603222253', 'home_dongbeiguan', 'home_jiangzhecai', 'home_muqinjie', 'home_xiangcaiguan', 'preuser', 'shareuser', '\xe5\x95\x86\xe5\xae\xb6\xe6\x8b\x92\xe5\x8d\x95\xe8\xbf\x94\xe5\x88\xb8', '\xe5\xae\xb6\xe5\x8e\xa8\xe5\x8f\x91\xe5\x88\xb8', '\xe6\xb4\xbb\xe5\x8a\xa8\xe8\xb5\xa0\xe5\x88\xb8', '\xe7\xa0\x81\xe5\x85\x91\xe5\x88\xb8', '

In [36]:
# initialization: CV, SCALER, KERNEL
CV = "3" # 3
if CV == "1":
    cv = [0]
elif CV == "2":
    cv = [3]
elif CV == "3":
    cv = [0,1,2,3]

SCALER = "1" # 1
if SCALER == "1":
    scaler = MinMaxScaler()
elif SCALER == "2":
    scaler = StandardScaler()
elif SCALER == "3":
    scaler = MaxAbsScaler()
elif SCALER == "4":
    scaler = RobustScaler()

KERNEL = "rbf"  # or rbf, poly
if KERNEL == "linear":
    C = [0.01, 0.1, 1, 10, 100]
    G = [0]
else:
    C = [0.01, 0.1, 1, 10, 100] #  A low C makes the decision surface smooth, while a high C select more samples as support vectors
    G = [0.01, 0.1, 1, 10] # low values meaning far and high values meaning close

BALANCE = int("2") # 2

RANDOM_STATE = 42

In [6]:
# scaling
X_train_continuous = scaler.fit_transform(trainset[conitnuous])
X_test_continuous = scaler.transform(testset[conitnuous])

trainset_scaled = pd.concat([trainset.loc[:,mains + categorical], pd.DataFrame(X_train_continuous, columns = conitnuous)], axis=1)
testset_scaled = pd.concat([testset.loc[:,mains + categorical], pd.DataFrame(X_test_continuous, columns = conitnuous)], axis=1)

In [7]:
# split train & dev
split_date1 = "2016-04-15"
split_date2 = "2016-04-22"
split_date3 = "2016-04-29"
split_date4 = "2016-05-06"

trainset1 = trainset_scaled[trainset_scaled["start_time"] <= split_date1]
devset1 = trainset_scaled[(trainset_scaled["start_time"] > split_date1) & (trainset_scaled["start_time"] <= split_date2)]

trainset2 = trainset_scaled[trainset_scaled["start_time"] <= split_date2]
devset2 = trainset_scaled[(trainset_scaled["start_time"] > split_date2) & (trainset_scaled["start_time"] <= split_date3)]

trainset3 = trainset_scaled[trainset_scaled["start_time"] <= split_date3]
devset3 = trainset_scaled[(trainset_scaled["start_time"] > split_date3) & (trainset_scaled["start_time"] <= split_date4)]

trainset4 = trainset_scaled[trainset_scaled["start_time"] <= split_date4]
devset4 = trainset_scaled[trainset_scaled["start_time"] > split_date4]

In [8]:
# shuffle trainset
trainset1 = trainset1.iloc[shuffle(trainset1.index).tolist(),]
trainset2 = trainset2.iloc[shuffle(trainset2.index).tolist(),]
trainset3 = trainset3.iloc[shuffle(trainset3.index).tolist(),]
trainset4 = trainset4.iloc[shuffle(trainset4.index).tolist(),]

In [9]:
trainsets = [trainset1, trainset2, trainset3, trainset4]
devsets = [devset1, devset2, devset3, devset4]

X_trains, y_trains, X_devs, y_devs = [], [], [], []
for i in trainsets:
    X_trains.append(i[i.columns[5:]])
    y_trains.append(i["is_used"])
for i in devsets:
    X_devs.append(i[i.columns[5:]])
    y_devs.append(i["is_used"])

In [None]:
trainset_all = trainset_scaled
trainset_all = trainset_all.iloc[shuffle(trainset_all.index).tolist(),]
X_train_all = trainset_all.iloc[:,5:]
y_train_all = trainset_all["is_used"]

X_test = testset_scaled.iloc[:,5:]
y_test = testset_scaled["is_used"]

## 1. SVM

In [9]:
trainset.columns

Index([u'user_coupon', u'user_id', u'coupon_id', u'start_time', u'is_used',
       u'sex_1', u'sex_2', u'age_60', u'age_70', u'age_80', u'age_90',
       u'age_0', u'city1', u'city2', u'city3', u'city4', u'city5',
       u'AppVerLast_2.1', u'AppVerLast_2.2', u'AppVerLast_2.3',
       u'AppVerLast_2.4', u'AppVerLast_2.5', u'AppVerLast_2.7',
       u'AppVerLast_2.8', u'kitchen_entropy', u'distance_median',
       u'distance_std', u'user_longitude_median', u'user_longitude_std',
       u'user_latitude_median', u'user_latitude_std', u'coupon_effective_days',
       u'money', u'max_money', u'covers_mon', u'covers_tue', u'covers_wed',
       u'covers_thu', u'covers_fri', u'covers_sat', u'covers_sun', u'type1',
       u'type6', u'Complaints', u'Eventsoperation',
       u'NewUserCouponPackageByBD', u'PreUserCouponCode', u'RecallUserDaily',
       u'home201603222253', u'home_dongbeiguan', u'home_jiangzhecai',
       u'home_muqinjie', u'home_xiangcaiguan', u'preuser', u'shareuser',
       u'商家拒单

In [118]:
res_svm = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: list))))
res_svm["KERNEL"] = KERNEL
res_svm["SCALER"] = SCALER
res_svm["BALANCE"] = BALANCE

evaluations = ["F05", "Precision", "Recall", "Mean_Pre", "AUC", "Accuracy"]
for ev in evaluations:
    for c in C:
        for g in G:
            res_svm[ev][str(c)][str(g)] = []
res_svm

defaultdict(<function __main__.<lambda>>,
            {'AUC': defaultdict(<function __main__.<lambda>>,
                         {'0.01': defaultdict(<function __main__.<lambda>>,
                                      {'0.01': [],
                                       '0.1': [],
                                       '1': [],
                                       '10': []}),
                          '0.1': defaultdict(<function __main__.<lambda>>,
                                      {'0.01': [],
                                       '0.1': [],
                                       '1': [],
                                       '10': []}),
                          '1': defaultdict(<function __main__.<lambda>>,
                                      {'0.01': [],
                                       '0.1': [],
                                       '1': [],
                                       '10': []}),
                          '10': defaultdict(<function __main__.<lambda>>

In [None]:
# train
start_time = time.time()

for c in C:
    start_time2 = time.time()
    for g in G:
        start_time3 = time.time()
            
        for n in cv:
            if KERNEL == "linear":
                # 5 hours
                svc = LinearSVC(C=c, 
                                class_weight={1: BALANCE},
                                random_state=RANDOM_STATE)
            else:
                svc = SVC(C=c, gamma=g, 
                          kernel = KERNEL,
                          class_weight={1: BALANCE},
                          random_state=RANDOM_STATE)
                
            svc.fit(X_trains[n], y_trains[n])
            y_pred = svc.predict(X_devs[n])
            y_dev = y_devs[n]

            print("K: {}, CV: {}, C: {}, G: {}".format(KERNEL, n, c, g))
            print(confusion_matrix(y_dev, y_pred, labels=[1,0]))
            
            f05 = fbeta_score(y_dev, y_pred, beta=0.5, labels=[1,0])
            precision = precision_score(y_dev, y_pred, labels=[1,0])
            recall = recall_score(y_dev, y_pred, labels=[1,0])
            mp = average_precision_score(y_dev, y_pred)
            auc = roc_auc_score(y_dev, y_pred)
            acc = accuracy_score(y_dev, y_pred)
            evaluations_res = [f05, precision, recall, mp, auc, acc]
            
            for i in range(len(evaluations)):
                print("{}: {}".format(evaluations[i], evaluations_res[i]))
                res_svm[evaluations[i]][str(c)][str(g)].append(evaluations_res[i])
            print("\n")
                
        print("Finished c {} g {} in {} sec\n".format(c, g, time.time() - start_time3))
            
    print("Finished c {} in {} sec\n".format(c, time.time() - start_time2))
        
print("{} sec\n".format(time.time() - start_time))

In [79]:
# average cv results
for ev in evaluations:
    for c in res_svm[ev]:
        res_svm[ev][c] = {g:np.mean(res_svm[ev][c][g]) for g in res_svm[ev][c]}

In [159]:
# save param output
with open('ParamResults/KernalSVM/json/res_svm_{}_{}_1v{}.json'.format(KERNEL, SCALER, BALANCE), 'w') as f:
    json.dump(res_svm, f)

# 2. Evaluation

In [None]:
# load params
res_svm_rbf_minmax_1v2 = json.load(open("ParamResults/KernalSVM/json/res_svm_rbf_1_1v2.json"))
res_svm_rbf_maxabs_1v2 = json.load(open("ParamResults/KernalSVM/json/res_svm_rbf_3_1v2.json"))
res_svm_poly_minmax_1v2 = json.load(open("ParamResults/KernalSVM/json/res_svm_poly_1_1v2.json"))
res_svm_poly_maxabs_1v2 = json.load(open("ParamResults/KernalSVM/json/res_svm_poly_3_1v2.json"))
# pprint(res_svm_poly_maxabs_1v2)

evaluations = ["F05", "Precision", "Recall", "Mean_Pre", "AUC", "Accuracy"]
C = ["0.01", "0.1", "1", "10", "100"]
G = ["0.01", "0.1", "1", "10"]
cs = [float(j) for i in [[c]*len(G) for c in C] for j in i]
gs = [float(i) for i in G*len(C)]

plot_svm_rbf_minmax_1v2 = {ev: [list(res_svm_rbf_minmax_1v2[ev][c].values()) for c in C] for ev in evaluations}
plot_svm_rbf_maxabs_1v2 = {ev: [list(res_svm_rbf_maxabs_1v2[ev][c].values()) for c in C] for ev in evaluations}
plot_svm_poly_minmax_1v2 = {ev: [list(res_svm_poly_minmax_1v2[ev][c].values()) for c in C] for ev in evaluations}
plot_svm_poly_maxabs_1v2 = {ev: [list(res_svm_poly_maxabs_1v2[ev][c].values()) for c in C] for ev in evaluations}

In [None]:
# plot
for i in range(len(evaluations)):
    fig = plt.figure(i + 1)
    
    plt.title(evaluations[i] + " Score", y=1.14, fontweight="bold")
    plt.grid(True, linestyle='--', color='#CCCCCC')
    
    ax = fig.add_subplot(111, projection='3d')
    
    ax_1 = plot_svm_rbf_minmax_1v2[evaluations[i]]
    ax_2 = plot_svm_rbf_maxabs_1v2[evaluations[i]]
    ax_3 = plot_svm_poly_minmax_1v2[evaluations[i]]
    ax_4 = plot_svm_poly_maxabs_1v2[evaluations[i]]
    
    all_scores = [j for i in ax_1 for j in i] + [j for i in ax_2 for j in i] + [j for i in ax_3 for j in i] + [j for i in ax_4 for j in i]
    maxIdx = np.argmax(np.array(all_scores))
    maxScore = max(all_scores)
    minScore = min(all_scores)
    
    ax.set_xlabel('C')
    ax.set_ylabel('gamma')
    ax.set_zlabel('Score')
    ax.set_zlim(minScore*0.999, maxScore*1.003)

    ax.scatter(cs, gs, [j for i in ax1_1 for j in i], label="svm_rbf_minmax_1v2", c='pink', marker='o')
    ax.scatter(cs, gs, [j for i in ax1_2 for j in i], label="svm_rbf_maxabs_1v2", c='red', marker='o')
    ax.scatter(cs, gs, [j for i in ax1_3 for j in i], label="svm_poly_minmax_1v2", c='#6699CC', marker='^')
    ax.scatter(cs, gs, [j for i in ax1_4 for j in i], label="svm_poly_maxabs_1v2", c='#6666FF', marker='^')
    ax.legend(bbox_to_anchor=(0.7, 0.2), loc=2, borderaxespad=0.)

    ax.text(maxIdx//5, maxIdx%5, maxScore*1.0005, 
            'Max: {0:.4f}'.format(maxScore)) 

    plt.savefig('SVM_{}.png'.format(evaluations[i]))
    plt.show()

In [2]:
# retrain using selected params
c, g, k, b, rs = 0.05, 0.01, 'rbf', 2, 42
svm_best = SVC(C=c, gamma=g, 
               kernel = k,
               class_weight={1: BALANCE},
               random_state=RANDOM_STATE)
svm_best.fit(X_train_all, y_train_all)

In [117]:
# attributes
# svm_best.coef_

In [None]:
# predict
y_pred = svm_best.predict(X_test)
print(confusion_matrix(y_test, y_pred, labels=[1,0]))
print("F05: {}".format(fbeta_score(y_test, y_pred, beta=0.5, labels=[1,0])))
print("Precision: {}".format(precision_score(y_test, y_pred, labels=[1,0])))
print("Recall: {}".format(recall_score(y_test, y_pred, labels=[1,0])))
print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))