In [1]:
import pandas as pd

# Read the train and test CSVs into `df` and `df_test`, then preview the
# first rows of the training frame.
df, df_test = (pd.read_csv(path) for path in ('../data/train.csv', '../data/test.csv'))
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [2]:
from tqdm import tqdm
from sklearn import preprocessing

# Every column except the row identifier and the label is treated as a
# categorical feature.
feature_columns = [c for c in df.columns if c not in {"id", "target"}]

# Stack train and test ONCE so each encoder sees every category that occurs
# in either file. The stacked frame does not depend on the loop variable, so
# building it inside the loop only repeats the same large copy per column.
df_all = pd.concat([df, df_test], sort=False)

# NOTE: this loop overwrites the columns of `df` and `df_test` in place.
# Re-running the cell without reloading the CSVs would re-encode the integer
# codes (as strings) instead of the raw categories.
for c in tqdm(feature_columns):
    enc = preprocessing.LabelEncoder()
    # Missing values are replaced by -1 and everything is stringified so that
    # mixed float/string columns are encoded uniformly.
    enc.fit(df_all[c].fillna(-1).astype(str))
    df[c] = enc.transform(df[c].fillna(-1).astype(str))
    df_test[c] = enc.transform(df_test[c].fillna(-1).astype(str))

# Release the stacked copy; only the encoded `df` / `df_test` are used later.
del df_all

100%|██████████| 23/23 [00:18<00:00,  1.26it/s]


In [332]:
# Preview the label-encoded training frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,1,1,1,1,1,3,5,4,6,...,28,3,1,4,3,21,57,6,6,0
1,1,2,2,1,1,2,3,4,1,0,...,2113,3,3,6,5,24,151,7,10,0
2,2,1,2,1,1,1,3,0,4,1,...,0,3,0,3,14,16,106,5,12,0
3,3,0,1,1,1,1,3,1,4,4,...,2168,1,5,5,1,3,0,3,6,0
4,4,1,0,1,2,1,3,6,4,3,...,1748,3,3,2,8,3,51,5,4,0


In [333]:
# Column groups; to_vw_format writes each group under its own VW namespace.
binary = ["bin_{}".format(i) for i in range(5)]
ordinal = ["ord_{}".format(i) for i in range(6)]
low_card = ["nom_{}".format(i) for i in range(5)]
high_card = ["nom_{}".format(i) for i in range(5, 10)]
date = ["day", "month"]

def format_features(row, cols, namespace):
    """Render one namespace section of a VW example line.

    Parameters
    ----------
    row : mapping
        Anything indexable by column name (e.g. a pandas Series or a dict).
    cols : iterable of str
        Columns to emit, in order.
    namespace : str
        Namespace name written right after the ``|``.

    Returns
    -------
    str
        ``"|<namespace> <col>_<value> ..."``. Columns whose value equals 0
        are left out, so the section may contain no features at all (the
        result is then ``"|<namespace> "``).
    """
    tokens = []
    for col in cols:
        value = row[col]
        if value != 0:
            tokens.append("{}".format(col + "_" + str(value)))
    return "|{} {}".format(namespace, " ".join(tokens))
    

def to_vw_format(df, wv_file, feature_columns):
    """Write ``df`` to ``wv_file`` as Vowpal Wabbit text, one example per row.

    Each line has the form ``[<label> ]<id>|b ... |o ... |l ... |h ... |d ...``.
    The row's ``id`` is written directly in front of the first ``|`` so it
    acts as the example tag, and the features are grouped into one namespace
    per module-level column list (``binary``, ``ordinal``, ``low_card``,
    ``high_card``, ``date``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column and the columns named in the
        module-level group lists. If it also has a ``target`` column, a label
        computed as ``2 * (target - 0.5)`` (0 -> -1.0, 1 -> 1.0) is written
        first; otherwise lines are unlabeled.
    wv_file : str
        Output path; the file is overwritten.
    feature_columns : list
        Not used by this function (the namespaces come from the module-level
        group lists). Kept so existing calls keep working.
    """
    # Whether the frame is labeled is a property of the frame, not of a row,
    # so decide it once instead of on every iteration.
    has_target = "target" in df.columns

    with open(wv_file, 'w') as f:
        # total= lets tqdm show a percentage/ETA; iterrows() has no length.
        for index, row in tqdm(df.iterrows(), total=len(df)):
            tag = row["id"]

            namespaces = [
                format_features(row, binary, "b"),
                format_features(row, ordinal, "o"),
                format_features(row, low_card, "l"),
                format_features(row, high_card, "h"),
                format_features(row, date, "d"),
            ]
            features = " ".join(namespaces)

            # Branch on the presence of a target column rather than on the
            # truthiness of the label, so a label equal to 0.0 would still be
            # written instead of silently producing an unlabeled line.
            if has_target:
                label = 2 * (row["target"] - 0.5)
                line = "{} {}{}\n".format(label, tag, features)
            else:
                line = "{}{}\n".format(tag, features)
            f.write(line)

In [334]:
# Export the complete training frame and the test frame to VW text files.
for frame, out_path in ((df, "../data/full_wv.txt"), (df_test, "../data/test_wv.txt")):
    to_vw_format(frame, out_path, feature_columns)

600000it [03:43, 2686.00it/s]
400000it [02:23, 2788.34it/s]


In [571]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation; the fixed random_state
# keeps the split the same across runs.
df_train, df_val = train_test_split(df, random_state=1, shuffle=True, test_size=0.1)

# Export both parts to VW text files.
for split_frame, split_path in [(df_train, "../data/train_wv.txt"), (df_val, "../data/val_wv.txt")]:
    to_vw_format(split_frame, split_path, feature_columns)

540000it [03:17, 2730.47it/s]
60000it [00:21, 2780.65it/s]


# Validate

In [572]:
!vw ../data/train_wv.txt -f ../data/cfec_model.vw --l2 1e-6 --holdout_period 20 -c --passes 20 --loss_function logistic --decay_learning_rate 0.7 --learning_rate 0.5 --classweight -1:0.23

using l2 regularization = 1e-06
final_regressor = ../data/cfec_model.vw
parsed 1 class weights
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.7
using cache_file = ../data/train_wv.txt.cache
ignoring text input in favor of cache input
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.517179 0.517179            5            1.2  -1.0000  -0.7010       23
0.820402 1.103903            7            2.4   1.0000  -0.9414       24
0.763249 0.711133           15            5.0   1.0000  -0.9575       24
0.779764 0.793521           31           11.0   1.0000  -0.6044       24
0.712360 0.646339           63           22.2  -1.0000  -0.7660       24
0.720013 0.727663          116           44.4   1.0000  -1.0231       24
0.695529 0.671504          229           89.6   1.0000   1.0189       24
0.675186 0.654859          465          179.3  

In [573]:
!vw -i ../data/cfec_model.vw -t ../data/val_wv.txt --link=logistic -p ../data/cfec2_val_wv.txt

only testing
predictions = ../data/cfec2_val_wv.txt
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../data/val_wv.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.355362 0.355362            1            1.0  -1.0000   0.4004       24
0.191235 0.027108            2            2.0   1.0000   0.7622       23
1.665071 3.138907            4            4.0  -1.0000   0.0561       24
1.091104 0.517137            8            8.0  -1.0000   0.4021       24
0.936203 0.781303           16           16.0  -1.0000   0.4829       24
1.183501 1.430798           32           32.0   1.0000   0.7601       24
1.457030 1.730559           64           64.0  -1.0000   0.5145       23
1.431397 1.405765          128          128.0  -1.0000   0.2567       24
1.376701 1.322005          256          256.0   1.0000   0.6880       24
1.35503

In [574]:
from sklearn.metrics import roc_auc_score

# The predictions file holds one "<probability> <id>" pair per line; attach
# the true targets by id and report ROC AUC on the held-out rows.
val_scores = pd.read_csv("../data/cfec2_val_wv.txt", sep=" ", names=["prob", "id"])
df_val_pred = val_scores.merge(df_val[["id", "target"]], on="id")
val_auc = roc_auc_score(df_val_pred["target"], df_val_pred["prob"])
print("Validation AUC:", val_auc)
df_val_pred.head(5)

Validation AUC: 0.7976569897581353


Unnamed: 0,prob,id,target
0,0.400381,555024,0
1,0.762176,236417,1
2,0.672507,91618,0
3,0.056128,53904,0
4,0.565805,350187,0


# Train and test

In [171]:
!vw ../data/full_wv.txt -f ../data/cfec_model.vw --loss_function logistic --learning_rate 0.5 --classweight -1:0.23

final_regressor = ../data/cfec_model.vw
parsed 1 class weights
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../data/full_wv.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.525411 0.525411            5            1.2  -1.0000  -0.7027       24
0.776757 0.927303           10            3.1   1.0000  -1.2420       24
0.826120 0.870955           18            6.5   1.0000  -0.7243       24
0.737718 0.651325           40           13.1  -1.0000  -0.9571       24
0.745550 0.753334           77           26.2  -1.0000  -0.8265       24
0.727658 0.709874          148           52.5  -1.0000  -0.6232       24
0.713718 0.699799          293          105.1  -1.0000  -0.2868       24
0.702813 0.691911          566          210.3  -1.0000   0.0267       24
0.685711 0.668620         1136          420.7  -1.0000   0.5489     

In [172]:
!vw -i ../data/cfec_model.vw -t ../data/test_wv.txt --link=logistic -p ../data/cfec2_test_wv.txt

only testing
predictions = ../data/cfec2_test_wv.txt
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../data/test_wv.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown   0.3674       24
    n.a.     n.a.            2            2.0  unknown   0.5905       24
    n.a.     n.a.            4            4.0  unknown   0.3384       24
    n.a.     n.a.            8            8.0  unknown   0.4894       24
    n.a.     n.a.           16           16.0  unknown   0.4172       24
    n.a.     n.a.           32           32.0  unknown   0.1970       24
    n.a.     n.a.           64           64.0  unknown   0.4197       24
    n.a.     n.a.          128          128.0  unknown   0.3897       24
    n.a.     n.a.          256          256.0  unknown   0.7784       24
    n

In [173]:
# The predictions file holds one "<prediction> <id>" pair per line, hence the
# column order used when reading it.
df_test_pred = pd.read_csv("../data/cfec2_test_wv.txt", sep=" ", names=["target", "id"])

# Write the submission with the identifier first and the prediction second
# (id,target) rather than in the order the predictions file happens to use.
# NOTE(review): confirm the expected header against the competition's sample
# submission file.
df_test_pred[["id", "target"]].to_csv("../data/submission_wv.csv", index=False)
df_test_pred.head(5)

Unnamed: 0,target,id
0,0.367414,600000
1,0.590503,600001
2,0.420583,600002
3,0.338398,600003
4,0.401236,600004


Training:
```
vw train_wv.txt -f cfec_model.vw --loss_function logistic --learning_rate=0.5 --minibatch=1000
```
Validation:
```
vw val_wv.txt -t -i cfec_model.vw --link=logistic --testonly -p cfec2_val_wv.txt
```
Training full:
```
vw full_wv.txt -f cfec_model.vw --loss_function logistic --learning_rate=0.5 --minibatch=1000
```
Testing VW:
```
vw test_wv.txt -t -i cfec_model.vw --link=logistic -p cfec2_test_wv.txt
```