In [1]:
import pandas as pd

df = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [2]:
from tqdm import tqdm
from sklearn import preprocessing

feature_columns = list(filter(lambda x: x not in {"id", "target"}, df.columns))

for c in tqdm(feature_columns):
    enc = preprocessing.LabelEncoder()
    enc.fit(pd.concat([df, df_test], sort=False)[c].fillna(-1).astype(str))
    df[c] = enc.transform(df[c].fillna(-1).astype(str))
    df_test[c] = enc.transform(df_test[c].fillna(-1).astype(str))

100%|██████████| 23/23 [00:17<00:00,  1.31it/s]


In [3]:
df.head(5)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,1,1,1,1,1,3,5,4,6,...,28,3,1,4,3,21,57,6,6,0
1,1,2,2,1,1,2,3,4,1,0,...,2113,3,3,6,5,24,151,7,10,0
2,2,1,2,1,1,1,3,0,4,1,...,0,3,0,3,14,16,106,5,12,0
3,3,0,1,1,1,1,3,1,4,4,...,2168,1,5,5,1,3,0,3,6,0
4,4,1,0,1,2,1,3,6,4,3,...,1748,3,3,2,8,3,51,5,4,0


In [4]:
binary = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
ordinal  = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
low_card = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
high_card = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
date = ['day', 'month']

def format_feature(row, cols, namespace):
    features = " ".join(["{}_{}".format(c, str(row[c])) for c in cols if row[c]!=0])
    return "|{} {}".format(namespace, features)
        
def to_vw_format(df, wv_file, feature_columns):
    with open(wv_file, 'w') as f:
        for index, row in tqdm(df.iterrows()):
            label = 2*(row["target"] - 0.5) if "target" in row else None
            tag = row["id"]
            
            f_bin = format_feature(row, binary, "b")
            f_ord = format_feature(row, ordinal, "o")
            f_loc = format_feature(row, low_card, "l")
            f_hic = format_feature(row, high_card, "h")
            f_dat = format_feature(row, date, "d")
            
            features = " ".join([f_bin, f_ord, f_loc, f_hic, f_dat])
            
            if label:
                line = "{} {}{}\n".format(label, tag, features)
            else:
                line = "{}{}\n".format(tag, features)
            
            f.write(line)

In [5]:
to_vw_format(df, "../data/full_wv.txt", feature_columns)
to_vw_format(df_test, "../data/test_wv.txt", feature_columns)

600000it [03:44, 2670.62it/s]
400000it [02:19, 2859.89it/s]


# Validate

In [7]:
from sklearn.model_selection import train_test_split

df_train, df_val = train_test_split(df, test_size=0.1, random_state=0, shuffle=True)

to_vw_format(df_train, "../data/train_wv.txt", feature_columns)
to_vw_format(df_val, "../data/val_wv.txt", feature_columns)

540000it [03:17, 2729.43it/s]
60000it [00:22, 2722.76it/s]


In [162]:
!rm ../data/cfec_model.vw
!rm ../data/train_wv.txt.cache
!vw ../data/train_wv.txt --l2 1e-6 --hessian_on -f ../data/cfec_model.vw --holdout_period 20 -c --passes 20 --initial_t 1 --loss_function logistic --decay_learning_rate 0.7 --learning_rate 0.5 --classweight -1:0.23

using l2 regularization = 1e-06
final_regressor = ../data/cfec_model.vw
parsed 1 class weights
Num weight bits = 18
learning rate = 0.5
initial_t = 1
power_t = 0.5
decay_learning_rate = 0.7
creating cache_file = ../data/train_wv.txt.cache
Reading datafile = ../data/train_wv.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.589030 0.589030            5            1.2  -1.0000  -0.3869       23
0.745836 0.892443            7            2.4   1.0000  -0.5137       24
0.714104 0.685168           15            5.0   1.0000  -0.5126       24
0.722255 0.729045           31           11.0   1.0000  -0.3093       24
0.683472 0.645485           63           22.2  -1.0000  -0.5475       24
0.695912 0.708346          116           44.4   1.0000  -0.5539       24
0.681401 0.667163          229           89.6   1.0000   1.0485       24
0.672542 0.663689          465          179.3   

In [163]:
!vw -i ../data/cfec_model.vw -t ../data/val_wv.txt --link=logistic -p ../data/cfec2_val_wv.txt

only testing
predictions = ../data/cfec2_val_wv.txt
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../data/val_wv.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
1.896697 1.896697            1            1.0  -1.0000   0.5932       24
0.984407 0.072117            2            2.0  -1.0000   0.3249       23
1.977850 2.971293            4            4.0  -1.0000   0.7813       23
1.663826 1.349803            8            8.0  -1.0000   0.1613       22
0.949470 0.235113           16           16.0  -1.0000   0.1388       23
1.185268 1.421066           32           32.0  -1.0000   0.3638       23
1.097303 1.009338           64           64.0  -1.0000   0.3179       24
1.185373 1.273443          128          128.0  -1.0000   0.7647       23
1.278393 1.371413          256          256.0  -1.0000   0.3727       24
1.23433

In [164]:
from sklearn.metrics import roc_auc_score

df_val_pred = pd.read_csv("../data/cfec2_val_wv.txt", sep=" ", names=["prob", "id"])
df_val_pred = df_val_pred.merge(df_val[["id", "target"]], on="id")
print("Validation AUC:", roc_auc_score(df_val_pred["target"], df_val_pred["prob"]))
df_val_pred.head(5)

Validation AUC: 0.7883045041358747


Unnamed: 0,prob,id,target
0,0.593199,262570,0
1,0.324876,207212,0
2,0.470034,99242,0
3,0.781319,243446,0
4,0.751053,132564,0


# Train and test

In [25]:
vw ../data/full_wv.txt -q ll oo lo -f ../data/cfec_model.vw --l2 1e-6 --holdout_period 20 -c --passes 20 --loss_function logistic --decay_learning_rate 0.7 --learning_rate 0.5 --classweight -1:0.23

using l2 regularization = 1e-06
final_regressor = ../data/cfec_model.vw
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 0.7
using cache_file = ../data/full_wv.txt.cache
ignoring text input in favor of cache input
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
0.693147 0.693147            1            1.0  -1.0000   0.0000       24
0.621780 0.550412            2            2.0  -1.0000  -0.3093       23
0.486713 0.351646            4            4.0  -1.0000  -0.9060       22
0.352089 0.217466            8            8.0  -1.0000  -1.8267       24
0.505780 0.659471           16           16.0  -1.0000  -1.1486       22
0.511024 0.516267           32           32.0   1.0000  -2.2181       24
0.443500 0.375977           64           64.0  -1.0000  -2.1258       23
0.478387 0.513274          128          128.0   1.0000  -1.1177       2

In [26]:
!vw -i ../data/cfec_model.vw -t ../data/test_wv.txt --link=logistic -p ../data/cfec2_test_wv.txt

only testing
predictions = ../data/cfec2_test_wv.txt
Num weight bits = 18
learning rate = 0.5
initial_t = 0
power_t = 0.5
using no cache
Reading datafile = ../data/test_wv.txt
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
    n.a.     n.a.            1            1.0  unknown   0.1205       24
    n.a.     n.a.            2            2.0  unknown   0.2517       23
    n.a.     n.a.            4            4.0  unknown   0.1148       24
    n.a.     n.a.            8            8.0  unknown   0.1835       23
    n.a.     n.a.           16           16.0  unknown   0.1516       24
    n.a.     n.a.           32           32.0  unknown   0.0644       24
    n.a.     n.a.           64           64.0  unknown   0.1578       24
    n.a.     n.a.          128          128.0  unknown   0.1302       24
    n.a.     n.a.          256          256.0  unknown   0.4482       22
    n

In [27]:
df_test_pred = pd.read_csv("../data/cfec2_test_wv.txt", sep=" ", names=["target", "id"])
df_test_pred.to_csv("../data/submission_wv.csv", index=False)
df_test_pred.head(5)

Unnamed: 0,target,id
0,0.120507,600000
1,0.25169,600001
2,0.149894,600002
3,0.114812,600003
4,0.133167,600004


# Blend

In [None]:
import pandas as pd
df_lines = pd.read_csv("../data/full_wv.txt", names = ["text"])
df_lines.loc[train_idx, :].to_csv("../data/train_wv.txt", index=False, header=None)

In [4]:
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

folds = 50
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

df_lines = pd.read_csv("../data/full_wv.txt", names = ["text"])
y = pd.read_csv('../data/train.csv')["target"].values
df_test = pd.read_csv('../data/test.csv')[["id"]]
df_test["target"] = 0


for train_idx, _ in tqdm(skf.split(df_lines, y)):
    !rm ../data/train_wv.txt
    !rm ../data/cfec_model.vw
    !rm ../data/train_wv.txt.cache
    df_lines.loc[train_idx, :].to_csv("../data/train_wv.txt", index=False, header=None)
    !vw ../data/train_wv.txt  --quiet -f ../data/cfec_model.vw --l2 1e-6 --holdout_period 20 -c --passes 20 --loss_function logistic --decay_learning_rate 0.7 --learning_rate 0.5 --classweight -1:0.23
    !vw -i ../data/cfec_model.vw --quiet -t ../data/test_wv.txt --link=logistic -p ../data/cfec2_test_wv.txt
    df_test["target"] += pd.read_csv("../data/cfec2_test_wv.txt", sep=" ", names=["target", "id"])["target"].values
    
df_test["target"] /= folds

df_test.to_csv("../data/submission_wv.csv", index=False)
df_test.head()

50it [11:16, 13.54s/it]


Unnamed: 0,id,target
0,600000,0.365186
1,600001,0.591646
2,600002,0.431123
3,600003,0.377906
4,600004,0.403135
