In [2]:
# %load genFeat_id_feat.py

"""
__file__

    genFeat_id_feat.py

__description__

    This file generates the following features for each run and fold, 
    and for the entire training and testing set.

        1. one-hot encoding of query ids (qid)

__author__

    Chenglong Chen < c.chenglong@gmail.com >

"""

import sys
# import cPickle
import dill
import pickle
from sklearn.preprocessing import LabelBinarizer
sys.path.append("../")
from param_config import config

In [36]:
## config: id columns that get a one-hot encoding
id_names = ["qid"]

###############
## Load Data ##
###############
## processed train / test data (deserialized with dill)
with open(config.processed_train_data_path, "rb") as train_file:
    dfTrain = dill.load(train_file)
with open(config.processed_test_data_path, "rb") as test_file:
    dfTest = dill.load(test_file)

## pre-defined stratified k-fold index
skf_path = "%s/stratifiedKFold.%s1.pkl" % (config.data_folder, config.stratified_label)
with open(skf_path, "rb") as skf_file:
    ## encoding='bytes' makes pickle read 8-bit string objects as bytes
    ## (presumably the index was pickled under Python 2 -- TODO confirm)
    skf = pickle.load(skf_file, encoding='bytes')

In [26]:
## number of entries in skf; the feature-generation loop indexes it as skf[run]
len(skf)

3

In [37]:
#######################
## Generate Features ##
#######################
## For every (run, fold) pair: fit a one-hot encoder on the training part of
## the fold, encode both parts with it, and pickle the two sparse matrices to
## <feat_folder>/Run<run>/Fold<fold>/{train,valid}.<id_name>.feat.pkl
print("==================================================")
print("Generate id features...")

print("For cross-validation...")
for run in range(config.n_runs):
    ## use 33% for training and 67% for validation,
    ## so the two index arrays of each split are unpacked in swapped order
    for fold, (validInd, trainInd) in enumerate(skf[run]):
        print("Run: %d, Fold: %d" % (run+1, fold+1))
        ## show the first few indices of each part as a quick sanity check
        print(validInd[:10])
        print(trainInd[:10])
        path = "%s/Run%d/Fold%d" % (config.feat_folder, run+1, fold+1)

        #################
        ## get id feat ##
        #################
        for id_name in id_names:
            ## the encoder is fitted on the training part only and then
            ## applied unchanged to the validation part
            lb = LabelBinarizer(sparse_output=True)
            X_train = lb.fit_transform(dfTrain.iloc[trainInd][id_name])
            X_valid = lb.transform(dfTrain.iloc[validInd][id_name])
            ## protocol -1 (highest available), consistent with the dumps of
            ## the features for the entire training and testing set
            with open("%s/train.%s.feat.pkl" % (path, id_name), "wb") as f:
                pickle.dump(X_train, f, -1)
            with open("%s/valid.%s.feat.pkl" % (path, id_name), "wb") as f:
                pickle.dump(X_valid, f, -1)

print("Done.")

Generate id features...
For cross-validation...
Run: 1, Fold: 1
[ 0  1  3  4  6  7  8 10 11 13]
[ 2  5  9 12 14 20 21 22 23 28]
Run: 1, Fold: 2
[ 2  4  5  7  9 10 11 12 14 17]
[ 0  1  3  6  8 13 15 16 18 24]
Run: 1, Fold: 3
[ 0  1  2  3  5  6  8  9 12 13]
[ 4  7 10 11 17 19 25 29 34 35]
Run: 2, Fold: 1
[ 1  3  5  6  7  8  9 10 11 13]
[ 0  2  4 12 17 18 22 28 32 33]
Run: 2, Fold: 2
[ 0  2  3  4 12 15 16 17 18 22]
[ 1  5  6  7  8  9 10 11 13 14]
Run: 2, Fold: 3
[ 0  1  2  4  5  6  7  8  9 10]
[ 3 15 16 24 26 27 31 34 35 40]
Run: 3, Fold: 1
[ 1  3  4  5  6  8  9 10 11 13]
[ 0  2  7 12 14 17 19 23 24 25]
Run: 3, Fold: 2
[ 0  2  4  5  6  7  8 12 13 14]
[ 1  3  9 10 11 16 18 20 30 31]
Run: 3, Fold: 3
[ 0  1  2  3  7  9 10 11 12 14]
[ 4  5  6  8 13 15 21 22 32 35]
Done.


In [38]:
## Fit a one-hot encoder on the entire training set, apply it to the test set,
## and pickle both sparse matrices to <feat_folder>/All/.
print("For training and testing...")
path = "%s/All" % config.feat_folder
## use full version for X_train
for id_name in id_names:
    ## create the encoder here instead of reusing the `lb` left over from the
    ## cross-validation loop: that name only exists if the loop ran at least
    ## one iteration, so relying on it breaks when this cell runs without it
    lb = LabelBinarizer(sparse_output=True)
    X_train = lb.fit_transform(dfTrain[id_name])
    X_test = lb.transform(dfTest[id_name])
    ## protocol -1 selects the highest pickle protocol available
    with open("%s/train.%s.feat.pkl" % (path, id_name), "wb") as f:
        pickle.dump(X_train, f, -1)
    with open("%s/test.%s.feat.pkl" % (path, id_name), "wb") as f:
        pickle.dump(X_test, f, -1)
print("Done.")

print("All Done.")

For training and testing...
Done.
All Done.
