In [11]:
from xgboostextension import XGBRanker, XGBFeature
from model.config import Config
import pandas as pd
import numpy as np
from model.data_utils import get_trimmed_glove_vectors, \
    get_embedding, get_distances, get_pointwise_distances, \
    compute_lengths, sort_xgb_predictions, get_mean_NDCG
import pickle
import os

In [None]:
def save_submission(path_to_submission, dataframe, predictions):
    """Write predictions to a submission file, one "<context_id> <prediction>" per line.

    Parameters
    ----------
    path_to_submission : str
        Destination file; it is created, or truncated if it already exists.
    dataframe : object with a ``context_id`` column exposing ``.values``
        Supplies the identifier written at the start of each line.
    predictions : iterable
        Values paired positionally with ``dataframe.context_id.values``;
        ``zip`` stops at the shorter of the two.
    """
    # The original opened the undefined name `path_to_sub`, so every call
    # raised NameError; the parameter is the intended target.
    with open(path_to_submission, "w") as submission_file:
        for context_id, prediction in zip(dataframe.context_id.values, predictions):
            submission_file.write("%s %s\n" % (context_id, prediction))

In [None]:
# Numeric relevance grade for each text label. `conf` sets how far below
# "neutral" (1) the "bad" grade sits: bad = 1 - conf.
conf = 0.9
label_to_num = dict(good=2, neutral=1, bad=1 - conf)

In [None]:
# Project configuration object; the following cells read the dataset and
# embedding file paths from its attributes.
config = Config()

In [None]:
# Read the train / validation / test splits from the CSV paths held in the config.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in (config.path_to_train, config.path_to_val, config.path_to_test)
)

In [None]:
# Load the trimmed embedding vectors from config.filename_trimmed; the result
# is handed to get_pointwise_distances as `vocab` in the cells below.
vocab = get_trimmed_glove_vectors(config.filename_trimmed)

In [None]:
# Ranking model used by the step-by-step cells below.
# `objective` is not passed (an explicit 'rank:pairwise' was tried and left out).
ranker = XGBRanker(
    n_estimators=150,
    learning_rate=0.1,
    subsample=0.9,
)

In [None]:
# Training feature matrix: whatever get_pointwise_distances returns for the
# training dataframe and vocab, converted to a NumPy array.
X_train = np.array(get_pointwise_distances(train, vocab))

In [None]:
# Translate each text label in train.label to its numeric grade.
# A label missing from label_to_num raises KeyError.
y_train = np.array(list(map(label_to_num.__getitem__, train.label)))

In [None]:
# Lengths for the training split, passed to ranker.fit alongside X_train / y_train.
# NOTE(review): presumably the per-query group sizes XGBRanker expects — confirm.
lengths_train = compute_lengths(train)

In [None]:
# Fit the ranker on the training features, numeric labels and lengths.
# NOTE(review): the trailing '-' in 'map@5-' presumably follows XGBoost's
# eval-metric suffix convention — confirm against the XGBoost parameter docs.
ranker.fit(X_train, y_train, lengths_train, eval_metric=['ndcg', 'map@5-'])

In [None]:
# Score the training rows. The lengths are passed explicitly so this call
# matches every other ranker.predict(X, lengths) call in the notebook; the
# original omitted them here only.
train_preds = ranker.predict(X_train, lengths_train)

In [None]:
# Post-process the raw predictions with sort_xgb_predictions; the result is
# what get_mean_NDCG consumes in the next cell.
sorted_train_preds = sort_xgb_predictions(train, train_preds)

In [None]:
# Mean NDCG on the training split (the same data the ranker was fitted on,
# so this is an in-sample score).
train_ndcg = get_mean_NDCG(train, sorted_train_preds)
print ('train NDCG:', train_ndcg)

In [None]:
# Full validation pass in one cell: build features and lengths, predict,
# post-process the predictions, then report the mean NDCG.
X_val = np.array(get_pointwise_distances(val, vocab))
lengths_val = compute_lengths(val)
val_preds = ranker.predict(X_val, lengths_val)
sorted_val_preds = sort_xgb_predictions(val, val_preds)
val_ndcg = get_mean_NDCG(val, sorted_val_preds)
print ('val NDCG:', val_ndcg)

In [None]:
# Same call as in the combined validation cell; re-running it simply
# reassigns lengths_val.
lengths_val = compute_lengths(val)

In [None]:
# Same call as in the combined validation cell; requires X_val and
# lengths_val to exist already.
val_preds = ranker.predict(X_val, lengths_val)

In [None]:
# Same call as in the combined validation cell; post-processes val_preds
# for get_mean_NDCG.
sorted_val_preds = sort_xgb_predictions(val, val_preds)

In [None]:
# Same computation as at the end of the combined validation cell: mean NDCG
# on the validation split.
val_ndcg = get_mean_NDCG(val, sorted_val_preds)
print ('val NDCG:', val_ndcg)

In [None]:
# Full test-split pass: build features and lengths, predict, post-process the
# predictions, then report the mean NDCG. The interleaved prints mark progress
# between the steps.
X_test = np.array(get_pointwise_distances(test, vocab))
print ('Test dataframe was preprocessed')
lengths_test = compute_lengths(test)
test_preds = ranker.predict(X_test, lengths_test)
print ('Test predictions were computed')
sorted_test_preds = sort_xgb_predictions(test, test_preds)
test_ndcg = get_mean_NDCG(test, sorted_test_preds)
print ('Test NDCG:', test_ndcg)

# Save and load model

In [None]:
# Persist the fitted ranker. The context manager closes the file handle even
# if pickling fails; the original left the handle open.
path_to_xgb_model = "saved_ranker.pickle.dat"
with open(path_to_xgb_model, "wb") as model_file:
    pickle.dump(ranker, model_file)

In [None]:
# Reload the ranker from disk, closing the file handle afterwards.
# NOTE: unpickling can execute arbitrary code — only load files that this
# notebook (or another trusted source) wrote.
with open(path_to_xgb_model, "rb") as model_file:
    loaded_model = pickle.load(model_file)

## Check the results

In [12]:
def train_xgb_ranker(train, test, val, vocab, config, n_estimators=150, max_depth=3, learning_rate=0.1,
                     subsample=0.9, conf=0.9):
    """Train an XGBRanker, report mean NDCG on train/val/test and pickle the model.

    Parameters
    ----------
    train, test, val : dataframes
        The three dataset splits. ``train`` must have a ``label`` column whose
        values are "good", "neutral" or "bad".
    vocab : dictionary
        Passed unchanged to ``get_pointwise_distances``.
    config : object
        Must expose ``path_to_xgb_models``, the directory the model is written
        to. The directory is not created here.
    n_estimators, max_depth, learning_rate, subsample
        Forwarded to ``XGBRanker``.
    conf : float
        The "bad" label is mapped to the grade ``1 - conf``.

    Returns
    -------
    tuple
        ``(train_ndcg, val_ndcg, test_ndcg, path_to_xgb_model)``.

    Raises
    ------
    KeyError
        If ``train.label`` contains a value other than the three known labels.
    """
    label_to_num = {"good": 2, "neutral": 1, "bad": 1 - conf}
    print('label to num', label_to_num)
    ranker = XGBRanker(n_estimators=n_estimators, learning_rate=learning_rate,
                       subsample=subsample, max_depth=max_depth)

    X_train = np.array(get_pointwise_distances(train, vocab))
    print('Train was preprocessed')
    lengths_train = compute_lengths(train)
    y_train = np.array([label_to_num[x] for x in train.label])
    print('Start training')
    ranker.fit(X_train, y_train, lengths_train, eval_metric=['ndcg', 'map@5-'])
    train_preds = ranker.predict(X_train, lengths_train)
    sorted_train_preds = sort_xgb_predictions(train, train_preds)
    train_ndcg = get_mean_NDCG(train, sorted_train_preds)
    print('Train NDCG:', train_ndcg)

    print('Start validation')
    X_val = np.array(get_pointwise_distances(val, vocab))
    lengths_val = compute_lengths(val)
    val_preds = ranker.predict(X_val, lengths_val)
    sorted_val_preds = sort_xgb_predictions(val, val_preds)
    val_ndcg = get_mean_NDCG(val, sorted_val_preds)
    print('val NDCG:', val_ndcg)

    print('Start test')
    X_test = np.array(get_pointwise_distances(test, vocab))
    print('Test dataframe was preprocessed')
    lengths_test = compute_lengths(test)
    test_preds = ranker.predict(X_test, lengths_test)
    print('Test predictions were computed')
    sorted_test_preds = sort_xgb_predictions(test, test_preds)
    test_ndcg = get_mean_NDCG(test, sorted_test_preds)
    print('Test NDCG:', test_ndcg)

    # The file name records the hyper-parameters and the achieved scores.
    model_filename = "xgb_n_estimators_%s_depth_%s_lr_%s_subsample_%s_conf_%s_val_%s_test_%s.pickle.dat" % \
        (n_estimators, max_depth, learning_rate, subsample, conf, val_ndcg, test_ndcg)
    # os.path.join keeps the file inside the models directory whether or not
    # the configured path ends with a separator (plain `+` did not).
    path_to_xgb_model = os.path.join(config.path_to_xgb_models, model_filename)

    print('Saving model')
    # Context manager so the handle is closed even if pickling fails.
    with open(path_to_xgb_model, "wb") as model_file:
        pickle.dump(ranker, model_file)

    return train_ndcg, val_ndcg, test_ndcg, path_to_xgb_model

In [13]:
# Grid search over label confidence and tree depth. Each run is trained,
# evaluated and pickled by train_xgb_ranker, then appended to the text log at
# config.path_to_xgb_log.
config = Config()
train = pd.read_csv(config.path_to_train)
val = pd.read_csv(config.path_to_val)
test = pd.read_csv(config.path_to_test)
vocab = get_trimmed_glove_vectors(config.filename_trimmed)
print('Data is loaded')

# Hyper-parameters held fixed across the grid. They are passed explicitly to
# train_xgb_ranker so the values written to the log are guaranteed to be the
# ones the model was trained with (previously they only matched its defaults).
n_estimators = 150
learning_rate = 0.1
subsample = 0.9

# Create the model directory if needed, without a separate existence check.
os.makedirs(config.path_to_xgb_models, exist_ok=True)

for conf in [0.99, 0.5]:
    for max_depth in [3, 5, 10]:
        train_ndcg, val_ndcg, test_ndcg, path_to_xgb_model = train_xgb_ranker(
            train, test, val, vocab, config,
            n_estimators=n_estimators, max_depth=max_depth,
            learning_rate=learning_rate, subsample=subsample, conf=conf)
        log_entry = ''.join([
            'train NDCG:' + str(train_ndcg) + '\n',
            'val NDCG: ' + str(val_ndcg) + '\n',
            'test NDCG: ' + str(test_ndcg) + '\n',
            '_______________________\n',
            'n estimators: ' + str(n_estimators) + '\n',
            'max depth: ' + str(max_depth) + '\n',
            'learning rate: ' + str(learning_rate) + '\n',
            'subsample: ' + str(subsample) + '\n',
            'conf: ' + str(conf) + '\n',
            'path to model: ' + path_to_xgb_model + ' \n',
            '=======================\n',
        ])
        # Append after every run so completed results survive an interrupted search.
        with open(config.path_to_xgb_log, 'a') as log_file:
            log_file.write(log_entry)

Data is loaded
label to num {'neutral': 1, 'bad': 0.010000000000000009, 'good': 2}
Train was preprocessed
Start training
Train NDCG: 83141.777727717
Start validation
val NDCG: 82656.29002118148
Start test
Test dataframe was preprocessed
Test predictions were computed
Test NDCG: 82118.08758614419
Saving model
label to num {'neutral': 1, 'bad': 0.010000000000000009, 'good': 2}
Train was preprocessed
Start training
Train NDCG: 83058.52869175769
Start validation
val NDCG: 83229.33478587391
Start test
Test dataframe was preprocessed
Test predictions were computed
Test NDCG: 82300.57094307899
Saving model
label to num {'neutral': 1, 'bad': 0.010000000000000009, 'good': 2}
Train was preprocessed
Start training


KeyboardInterrupt: 