In [13]:
#import xgboost
from xgboost import XGBRegressor, XGBModel

In [14]:
from xgboostextension import XGBRanker, XGBFeature

In [15]:
import sklearn
import pandas as pd
import numpy as np
from model.config import Config
from model.data_utils import load_vocab, get_mean_NDCG
from ast import literal_eval
from scipy.spatial.distance import cosine, correlation, braycurtis, \
    euclidean, mahalanobis, minkowski, seuclidean, sqeuclidean, wminkowski 
from model.data_utils import get_trimmed_glove_vectors, get_mean_NDCG, ndcg_at_k, get_predictions
from fastText import load_model
import argparse
import errno
import tqdm
from collections import Counter
import scipy

In [39]:
# Constant from which the numeric value of the "bad" label is derived (1 - conf).
conf = 0.99
# Numeric relevance target for each textual label found in the `label` column.
label_to_num = dict(
    good=2,
    neutral=1,
    bad=1 - conf,
)

In [17]:
# Project configuration object; every path/filename used below is read from it.
config = Config()

In [18]:
# Load the train / validation / test splits from the CSV paths given by the config.
train = pd.read_csv(config.path_to_train)
val = pd.read_csv(config.path_to_val)
test = pd.read_csv(config.path_to_test)

In [19]:
# Embedding table that get_embedding later indexes row-wise with np.take.
# NOTE(review): presumably one GloVe vector per vocabulary index - confirm in model.data_utils.
vocab = get_trimmed_glove_vectors(config.filename_trimmed)

In [20]:
def get_embedding(indices, vocab):
    """Return the rows of ``vocab`` selected by ``indices``.

    ``indices`` is first handed to ``literal_eval`` so that a string such as
    ``"[3, 7, 1]"`` is turned into a list. If ``literal_eval`` raises
    ``ValueError`` the argument is used unchanged; this is the path taken by
    the callers in this notebook, which pass already-parsed lists.

    Parameters
    ----------
    indices : str or sequence of int
        Row indices into ``vocab``, optionally string-encoded.
    vocab : array-like
        Table to take rows from (rows are selected along axis 0).

    Returns
    -------
    numpy.ndarray
        ``np.take(vocab, indices, axis=0)``.
    """
    try:
        row_ids = literal_eval(indices)
    except ValueError:
        # Not parseable as a literal: keep the caller's value as-is.
        row_ids = indices
    return np.take(vocab, row_ids, axis=0)

In [21]:
def get_distances(dataframe, vocab, f=euclidean):
    """Compute one scalar context-to-reply distance per row of ``dataframe``.

    Rows are processed context by context, in order of first appearance of
    each ``context_id``. For every context the mean embedding of its
    ``merged_contexts`` (taken from the first row of the group) is compared
    with the mean embedding of each ``reply`` using ``f``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``context_id``, ``merged_contexts`` and ``reply``
        columns; the latter two hold string-encoded lists of vocab indices
        (they are parsed with ``literal_eval``).
    vocab : array-like
        Embedding table passed on to ``get_embedding``.
    f : callable, optional
        ``f(mean_context, mean_reply) -> distance``; defaults to
        ``scipy.spatial.distance.euclidean``.

    Returns
    -------
    list
        Distances in group order (all rows of the first context, then all
        rows of the second, ...). This equals row order only when the rows of
        each context are contiguous in ``dataframe``.
    """
    total_distances = []
    # groupby(sort=False) yields the groups in order of first appearance - the
    # same order as context_id.unique() - but partitions the frame in a single
    # pass instead of re-scanning every row once per context id.
    for _, partition in dataframe.groupby('context_id', sort=False):
        context = literal_eval(partition.merged_contexts.iloc[0])
        mean_context = np.mean(get_embedding(context, vocab), axis=0)
        for raw_reply in partition.reply:
            reply = literal_eval(raw_reply)
            mean_reply = np.mean(get_embedding(reply, vocab), axis=0)
            total_distances.append(f(mean_context, mean_reply))
    return total_distances

In [22]:
def get_pointwise_distances(dataframe, vocab, f=euclidean):
    """Compute a per-dimension squared difference vector for every row.

    Rows are processed context by context, in order of first appearance of
    each ``context_id``. For every reply the result is
    ``(mean_context - mean_reply) ** 2``, i.e. one value per embedding
    dimension rather than a single scalar.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``context_id``, ``merged_contexts`` and ``reply``
        columns; the latter two hold string-encoded lists of vocab indices
        (they are parsed with ``literal_eval``).
    vocab : array-like
        Embedding table passed on to ``get_embedding``.
    f : callable, optional
        Unused. Kept only so the signature matches ``get_distances``.

    Returns
    -------
    list of numpy.ndarray
        One difference vector per row, in group order (all rows of the first
        context, then all rows of the second, ...). This equals row order
        only when the rows of each context are contiguous in ``dataframe``.
    """
    total_distances = []
    # groupby(sort=False) yields the groups in order of first appearance - the
    # same order as context_id.unique() - but partitions the frame in a single
    # pass instead of re-scanning every row once per context id.
    for _, partition in dataframe.groupby('context_id', sort=False):
        context = literal_eval(partition.merged_contexts.iloc[0])
        mean_context = np.mean(get_embedding(context, vocab), axis=0)
        for raw_reply in partition.reply:
            reply = literal_eval(raw_reply)
            mean_reply = np.mean(get_embedding(reply, vocab), axis=0)
            total_distances.append((mean_context - mean_reply) ** 2)
    return total_distances

In [23]:
def compute_lens(dataframe):
    """Return the number of rows belonging to each ``context_id``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a ``context_id`` column.

    Returns
    -------
    list of int
        One count per distinct ``context_id``, ordered by first appearance of
        the id in ``dataframe``.
    """
    # A single grouped pass replaces the per-id boolean-mask scan; sort=False
    # keeps the groups in order of first appearance, like context_id.unique().
    return dataframe.groupby('context_id', sort=False).size().tolist()

In [24]:
def sort_xgb_predictions(dataframe, predictitons):
    """Rank the rows of every context by descending predicted score.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide a ``context_id`` column; row ``i`` (by position)
        corresponds to ``predictitons[i]``.
    predictitons : array-like
        One score per row of ``dataframe``.

    Returns
    -------
    list of int
        For each context, in order of first appearance, the within-context
        row offsets (0-based) sorted from highest to lowest score; the
        per-context lists are concatenated. Ties keep their original order.
    """
    scores = np.asarray(predictitons)
    context_ids = dataframe['context_id']
    total_predictions = []
    for _id in context_ids.unique():
        # np.take is positional, so select by row *position* rather than by
        # index label: the labels only coincide with positions when the frame
        # still carries its default RangeIndex.
        positions = np.flatnonzero((context_ids == _id).values)
        partial_preds = np.take(scores, positions, axis=0)
        # sorted() is stable, so equal scores stay in row order.
        ranking = sorted(range(len(partial_preds)),
                         key=lambda offset: -partial_preds[offset])
        total_predictions.extend(ranking)
    return total_predictions

In [25]:
# NOTE(review): `model` is not referenced by any later cell visible here; only `ranker` is fitted.
model = XGBRegressor(objective="rank:pairwise")
ranker = XGBRanker(n_estimators=150, learning_rate=0.1, subsample=0.9)#, objective='rank:pairwise')

In [16]:
# One feature vector per validation reply: per-dimension squared difference
# between the mean context embedding and the mean reply embedding.
X_val = np.array(get_pointwise_distances(val, vocab))

In [30]:
# Numeric target for every validation row, looked up from its textual label.
y_val = np.array([label_to_num[x] for x in val.label])

In [39]:
# Number of rows per context_id, in order of first appearance.
lens_val = compute_lens(val)

In [40]:
#model.fit(X, y_train)
# Fit on the validation split. lens_val is the third positional argument
# (presumably the per-query group sizes - confirm against xgboostextension).
ranker.fit(X_val, y_val, lens_val, eval_metric=['ndcg', 'map@5-'])

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
     colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
     max_depth=3, min_child_weight=1, missing=None, n_estimators=150,
     n_jobs=-1, nthread=None, objective='rank:pairwise', random_state=0,
     reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
     subsample=0.9)

In [41]:
# Same pointwise features, computed for the test split.
X_test = np.array(get_pointwise_distances(test, vocab))

In [43]:
# Number of rows per context_id in the test split.
lens_test = compute_lens(test)

In [45]:
# Score every test row with the ranker fitted above.
y_preds = ranker.predict(X_test, lens_test)

In [46]:
y_preds

array([0.21180984, 0.6886587 , 0.6181052 , ..., 0.07714647, 0.7477301 ,
       0.65852785], dtype=float32)

In [47]:
y_preds[:6]

array([0.21180984, 0.6886587 , 0.6181052 , 0.49548346, 0.37640887,
       0.10057464], dtype=float32)

In [49]:
# Convert raw scores into per-context rankings (within-context offsets, best score first).
sorted_y_preds = sort_xgb_predictions(test, y_preds)

In [50]:
# Evaluate the ranking on the test split with the project's get_mean_NDCG helper.
get_mean_NDCG(test, sorted_y_preds)

82169.31546564518

## Fit on the train split (X_train)

In [52]:
# Pointwise feature vectors for the train split.
X_train = np.array(get_pointwise_distances(train, vocab))

In [53]:
# Numeric target for every train row, looked up from its textual label.
y_train = np.array([label_to_num[x] for x in train.label])

In [54]:
# Number of rows per context_id in the train split.
lens_train = compute_lens(train)

In [55]:
# Fit the same ranker object again, this time on the train split.
ranker.fit(X_train, y_train, lens_train, eval_metric=['ndcg', 'map@5-'])

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
     colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
     max_depth=3, min_child_weight=1, missing=None, n_estimators=150,
     n_jobs=-1, nthread=None, objective='rank:pairwise', random_state=0,
     reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
     subsample=0.9)

In [56]:
# Pointwise feature vectors for the test split (recomputed; same call as earlier).
X_test = np.array(get_pointwise_distances(test, vocab))

In [57]:
# Number of rows per context_id in the test split.
lens_test = compute_lens(test)

In [58]:
# Score every test row with the train-fitted ranker.
y_preds = ranker.predict(X_test, lens_test)

In [59]:
y_preds

array([0.28118122, 0.50867206, 0.5582693 , ..., 0.5609416 , 0.48988783,
       0.5289386 ], dtype=float32)

In [60]:
# Convert raw scores into per-context rankings (within-context offsets, best score first).
sorted_y_preds = sort_xgb_predictions(test, y_preds)

In [63]:
# Peek at the first twelve ranking entries.
sorted_y_preds[:12]

[5, 2, 1, 4, 3, 0, 2, 0, 5, 3, 1, 4]

In [None]:
np.ran

In [61]:
get_mean_NDCG(test, sorted_y_preds)

82118.08758614419

## Same, but with a single scalar distance per reply as the only feature

In [62]:
# One scalar per reply: get_distances applies its default metric (euclidean)
# to the mean context embedding and the mean reply embedding.
X_train = np.array(get_distances(train, vocab))

In [70]:
# Add a trailing axis so the scalars form a single-column feature matrix.
X_train = X_train[:, np.newaxis]

In [71]:
#X_train = np.array(get_pointwise_distances(train, vocab))
#y_train = np.array([label_to_num[x] for x in train.label])
#lens_train = compute_lens(train)
# y_train and lens_train are reused from the earlier train cells; only X_train was recomputed.
ranker.fit(X_train, y_train, lens_train, eval_metric=['ndcg', 'map@5-'])

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
     colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
     max_depth=3, min_child_weight=1, missing=None, n_estimators=150,
     n_jobs=-1, nthread=None, objective='rank:pairwise', random_state=0,
     reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
     subsample=0.9)

In [72]:
# Scalar euclidean distance per test reply.
X_test = np.array(get_distances(test, vocab))

In [73]:
# Single-column feature matrix, matching the shape used for fitting.
X_test = X_test[:, np.newaxis]

In [74]:
# lens_test is reused from the earlier test cells.
y_preds = ranker.predict(X_test, lens_test)

In [75]:
y_preds

array([0.506596  , 0.50798625, 0.51695615, ..., 0.49014026, 0.49723178,
       0.50464606], dtype=float32)

In [76]:
# Convert raw scores into per-context rankings (within-context offsets, best score first).
sorted_y_preds = sort_xgb_predictions(test, y_preds)

In [77]:
# Evaluate the scalar-feature ranker on the test split with get_mean_NDCG.
get_mean_NDCG(test, sorted_y_preds)

81913.16572027517

## Trial submission

In [26]:
# Rebinds `train` to the preprocessed training CSV (replacing the split loaded earlier).
train = pd.read_csv(config.path_to_preprocessed_train)
# Pointwise feature vectors for that frame.
X = np.array(get_pointwise_distances(train, vocab))

In [27]:
# Targets and per-context row counts for the preprocessed training frame.
y_train = np.array([label_to_num[x] for x in train.label])
lens_train = compute_lens(train)

In [40]:
# Fresh ranker for the submission: 500 estimators and max_depth=10
# (the earlier experiments used 150 estimators and did not set max_depth).
ranker = XGBRanker(n_estimators=500, learning_rate=0.1, subsample=0.9,
                  max_depth=10)#, objective='rank:pairwise')

In [41]:
# Fit on the preprocessed training frame.
ranker.fit(X, y_train, lens_train, eval_metric=['ndcg', 'map@5-'])

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
     colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
     max_depth=10, min_child_weight=1, missing=None, n_estimators=500,
     n_jobs=-1, nthread=None, objective='rank:pairwise', random_state=0,
     reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
     subsample=0.9)

In [42]:
# Score the training rows. The group sizes are passed explicitly so this call
# has the same (features, group sizes) shape as every other predict call in
# this notebook.
train_preds = ranker.predict(X, lens_train)
sorted_train_preds = sort_xgb_predictions(train, train_preds)
# Sanity check of the fit: evaluate the ranking on the data it was trained on.
train_ndcg = get_mean_NDCG(train, sorted_train_preds)
print('train NDCG:', train_ndcg)

train NDCG: 79385.90735486381


In [48]:
import pickle

# Persist the fitted ranker. The context manager closes the file handle even
# if pickling fails, instead of leaving an unclosed handle to the garbage collector.
with open("../data/xgb_models/500_estimators.pickle.dat", "wb") as model_file:
    pickle.dump(ranker, model_file)

In [43]:
# Rebinds `test` to the preprocessed test CSV (replacing the split loaded earlier).
test = pd.read_csv(config.path_to_preprocessed_test)

In [44]:
# Pointwise feature vectors for the preprocessed test frame.
X_test = np.array(get_pointwise_distances(test, vocab))

In [45]:
# Number of rows per context_id in the preprocessed test frame.
lens_test = compute_lens(test)

In [46]:
# Score every test row with the 500-estimator ranker.
preds = ranker.predict(X_test, lens_test)

In [49]:
preds

array([0.01572004, 0.8037063 , 0.4488865 , ..., 0.7053027 , 1.2431805 ,
       0.3948659 ], dtype=float32)

In [50]:
# Convert raw scores into per-context rankings (within-context offsets, best score first).
sorted_y_preds = sort_xgb_predictions(test, preds)

In [51]:
# Peek at the first twelve ranking entries.
sorted_y_preds[:12]

[3, 1, 5, 2, 4, 0, 1, 0, 2, 3, 4, 5]

In [52]:
path_to_sub = "../data/ranking_xgb_500_depth_10_baseline.txt"
# Pair every test row's context_id with the ranking entry at the same position.
submission_rows = zip(test.context_id.values, sorted_y_preds)
with open(path_to_sub, "w+") as f:
    # One "<context_id> <rank offset>" line per pair.
    for context_id, rank_offset in submission_rows:
        f.write("%s %s" % (context_id, rank_offset) + "\n")