In [55]:
import copy
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pyarrow.parquet as pq
import xgboost as xgb
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model, load_model
from xgboost import XGBClassifier, plot_importance

from src.train.utils import build_numpy, fetch_file_list


# Column layout of each loaded data matrix.
# X_COLUMNS selects columns 7..75 (69 columns), which are passed to both models as
# features; 69 matches the input width shown in the Keras model summary.
X_COLUMNS = slice(7, 76)
# CONTEXT_COLUMNS selects the first 7 columns, which are copied through to the output
# CSV (written there under the header label, key, ACTIVITY_ID, ATHLETE_ID, SUPER_USER,
# NUM_KUDOS, NUM_MATCHING_ACTIVITIES).
CONTEXT_COLUMNS = slice(0, 7)
# Seed NumPy's global random number generator.
np.random.seed(100)


# Directory handed to fetch_file_list to locate the input files.
TEST_DATA_DIR = './testData/set1/'
# When True the inputs are read with np.genfromtxt as CSV; otherwise as parquet.
IS_CSV = False


In [56]:
# Load the trained XGBoost model from disk and derive an id used to name result files.
# NOTE(review): this is an absolute, machine-specific path; consider moving it to a
# configurable base directory.
model_path = '/Users/lucindazhao/strava/ml-local/models/V0/xgboost-noWeight20200219-124349'
print(model_path)
# The final path component doubles as the model id.
model_id = os.path.basename(model_path)
print(model_id)
# Load through the scikit-learn wrapper's public API instead of assigning the private
# `_Booster` attribute, which is an implementation detail of XGBClassifier.
my_model = xgb.XGBClassifier()
my_model.load_model(model_path)
# Keep the raw Booster available under its original name.
my_booster = my_model.get_booster()


/Users/lucindazhao/strava/ml-local/models/V0/xgboost-noWeight20200219-124349
xgboost-noWeight20200219-124349


In [57]:
# Load the saved Keras model and print some basic facts about it.
# NOTE(review): this is an absolute, machine-specific path; consider moving it to a
# configurable base directory.
tf_model_path = '/Users/lucindazhao/strava/ml-local/models/V0/cnn_noWeight_lr=0.001_20200218-224657.h5'
print(tf_model_path)
# The final path component (file name) is used as the model id.
tf_model_id = os.path.basename(tf_model_path)
print(tf_model_id)

tf_model = load_model(tf_model_path)
# summary() prints the layer table itself and returns None; wrapping it in print()
# appended a stray "None" line to the output.
tf_model.summary()
print(tf_model.metrics_names)
# Evaluate the optimizer's learning-rate variable to a plain number for display.
print(K.eval(tf_model.optimizer.lr))


/Users/lucindazhao/strava/ml-local/models/V0/cnn_noWeight_lr=0.001_20200218-224657.h5
cnn_noWeight_lr=0.001_20200218-224657.h5
Model: "model_47"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_48 (InputLayer)        [(None, 69)]              0         
_________________________________________________________________
reshape_22 (Reshape)         (None, 69, 1)             0         
_________________________________________________________________
conv0 (Conv1D)               (None, 69, 32)            128       
_________________________________________________________________
batch0 (BatchNormalization)  (None, 69, 32)            128       
_________________________________________________________________
dropout_32 (Dropout)         (None, 69, 32)            0         
_________________________________________________________________
conv1 (Conv1D)               (None, 69, 32)            3104    

In [None]:
# Ask the project helper for the list of input files for TEST_DATA_DIR.
# NOTE(review): `portion=1` presumably means "use all files" — confirm against
# fetch_file_list's definition.
test_file_list = fetch_file_list(data_dir=TEST_DATA_DIR, portion=1)
# Bare expression so the notebook displays the resulting list.
test_file_list

In [30]:
# Create the per-model output directory that the scoring cell writes into.
# The directory name is derived from `model_id` rather than a hardcoded copy of it,
# and exist_ok=True keeps the cell re-runnable; missing parents (./result) are created.
os.makedirs("./result/{}".format(model_id), exist_ok=True)

In [60]:
# Score every remaining test file with both models and write one CSV per file,
# sorted by the XGBoost score in descending order.

# Number of leading rows to skip when reading CSV input.
# NOTE(review): the original passed `self.skip_header`, but there is no `self` at
# notebook scope, so the CSV branch raised NameError. Assumes a single header row —
# TODO confirm against the CSV files actually used.
CSV_SKIP_HEADER = 1

# Make sure the destination exists so this cell does not depend on a separate mkdir cell.
os.makedirs("./result/{}".format(model_id), exist_ok=True)

# NOTE(review): the first entry of test_file_list is skipped; presumably it is handled
# by a separate cell — confirm this is intentional.
for file in test_file_list[1:]:
    if IS_CSV:
        # important to specify dtype explicitly to make sure tf can consume it
        result = np.genfromtxt(file, dtype=np.float64, delimiter=',', skip_header=CSV_SKIP_HEADER)
    else:
        # parquet
        table = pq.read_table(file)
        # important to specify dtype explicitly to make sure tf can consume it
        result = table.to_pandas().to_numpy(dtype=np.float64)

    # Feature columns go to the models; context columns are copied to the output.
    X = result[:, X_COLUMNS]
    Z = result[:, CONTEXT_COLUMNS]

    y_hat_xgb = my_model.predict_proba(X)
    y_hat_tf = tf_model.predict(X)
    # Keep column 1 of each model's output as an (n, 1) column vector
    # (presumably the positive-class probability — confirm for the Keras model).
    y_scores_xgb = y_hat_xgb[:, 1].reshape(-1, 1)
    y_scores_tf = y_hat_tf[:, 1].reshape(-1, 1)

    final_output = np.concatenate((Z, y_scores_xgb, y_scores_tf), axis=1)

    # The XGBoost score is the first column after the context columns, so its index
    # equals the context width (7 with the current CONTEXT_COLUMNS).
    xgb_score_col = Z.shape[1]
    # argsort is ascending; reverse it to put the highest scores first.
    sorted_output = final_output[final_output[:, xgb_score_col].argsort()[::-1]]

    print(file)
    # Name the output after the input file.
    file_id = os.path.basename(file)
    print(file_id)
    np.savetxt("./result/{}/{}_sorted.csv".format(model_id, file_id), sorted_output,
               fmt=['%1.0f', '%4.0f', '%11d', '%11d', '%1.0f', '%5d', '%5d', '%1.3f', '%1.3f'],
               delimiter=',', newline='\n',
               header='label,key,ACTIVITY_ID,ATHLETE_ID,SUPER_USER,NUM_KUDOS,NUM_MATCHING_ACTIVITIES,xgb_score,tf_score')


./testData/set1/part-00003-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
part-00003-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
./testData/set1/part-00001-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
part-00001-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
./testData/set1/part-00007-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
part-00007-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
./testData/set1/part-00005-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
part-00005-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
./testData/set1/part-00000-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
part-00000-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
./testData/set1/part-00002-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
part-00002-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
./testData/set1/part-00004-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.parquet
part-00004-b3d99eb9-a9cf-4bdd-8c66-9942281e354e-c000.gz.

In [50]:
# Load the first test file on its own so its feature and context columns can be
# inspected in the cells that follow.

# Number of leading rows to skip when reading CSV input.
# NOTE(review): the original passed `self.skip_header`, but there is no `self` at
# notebook scope, so the CSV branch raised NameError. Assumes a single header row —
# TODO confirm against the CSV files actually used.
CSV_SKIP_HEADER = 1

file = test_file_list[0]

if IS_CSV:
    # important to specify dtype explicitly to make sure tf can consume it
    result = np.genfromtxt(file, dtype=np.float64, delimiter=',', skip_header=CSV_SKIP_HEADER)
else:
    # parquet
    table = pq.read_table(file)
    # important to specify dtype explicitly to make sure tf can consume it
    result = table.to_pandas().to_numpy(dtype=np.float64)

# Feature columns (model input) and context columns (identifiers / metadata).
X = result[:, X_COLUMNS]
Z = result[:, CONTEXT_COLUMNS]

# y_hat = my_model.predict_proba(X)
# y_scores = y_hat[:, 1].reshape(-1, 1)

# final_output = np.concatenate((Z, y_scores), axis=1)

# sorted_output =  final_output[final_output[:,7].argsort()[::-1]]

# print(file)
# tmp = file.split('/')
# file_id = tmp[-1]
# print(file_id)
# np.savetxt("./result/{}/{}_sorted.csv".format(model_id, file_id), sorted_output,
#            fmt=['%1.0f', '%4.0f', '%38d', '%38d', '%1.0f', '%10.0f', '%10.0f', '%1.3f'],
#            delimiter=',', newline='\n',
#            header='label, key, ACTIVITY_ID, ATHLETE_ID, SUPER_USER, NUM_KUDOS, NUM_MATCHING_ACTIVITIES, score')


In [52]:
# Display column 2 of the context columns; the CSV header written by this notebook
# labels that column ACTIVITY_ID.
Z[:,2]

array([2.78949846e+09, 2.76707502e+09, 2.81108123e+09, ...,
       2.81038845e+09, 2.76364584e+09, 2.75227262e+09])

In [51]:
# Select the context rows whose column 2 (labelled ACTIVITY_ID in the CSV header this
# notebook writes) equals the given id, using a boolean row mask.
Z[Z[:, 2] == 2766317056]


array([], shape=(0, 7), dtype=float64)