In [1]:
import os, gc
import pickle

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime

TARGET = "source_id"
BASE_MODELS_PATH = "./input/training-cfm"
BASE_DATA_PATH = "./input/make-features-cfm"

print(f"Kernel lancé le : {datetime.now().strftime('%d %b, %H h %M')}")

Kernel lancé le : 20 Jan, 19 h 10


In [2]:
# Read test_data with features
test_data = pd.read_csv(os.path.join(BASE_DATA_PATH, "test_data.csv")).set_index("ID")

# Read the features used
with open (os.path.join(BASE_MODELS_PATH, "features_list.pkl"), 'rb') as fp:
    FEATURES_LIST = pickle.load(fp)

In [3]:
# Load models
models = dict()
xgb_models_paths = [p for p in os.listdir(BASE_MODELS_PATH) if "xgb_model_" in p]

for k, model_path in enumerate(sorted(xgb_models_paths)):
    if "xgb_model_" in model_path:
        xgb_model_loaded = pickle.load(open(os.path.join(BASE_MODELS_PATH, model_path), "rb"))
        models[f"fold_{k}"] = xgb_model_loaded



In [4]:
def add_freq_encoding(fold_number: int, test_df: pd.DataFrame) -> pd.DataFrame:
    FE_data = pd.read_csv(os.path.join(BASE_MODELS_PATH, f"FE_fold{fold_number}.csv"))
    FE_data.index = test_df.index
    test_df = pd.concat([test_df, FE_data], axis=1)
    
    del FE_data; gc.collect()
    
    return test_df

In [5]:
print("Predicting...\n")

variables_to_encode = ["stock_id", "trade_quarter"]

predictions = list()

for k, model_key in enumerate(models.keys()):
    print(f"\t – Fold n°{k + 1}, model = {model_key}")
    
    test_data = add_freq_encoding(k, test_data)

    y_pred = models[model_key].predict_proba(test_data[FEATURES_LIST])
    predictions.append(y_pred)
    
    test_data = test_data.drop([c for c in test_data.columns if "FE_" in c], axis=1)

Predicting...

	 – Fold n°1, model = fold_0
	 – Fold n°2, model = fold_1
	 – Fold n°3, model = fold_2
	 – Fold n°4, model = fold_3
	 – Fold n°5, model = fold_4


In [6]:
print("Processing predictions")

prediction_mode = "proba" # "proba / mode"

proba_pred = np.array(predictions)

if prediction_mode == "mode":
    proba_mode = proba_pred.argmax(axis=2)
    mode_array = mode(np.array(proba_mode))   # ensembling : take the most common predicted class
    mode_pred = mode_array.mode[0]
    preds_csv = pd.Series(mode_pred).to_frame()

elif prediction_mode == "proba":
    proba_pred_reduced = proba_pred.mean(axis=0)    # sum over the folds
    preds_csv = pd.Series(proba_pred_reduced.argmax(axis=1)).to_frame()   # argmax over the venues to select the most probable one
    pred_proba_df = pd.DataFrame(proba_pred_reduced, columns=[f"venue_{i}" for i in range(6)])

else:
    raise ValueError
    
k_random_sub_id = np.random.randint(0, 1000)

preds_csv["ID"] = test_data.index
preds_csv.columns = ["source_id", "ID"]
preds_csv = preds_csv[preds_csv.columns[::-1]]
preds_csv.to_csv(f"pred_test_{prediction_mode}_{k_random_sub_id}.csv", index=False)

pred_proba_df["ID"] = test_data.index
pred_proba_df = pred_proba_df[[pred_proba_df.columns[-1]] + list(pred_proba_df.columns[:-1])]
pred_proba_df.to_csv(f"6venues_pred_test_{prediction_mode}_{k_random_sub_id}.csv", index=False)

print("Test prediction saved!")

Processing predictions
Test prediction saved!


In [7]:
pred_proba_df.head()

Unnamed: 0,ID,venue_0,venue_1,venue_2,venue_3,venue_4,venue_5
0,959506,0.03088,0.193981,0.083651,0.150268,0.361639,0.179581
1,1044642,0.032851,0.21227,0.112116,0.113624,0.237769,0.291371
2,1050806,0.044001,0.075749,0.280369,0.4132,0.134935,0.051746
3,1325166,0.076198,0.075818,0.188369,0.4023,0.219623,0.037692
4,1384745,0.034933,0.149966,0.109717,0.414812,0.180216,0.110355


In [9]:
# check for target distribution

train_data = pd.read_csv(os.path.join(BASE_DATA_PATH, "train_data.csv"))
cm = sns.light_palette("green", as_cmap=True)

num_sample = preds_csv.shape[0]
preds_distrib = 100 * pd.concat([preds_csv["source_id"].value_counts() / num_sample, train_data[TARGET].value_counts() / num_sample], axis=1)
preds_distrib.columns = ["predicted_distribution", "train_set_distribution"]

preds_distrib.style.format("{:.1f}%").background_gradient(cmap=cm, axis=0)

Unnamed: 0,predicted_distribution,train_set_distribution
0,3.0%,3.4%
1,5.9%,15.8%
2,34.8%,26.5%
3,10.1%,12.4%
4,43.7%,33.6%
5,2.6%,7.5%
