In [1]:
import pandas as pd 
from tqdm import tqdm
from sklearn.cluster import KMeans, HDBSCAN
import numpy as np
import os

def cluster_embeddings(embeddings, n_clusters = 9, standardize = False):
    if standardize:
        mean = embeddings.mean(axis=0)
        std = embeddings.std(axis=0)
        embeddings = (embeddings - mean) / std

    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)
    centroids_count = np.bincount(kmeans.labels_)
    return kmeans.cluster_centers_, centroids_count

In [39]:
from scipy.spatial.distance import cdist
def construct_transition_vector(clusters_start, counts_start, clusters_end, counts_end):
    distance_matrix = cdist(clusters_start, clusters_end)

    flattened = [
        [distance_matrix[i,j], i, j]
        for i in range(len(clusters_start))
        for j in range(len(clusters_end))
    ]

    flattened = sorted(flattened, key=lambda x: (x[1], x[0]))

    transition_vector = []


    for distance, start, end in flattened:
        transition_vector.append(distance * (counts_end[end] - counts_start[start])/counts_start[start])

    new_order = []
    seen_left_indexes = set()

    for match in flattened:
        _, left_index, right_index = match
        if left_index not in seen_left_indexes:
            seen_left_indexes.add(left_index)
            new_order.append(right_index)
    
    return transition_vector, new_order

In [49]:
def convert_to_intertime_embeddings(df):
    sorted_idx = np.argsort(df["Clusters"][0][1])
    prev_clusters = df["Clusters"][0][0][sorted_idx]
    prev_counts = df["Clusters"][0][1][sorted_idx]

    intertime_embeddings = []
    intertime_change = []

    for i in range(1, len(df)):
        transition_vector, new_order = construct_transition_vector(prev_clusters, prev_counts, df["Clusters"][i][0], df["Clusters"][i][1])
        prev_clusters = df["Clusters"][i][0][new_order]
        prev_counts = df["Clusters"][i][1][new_order]
        intertime_embeddings.append(transition_vector)
        intertime_change.append(df["EventType"][i])#intertime_change.append(np.abs(df["EventType"][i-1] - df["EventType"][i])) #Should this be the change or if the next event is a sub-event?

    intertime_embeddings = np.array(intertime_embeddings)
    intertime_change = np.array(intertime_change)
    return intertime_embeddings, intertime_change

In [50]:
tqdm.pandas()

files = os.listdir('processed_data/train_data')

all_intertime_embeddings = None
all_intertime_change = None

for i, file in enumerate(files):
    print(f"Processing {file} ({i+1}/{len(files)})")
    if file.endswith('.pkl'):
        df = pd.read_pickle(f'processed_data/train_data/{file}')
        df["Clusters"] = df["Embeddings"].progress_apply(cluster_embeddings)
        df.drop(columns=['Embeddings'], inplace=True)
        intertime_embeddings, intertime_change = convert_to_intertime_embeddings(df)

        if i == 0:
            all_intertime_embeddings = intertime_embeddings
            all_intertime_change = intertime_change
        else:
            all_intertime_embeddings = np.vstack((all_intertime_embeddings, intertime_embeddings))
            all_intertime_change = np.hstack((all_intertime_change, intertime_change))
        
        del df

Processing GermanyUSA57.pkl (1/16)


100%|██████████| 130/130 [00:05<00:00, 24.19it/s]


Processing ArgentinaBelgium72.pkl (2/16)


100%|██████████| 130/130 [00:06<00:00, 19.06it/s]


Processing ArgentinaGermanyFinal77.pkl (3/16)


100%|██████████| 180/180 [00:17<00:00, 10.31it/s]


Processing USASlovenia2010.pkl (4/16)


100%|██████████| 130/130 [00:03<00:00, 36.91it/s]


Processing GermanyAlgeria67.pkl (5/16)


100%|██████████| 170/170 [00:21<00:00,  8.02it/s]


Processing MexicoCroatia37.pkl (6/16)


100%|██████████| 130/130 [00:02<00:00, 59.93it/s]


Processing FranceGermany70.pkl (7/16)


100%|██████████| 130/130 [00:12<00:00, 10.58it/s]


Processing FranceNigeria66.pkl (8/16)


100%|██████████| 130/130 [00:08<00:00, 15.32it/s]


Processing AustraliaNetherlands29.pkl (9/16)


100%|██████████| 97/97 [00:01<00:00, 52.53it/s]


Processing HondurasSwitzerland54.pkl (10/16)


100%|██████████| 130/130 [00:00<00:00, 132.54it/s]


Processing AustraliaSpain34.pkl (11/16)


100%|██████████| 130/130 [00:01<00:00, 100.77it/s]


Processing PortugalGhana58.pkl (12/16)


100%|██████████| 130/130 [00:05<00:00, 22.59it/s]


Processing GermanyBrazil74.pkl (13/16)


100%|██████████| 130/130 [00:25<00:00,  5.08it/s]


Processing BelgiumSouthKorea59.pkl (14/16)


100%|██████████| 130/130 [00:02<00:00, 57.18it/s]


Processing CameroonBrazil36.pkl (15/16)


100%|██████████| 130/130 [00:02<00:00, 58.64it/s]


Processing NetherlandsChile35.pkl (16/16)


100%|██████████| 130/130 [00:01<00:00, 68.33it/s]


In [52]:
from sklearn.model_selection import train_test_split, cross_val_score

#X_train, X_test, y_train, y_test = train_test_split(all_intertime_embeddings, all_intertime_change, test_size=0.2, random_state=42)


In [58]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Create XGBoost
model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.1,
    max_depth=15,
    random_state=42)

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10, 15]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=2)

# Fit the grid search to the data
grid_search.fit(all_intertime_embeddings, all_intertime_change)

# Print the best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=100; total time=   0.2s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.3s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=200; total time=   0.4s
[CV] END ..learning_rate=0.01, max_depth=3, n_estimators=300; total time=   0.5s
[CV] END ..learning_rate=0.01, max_depth=3, n_



[CV] END .learning_rate=0.01, max_depth=10, n_estimators=200; total time=   4.8s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=200; total time=   5.0s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=300; total time=   6.1s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=300; total time=   6.8s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=300; total time=   6.5s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=300; total time=   6.1s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=300; total time=   6.8s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=400; total time=   7.4s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=400; total time=   8.3s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=400; total time=   7.6s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=400; total time=   8.0s
[CV] END .learning_rate=0.01, max_depth=10, n_estimators=400; total time=   8.3s
[CV] END .learning_rate=0.01