In [66]:
import h5py
import pandas as pd
from tsfresh import select_features, extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from sklearn.model_selection import GridSearchCV
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.cluster import KMeans
import multiprocessing
import pickle


#### Step 1: Parsear time series de train data

In [34]:
# Load the 'x' dataset from train.h5, keep index 0 along the second axis as a
# 2-D table (each row is treated as one series), and split every row into
# overlapping blocks in long format (id / time / valor).
x_train = get_variable_from_file('train', 'x')
x_train_df = pd.DataFrame(x_train[:, 0, :])
x_train_parsed = parsear_time_series(x_train_df)


#### Step 2: Parsear labels de train data

In [39]:
# Load the 'y' dataset from train.h5 and repeat every label once per block,
# so that there is one label for each block cut out of the training series.
y_train = get_variable_from_file('train', 'y')
y_train_parsed = parsear_y(x=x_train, y=y_train)

#### Step 3: Parsear time series de test data

In [22]:
# Same preparation as for train: load 'x' from test.h5, keep index 0 along the
# second axis, and split every row into overlapping blocks.
x_test = get_variable_from_file('test', 'x')
x_test_df = pd.DataFrame(x_test[:, 0, :])
x_test_parsed = parsear_time_series(x_test_df)

#### Step 4: Extraer features de train parseado

In [38]:
# Run the tsfresh extraction (EfficientFCParameters, NaNs replaced by 0) over the training blocks.
features_train = extract_all_features(x_train_parsed) 

#### Step 5: Extraer features de test parseado

In [24]:
# Run the same tsfresh extraction over the test blocks.
features_test = extract_all_features(x_test_parsed) 

#### Step 6: Filtrar features de los features de train

In [40]:
# Let tsfresh's select_features filter the training feature matrix against the per-block labels.
features_train_filtered = select_features(features_train, y_train_parsed)

#### Step 7: Filtrar features de los features de test

In [41]:
# Keep in the test matrix exactly the columns that survived the selection on train.
features_test_filtered = features_test[features_train_filtered.columns]

#### Step 8: Guardar todos los features y label

In [47]:
# Destination files for the cached artefacts (xz-compressed pickles).
output_path_train_x = "./features_blocks/train_features_filtered.pkl"
output_path_train_y = "./features_blocks/train_labels.pkl"
output_path_test_x = "./features_blocks/test_features_filtered.pkl"

# The labels are wrapped in a Series so they can be pickled like the frames.
temporary_y_series = pd.Series(y_train_parsed)

# Make sure each destination folder exists, then write the artefact into it.
for artefact, destination in (
    (features_train_filtered, output_path_train_x),
    (temporary_y_series, output_path_train_y),
    (features_test_filtered, output_path_test_x),
):
    os.makedirs(os.path.dirname(destination), exist_ok=True)
    artefact.to_pickle(destination, compression="xz")

#### Step 9: Entrenar modelos

In [63]:
# Reload the artefacts cached in Step 8 so this cell can run without
# recomputing the features.
features_train_filtered = pd.read_pickle("features_blocks/train_features_filtered.pkl", compression="xz")
features_test_filtered = pd.read_pickle("features_blocks/test_features_filtered.pkl", compression="xz")
y_train_parsed = pd.read_pickle("features_blocks/train_labels.pkl", compression="xz")
# File-name tags for the saved predictions; Step 10 pairs them with `models` by position.
modelsName = ["tree"]
# Grid-search every entry of `modelos` and keep the best estimator found for each one.
models = gridSearchCVOverModels(features_train_filtered.values, y_train_parsed.values)


🔍 Entrenando modelo: Tree
✅ Mejor score: 0.717988163688921 con {'max_depth': 8}


#### Step 10: Predecir y guardar predicciones

In [72]:
# Predict the test blocks with each tuned model and store the raw predictions
# in one .npy file per model tag.
for i, model_name in enumerate(modelsName):
    model = models[i]
    y_pred = model.predict(features_test_filtered.values)
    np.save(f"predicciones_{model_name}_1.npy", y_pred)

#### Step 11: Parsear predicciones a formato .csv

In [73]:
# Turn the stored predictions into a two-column table: a 1-based running ID
# and the predicted value, then export it without the pandas index.
y_pred = np.load("predicciones_tree_1.npy")
submission_ids = np.arange(1, len(y_pred) + 1)
df = pd.DataFrame({"ID": submission_ids, "result": y_pred})

df.to_csv("predicciones_tree_1.csv", index=False)

#### Funciones auxiliares

In [69]:
def get_variable_from_file(path, variable):
    """Read one dataset from an HDF5 file and return it loaded in memory.

    Args:
        path: File path without the ``.h5`` extension (it is appended here).
        variable: Name of the dataset inside the file.

    Returns:
        The dataset contents obtained by slicing it with ``[:]``.
    """
    filename = f'{path}.h5'
    with h5py.File(filename, 'r') as handle:
        contents = handle[f'{variable}'][:]
    # The data was copied out by the slice, so it stays valid after the file closes.
    return contents
# Registry of candidate models consumed by gridSearchCVOverModels:
# display name -> (estimator instance, hyper-parameter grid to search).
modelos = {
    "Tree": (
        DecisionTree(),
        {"max_depth": [6, 8, 10]}  # Can be tuned depending on the case
    ),
}

def gridSearchCVOverModels(features_train, y_blocks):
    """Tune every model registered in ``modelos`` with a 5-fold grid search.

    Args:
        features_train: Feature matrix used to fit the searches.
        y_blocks: Target labels, one per row of ``features_train``.

    Returns:
        A list with the best estimator of each search, in the iteration
        order of ``modelos``.
    """
    winners = []
    for nombre, entry in modelos.items():
        candidate, search_space = entry
        print(f"\n🔍 Entrenando modelo: {nombre}")
        search = GridSearchCV(
            estimator=candidate,
            param_grid=search_space,
            cv=5,
            scoring='accuracy',
        )
        search.fit(features_train, y_blocks)
        print(f"✅ Mejor score: {search.best_score_} con {search.best_params_}")
        winners.append(search.best_estimator_)
    return winners
def contar_bloques_por_serie(longitud_serie, tam_bloque, solapamiento):
    """Count the overlapping blocks of ``tam_bloque`` samples that fit in a series.

    Consecutive blocks start ``tam_bloque - solapamiento`` samples apart and
    only complete blocks are counted.

    Args:
        longitud_serie: Number of samples in the series.
        tam_bloque: Number of samples per block.
        solapamiento: Number of samples shared by two consecutive blocks.

    Returns:
        The number of complete blocks; 0 when the series is shorter than one
        block.

    Raises:
        ValueError: If ``solapamiento`` is not smaller than ``tam_bloque``
            (the stride between blocks would be zero or negative).
    """
    paso = tam_bloque - solapamiento
    if paso <= 0:
        raise ValueError("solapamiento must be smaller than tam_bloque")
    # A series shorter than one block holds no block at all; without this
    # guard the floor division below would yield a negative count.
    if longitud_serie < tam_bloque:
        return 0
    return (longitud_serie - tam_bloque) // paso + 1

def parsear_y(x, y, tam_bloque = 500, solapamiento = 225):
    """Expand per-series labels into per-block labels.

    Args:
        x: Array whose third axis (``x.shape[2]``) is the series length.
        y: Labels to expand.
        tam_bloque: Number of samples per block.
        solapamiento: Number of samples shared by consecutive blocks.

    Returns:
        ``y`` with every element repeated once per block of a series.
    """
    bloques_por_serie = contar_bloques_por_serie(x.shape[2], tam_bloque, solapamiento)
    return np.repeat(y, bloques_por_serie)

def parsear_time_series(dataset: pd.DataFrame, tam_bloque: int = 500, solapamiento: int = 225) -> pd.DataFrame:
    """Split every row of ``dataset`` into overlapping blocks in long format.

    Each row is treated as one series. Blocks of ``tam_bloque`` samples are
    cut every ``tam_bloque - solapamiento`` samples; rows shorter than one
    block contribute nothing. Block ids are consecutive integers starting at
    0 and keep increasing across rows.

    Args:
        dataset: Frame with one series per row.
        tam_bloque: Number of samples per block.
        solapamiento: Number of samples shared by consecutive blocks.

    Returns:
        A frame with columns ``id`` (block id), ``time`` (absolute sample
        index inside the original series) and ``valor`` (sample value). It is
        empty, with those three columns, when no row is long enough.

    Raises:
        ValueError: If ``solapamiento`` is not smaller than ``tam_bloque``.
    """
    paso = tam_bloque - solapamiento
    if paso <= 0:
        raise ValueError("solapamiento must be smaller than tam_bloque")

    valores = dataset.values
    tiempos_bloques = []
    valores_bloques = []
    for serie in valores:
        # With a positive stride the range is empty for rows shorter than one
        # block, so no explicit length check is needed.
        for inicio in range(0, serie.size - tam_bloque + 1, paso):
            fin = inicio + tam_bloque
            tiempos_bloques.append(np.arange(inicio, fin))  # absolute index
            valores_bloques.append(serie[inicio:fin])

    n_bloques = len(valores_bloques)
    if n_bloques == 0:
        return pd.DataFrame(
            {
                "id": np.arange(0, dtype=np.int64),
                "time": np.arange(0),
                "valor": np.array([], dtype=valores.dtype),
            }
        )

    # Build the frame once from flat arrays instead of concatenating one
    # small DataFrame per block.
    return pd.DataFrame(
        {
            "id": np.repeat(np.arange(n_bloques, dtype=np.int64), tam_bloque),
            "time": np.concatenate(tiempos_bloques),
            "valor": np.concatenate(valores_bloques),
        }
    )

def parsear_time_series_without_blocks(dataset: pd.DataFrame, tam_bloque: int = 500, solapamiento: int = 225) -> pd.DataFrame:
    """Split every row of ``dataset`` into overlapping blocks in long format.

    NOTE(review): despite its name, this function has always produced the
    same block-wise output as ``parsear_time_series``; its body was a verbatim
    copy. It now delegates to that function so the logic lives in one place.
    Confirm whether a genuinely block-free variant was intended.

    Args:
        dataset: Frame with one series per row.
        tam_bloque: Number of samples per block.
        solapamiento: Number of samples shared by consecutive blocks.

    Returns:
        Whatever ``parsear_time_series`` returns for the same arguments: a
        frame with columns ``id``, ``time`` and ``valor``.
    """
    return parsear_time_series(dataset, tam_bloque=tam_bloque, solapamiento=solapamiento)


def extract_all_features(df: pd.DataFrame):
    """Extract tsfresh features from a long-format frame of blocks.

    Args:
        df: Frame with an ``id`` column identifying each block and a ``time``
            column used to sort the samples.

    Returns:
        The result of ``extract_features`` with the ``EfficientFCParameters``
        settings, with missing values replaced by 0.
    """
    # Leave one core free, but never ask for fewer than one worker.
    worker_count = max(1, multiprocessing.cpu_count() - 1)
    extracted = extract_features(
        df,
        column_id="id",
        column_sort="time",
        default_fc_parameters=EfficientFCParameters(),
        disable_progressbar=True,
        n_jobs=worker_count,
    )
    return extracted.fillna(0)

def calculate_clusters_based_on_variance(column):
    """Choose a cluster count from the variance of ``column``.

    Returns:
        2 when the variance is below 1, 3 when it is below 10, otherwise 4.
    """
    spread = np.var(column)
    if spread < 1:
        return 2
    return 3 if spread < 10 else 4
    
def discretize_with_kmeans(column):
    """Replace the values of ``column`` with KMeans cluster indices.

    The number of clusters is chosen by
    ``calculate_clusters_based_on_variance``.

    Args:
        column: Object exposing ``.values`` (e.g. a pandas Series).

    Returns:
        The ``labels_`` array of the fitted KMeans model, one cluster index
        per value.
    """
    cluster_count = calculate_clusters_based_on_variance(column)
    # KMeans expects a 2-D (n_samples, n_features) input.
    samples = column.values.reshape(-1, 1)
    model = KMeans(n_clusters=cluster_count, random_state=42)
    return model.fit(samples).labels_

def discretize_all(dataset):
    """Discretize every column of ``dataset`` with ``discretize_with_kmeans``.

    Args:
        dataset: DataFrame whose columns are discretized one by one.

    Returns:
        A new DataFrame with the same columns, each holding cluster indices.
        ``dataset`` itself is left untouched.
    """
    # Work on a copy: assigning into the caller's frame would silently
    # overwrite the original values.
    x_df_discretized = dataset.copy()
    for col in dataset.columns:
        x_df_discretized[col] = discretize_with_kmeans(dataset[col])
    return x_df_discretized


#### Modelos

In [68]:
def gini_impurity(y):
    """Compute the Gini impurity of an array of labels.

    Returns 0 for an empty array; otherwise 1 minus the sum of the squared
    class proportions.
    """
    if len(y) == 0:
        return 0
    squared_shares = 0
    for label in np.unique(y):
        share = np.mean(y == label)  # proportion of samples with this label
        squared_shares += share ** 2
    return 1 - squared_shares

def entropy(y):
    """Compute the base-2 entropy of an array of labels.

    Returns 0 for an empty array; otherwise minus the sum of ``p * log2(p)``
    over the class proportions ``p``.
    """
    if len(y) == 0:
        return 0
    accumulated = 0
    for label in np.unique(y):
        share = np.mean(y == label)  # proportion of samples with this label
        if share > 0:
            accumulated += share * np.log2(share)
    return -accumulated


class DecisionTree:
    """Greedy binary decision-tree classifier with percentile-based thresholds.

    The fitted tree is stored in ``self.tree`` as nested dicts with the keys
    ``"feature"``, ``"threshold"``, ``"left"`` and ``"right"``; a leaf is the
    majority class label of the samples that reached it. Labels must be
    non-negative integers because leaves are computed with ``np.bincount``.

    Args:
        max_depth: Maximum depth of the tree; ``None`` means no limit.
        impurity_function: Callable mapping a label array to its impurity.
        n_bins: Number of evenly spaced percentiles (0 to 100) evaluated as
            candidate thresholds for every feature.
        min_gain: Minimum impurity reduction required to keep a split.
        min_samples_split: Minimum number of samples a node needs in order to
            be split; the same bound is enforced on both children of every
            candidate split.
    """

    def __init__(self, max_depth=None, impurity_function=gini_impurity,
                 n_bins=20, min_gain=1e-7, min_samples_split=2):
        self.max_depth = max_depth
        self.impurity_function = impurity_function
        self.n_bins = n_bins
        self.min_gain = min_gain
        self.min_samples_split = min_samples_split
        self.tree = None

    # --------------- TRAINING ----------------
    def fit(self, X, y):
        """Grow the tree on ``X`` (n_samples, n_features) and labels ``y``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.shape[0] == 0:
            raise ValueError("Cannot fit a DecisionTree on an empty dataset.")
        self.tree = self._build_tree(X, y, depth=0)
        return self

    @staticmethod
    def _leaf_label(y):
        """Return the most frequent label in ``y``."""
        return np.bincount(y).argmax()

    def _information_gain(self, y, left_mask, parent_impurity):
        """Impurity reduction obtained by splitting ``y`` with ``left_mask``.

        Computed as the parent impurity minus the size-weighted impurity of
        the two children. It lives inside the class so the estimator does not
        depend on a module-level ``information_gain`` helper being defined.
        """
        n_total = len(y)
        n_left = np.count_nonzero(left_mask)
        left_impurity = self.impurity_function(y[left_mask])
        right_impurity = self.impurity_function(y[~left_mask])
        weighted = (n_left * left_impurity
                    + (n_total - n_left) * right_impurity) / n_total
        return parent_impurity - weighted

    def _build_tree(self, X, y, depth):
        """Recursively build the subtree for the samples ``X`` / ``y``."""
        n_samples, n_features = X.shape

        # ``is not None`` (rather than truthiness) so that max_depth=0 yields
        # a single leaf instead of an unlimited tree.
        depth_exhausted = (self.max_depth is not None
                           and depth >= self.max_depth)
        if (len(np.unique(y)) == 1
                or depth_exhausted
                or n_samples < self.min_samples_split):
            return self._leaf_label(y)

        # The parent impurity is the same for every candidate split of this node.
        parent_impurity = self.impurity_function(y)

        best_gain = -1
        best_split = None
        best_left_mask = None

        for feature in range(n_features):
            col = X[:, feature]
            thresholds = np.unique(
                np.percentile(col, np.linspace(0, 100, self.n_bins)))

            for thr in thresholds:
                mask = col <= thr
                n_left = np.count_nonzero(mask)
                n_right = n_samples - n_left

                if (n_left < self.min_samples_split
                        or n_right < self.min_samples_split):
                    continue

                gain = self._information_gain(y, mask, parent_impurity)

                if gain > best_gain:
                    best_gain = gain
                    best_split = (feature, thr)
                    best_left_mask = mask

        if best_split is None or best_gain < self.min_gain:
            return self._leaf_label(y)

        feature, thr = best_split
        best_right_mask = ~best_left_mask
        left = self._build_tree(X[best_left_mask], y[best_left_mask], depth + 1)
        right = self._build_tree(X[best_right_mask], y[best_right_mask], depth + 1)

        return {"feature": feature, "threshold": thr,
                "left": left, "right": right}

    # --------------- PREDICTION ----------------
    def predict_one(self, x, node=None):
        """Predict the label of a single sample ``x``.

        Args:
            x: Indexable feature vector.
            node: Subtree to start from; defaults to the root.

        Raises:
            RuntimeError: If called from the root before ``fit``.
        """
        if node is None:
            if self.tree is None:
                raise RuntimeError(
                    "DecisionTree has not been fitted; call fit() first.")
            node = self.tree
        # Walk down until a leaf (anything that is not a dict) is reached.
        while isinstance(node, dict):
            if x[node["feature"]] <= node["threshold"]:
                node = node["left"]
            else:
                node = node["right"]
        return node

    def predict(self, X):
        """Predict the label of every row of ``X`` and return them as an array."""
        return np.array([self.predict_one(x) for x in np.asarray(X)])

    def get_params(self, deep=True):
        """Return every constructor parameter of the model.

        All of them must be listed: scikit-learn rebuilds the estimator from
        this dict when cloning it (e.g. inside ``GridSearchCV``), so a missing
        entry would silently fall back to its default value.
        """
        return {
            "max_depth": self.max_depth,
            "impurity_function": self.impurity_function,
            "n_bins": self.n_bins,
            "min_gain": self.min_gain,
            "min_samples_split": self.min_samples_split,
        }

    def set_params(self, **params):
        """Set the given parameters as attributes and return ``self``."""
        for param, value in params.items():
            setattr(self, param, value)
        return self

    def saveModel(self, filename="best_tree.pkl"):
        """Pickle this model to ``filename``."""
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
