# Category Recommendation Random Forest
---

In [285]:
# imports
import json
import time
import random
import pickle

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import top_k_accuracy_score, label_ranking_average_precision_score

In [286]:
# Directory (with trailing slash) that holds the per-city JSON datasets.
data_path: str = "./dataset/"

# Cities that have a dataset file and are evaluated in turn.
cities = [
    "Burgos",
    "León",
    "Palencia",
    "Salamanca",
    "Valladolid",
]


In [287]:
def construct_dataset(data_path: str) -> dict:
    """Load a JSON dataset and separate single-occurrence categories.

    The JSON file is flattened into a dataframe with ``pd.json_normalize``.
    Rows whose ``'Categoria'`` value appears exactly once in the file are set
    aside from the rest, and both groups are split into attributes and labels.

    Args:
        data_path: Path to the JSON file to load.

    Returns:
        A dict with four entries:
            ``"single_categories_X"``: attributes of the rows whose category
                appears only once.
            ``"single_categories_Y"``: labels of those rows.
            ``"X"``: attributes of the remaining rows.
            ``"Y"``: labels of the remaining rows.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(data_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    df = pd.json_normalize(data)

    # Count how many times each category appears
    category_count = df['Categoria'].value_counts()

    # Flag the rows that belong to a category with a single occurrence
    single_labels = category_count[category_count == 1].index
    is_single = df['Categoria'].isin(single_labels)

    # Dataframes for the single-occurrence categories
    single_categories = df[is_single]
    single_categories_attr = single_categories.drop('Categoria', axis=1)
    single_categories_y = single_categories['Categoria']

    # Remaining rows, selected by mask so the split does not rely on index
    # labels being unique
    df = df[~is_single]

    # Split into attribute and label dataframes
    df_attr = df.drop('Categoria', axis=1)
    df_y = df['Categoria']

    dataset = {
        "single_categories_X": single_categories_attr,
        "single_categories_Y": single_categories_y,
        "X": df_attr,
        "Y": df_y,
    }

    return dataset

In [288]:
def mean_reciprocal_rank(real_categories, category_probs, categories) -> float:
    """Compute the mean reciprocal rank (MRR) of the true categories.

    For every sample the categories are ranked by descending probability and
    the sample contributes ``1 / position`` of its true category (1-based).
    A sample whose true category is not in ``categories`` contributes 0.

    Args:
        real_categories: True category of each sample.
        category_probs: One sequence of probabilities per sample, aligned
            element-wise with ``categories``.
        categories: Category labels, in the same order as the probabilities.

    Returns:
        The sum of reciprocal ranks divided by ``len(category_probs)``, or
        ``0.0`` when ``category_probs`` is empty.
    """
    # len() rather than truthiness: category_probs may be a NumPy array,
    # whose truth value is ambiguous.
    sample_size = len(category_probs)
    if sample_size == 0:
        # Nothing to average; avoid a ZeroDivisionError.
        return 0.0

    reciprocal_sum = 0.0
    for real, probs in zip(real_categories, category_probs):
        # sorted() is stable, so categories with equal probability keep
        # their original relative order.
        ranking = sorted(
            zip(categories, probs),
            key=lambda pair: pair[1],
            reverse=True,
        )

        for position, (category, _) in enumerate(ranking, start=1):
            if category == real:
                reciprocal_sum += 1 / position
                break

    return reciprocal_sum / sample_size


In [289]:
def cross_validate_random_forest(dataset, k=5, **kwargs):
    """Cross-validate a random forest and return the MRR of every fold.

    The ``"X"`` / ``"Y"`` entries of ``dataset`` are split with a shuffled
    ``KFold``. The ``"single_categories_X"`` / ``"single_categories_Y"``
    entries are appended to the training portion of every fold and are never
    part of a test split.

    Args:
        dataset: Dict with the keys ``"X"``, ``"Y"``, ``"single_categories_X"``
            and ``"single_categories_Y"``.
        k: Number of folds.
        **kwargs: If ``city`` is given, it is printed as a header before the
            folds are run.

    Returns:
        A list with one MRR value per fold, in fold order.
    """
    # The shuffle seed is drawn at random on each call, so the splits differ
    # between runs.
    splitter = KFold(n_splits=k, shuffle=True, random_state=random.randint(0, 100))

    # Regular samples (split across folds)
    features, labels = dataset["X"], dataset["Y"]

    # Samples whose category appears only once (always used for training)
    rare_features = dataset["single_categories_X"]
    rare_labels = dataset["single_categories_Y"]

    if "city" in kwargs:
        print(f"[{kwargs['city']}]")

    folds_mrr = []
    for n_fold, (train_idx, test_idx) in enumerate(splitter.split(features), start=1):
        print(f">>> Fold: {n_fold}")
        forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
        started = time.time()

        # Train on this fold's training rows plus the single-occurrence rows
        forest.fit(
            pd.concat([features.iloc[train_idx], rare_features]),
            pd.concat([labels.iloc[train_idx], rare_labels]),
        )

        # Per-class probabilities for the held-out rows
        probabilities = forest.predict_proba(features.iloc[test_idx])

        # Score the fold
        mrr = mean_reciprocal_rank(labels.iloc[test_idx], probabilities, forest.classes_)
        folds_mrr.append(mrr)

        elapsed = time.time() - started
        print(f"> MRR: {mrr}")
        print(f"> Tiempo fold {n_fold}: {elapsed:.1f} s")

    return folds_mrr


In [290]:
# Per-city list of fold MRR values, keyed by city name.
cities_mrr = {}

for city in cities:
    # Load this city's dataset, then cross-validate the random forest on it.
    dataset = construct_dataset(f"{data_path}Q_{city}.json")
    city_folds = cross_validate_random_forest(dataset, city=city, k=5)
    cities_mrr[city] = city_folds

print(cities_mrr)


[Burgos]
>>> Fold: 1
> MRR: 0.279678304926792
> Tiempo fold 1: 87.3 s
>>> Fold: 2
> MRR: 0.2805446374407585
> Tiempo fold 2: 93.0 s
>>> Fold: 3
> MRR: 0.27692012011684014
> Tiempo fold 3: 93.7 s
>>> Fold: 4
> MRR: 0.26977357818943204
> Tiempo fold 4: 94.4 s
>>> Fold: 5
> MRR: 0.2728845555580882
> Tiempo fold 5: 88.0 s
[León]
>>> Fold: 1
> MRR: 0.3564057188871915
> Tiempo fold 1: 4.8 s
>>> Fold: 2
> MRR: 0.38140156749338094
> Tiempo fold 2: 4.9 s
>>> Fold: 3
> MRR: 0.3899625789896216
> Tiempo fold 3: 5.0 s
>>> Fold: 4
> MRR: 0.4000012657797871
> Tiempo fold 4: 5.0 s
>>> Fold: 5
> MRR: 0.4061514574549984
> Tiempo fold 5: 5.1 s
[Palencia]
>>> Fold: 1
> MRR: 0.33392707324976584
> Tiempo fold 1: 8.7 s
>>> Fold: 2
> MRR: 0.28395809897386326
> Tiempo fold 2: 8.7 s
>>> Fold: 3
> MRR: 0.3303125095811895
> Tiempo fold 3: 8.8 s
>>> Fold: 4
> MRR: 0.24540892010671564
> Tiempo fold 4: 8.3 s
>>> Fold: 5
> MRR: 0.2943554943506483
> Tiempo fold 5: 8.4 s
[Salamanca]
>>> Fold: 1
> MRR: 0.217707390481770

In [291]:
# Read the existing results first. The output file is opened only once the
# merged data is ready: opening it in "w" mode truncates it immediately, so a
# failure while reading or merging would otherwise leave it empty.
with open("./MRR.json", "r", encoding="utf-8") as read_json:
    other_mrr = json.load(read_json)

# Add the fold-averaged random-forest MRR of every city.
for city in cities:
    other_mrr[city]["Qrandom_forest"] = np.average(cities_mrr[city])

with open("./CompleteMRR.json", "w", encoding="utf-8") as write_json:
    json.dump(other_mrr, write_json, indent=4, ensure_ascii=False)