# Category Recommendation Random Forest Transfer
---

In [1]:
# imports
import functools
import itertools
import json
import random
import time
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import top_k_accuracy_score, label_ranking_average_precision_score
from sklearn.model_selection import KFold

In [2]:
# Prefix for the per-city input files. It is concatenated directly with
# "Q_<city>.json", so the trailing slash is required.
data_path : str = "./dataset/"
# Directory where the trained transfer model is written by train_model.
models_path : str = "../web/models/transfer"
# Cities whose datasets are loaded and combined in the cells below.
cities =  ["Burgos", "León", "Palencia", "Salamanca", "Valladolid"]

In [3]:
def mean_reciprocal_rank(real_categories, category_probs, categories) -> float:
    """Compute the mean reciprocal rank (MRR) of the true category per sample.

    For every sample the categories are ranked by descending probability and
    the reciprocal of the (1-based) position of the true category is added up.
    A sample whose true category does not appear in ``categories`` contributes
    0. The sum is divided by the number of rows in ``category_probs``.

    Args:
        real_categories: Iterable with the true category of each sample.
        category_probs: Sized iterable of per-sample probability rows, each
            aligned position-by-position with ``categories``.
        categories: Category labels in the same order as the probability
            columns.

    Returns:
        The MRR as a float, or ``0.0`` when ``category_probs`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    sample_size = len(category_probs)
    if sample_size == 0:
        # Nothing to rank: avoid dividing by zero.
        return 0.0

    reciprocal_sum: float = 0.0

    for real, probs in zip(real_categories, category_probs):
        # sorted() is stable, so categories with equal probability keep their
        # original relative order.
        ranking = sorted(zip(categories, probs),
                         key=lambda pair: pair[1], reverse=True)

        for position, (category, _prob) in enumerate(ranking, start=1):
            if category == real:
                reciprocal_sum += 1 / position
                break

    return reciprocal_sum / sample_size


In [4]:
def construct_dataset(data_path: str, city: str) -> tuple:
    """Load one city's JSON file and split it into attributes and labels.

    The file ``f"{data_path}Q_{city}.json"`` is read, flattened with
    ``pd.json_normalize`` and split on the ``'Categoria'`` column.

    Args:
        data_path: Path prefix of the data files; it is concatenated directly
            with the file name, so it must end with a path separator.
        city: City name used to build the file name.

    Returns:
        A tuple ``(X, Y)`` where ``X`` is the DataFrame without the
        ``'Categoria'`` column and ``Y`` is the ``'Categoria'`` Series.
    """
    # Open the data file and convert it to a dataframe. The encoding is given
    # explicitly so decoding does not depend on the platform's locale.
    with open(f"{data_path}Q_{city}.json", "r", encoding="utf-8") as file:
        data = json.load(file)
    df = pd.json_normalize(data)

    # Split into attribute and label dataframes
    X = df.drop('Categoria', axis=1)
    Y = df['Categoria']

    return X, Y

In [5]:
def get_intersection(i,j):
    """Return the sorted values that appear in both ``i`` and ``j``.

    Both arguments must expose ``.unique()`` (e.g. pandas Series); the unique
    values of each are intersected with ``np.intersect1d``.
    """
    unique_i, unique_j = (values.unique() for values in (i, j))
    return np.intersect1d(unique_i, unique_j)

In [6]:
cities_dict = dict()  # NOTE(review): not used by any of the visible cells

# Collect the 'Categoria' label series of every city, one entry per city
vals = []
for city in cities:
    x, y = construct_dataset(data_path,city)  # only the labels (y) are kept; x is unused here

    vals.append(y)


In [7]:
# Categories present in every city: fold np.intersect1d pairwise over the
# per-city label series collected in `vals`.
interseccion = functools.reduce(np.intersect1d, vals)

print(len(interseccion))
print(interseccion)

71
['Alcohol' 'Bakery' 'Bank' 'Bar' 'Beauty' 'Bed' 'Bicycle' 'Bicycle_rental'
 'Books' 'Butcher' 'Cafe' 'Car' 'Car_parts' 'Car_repair'
 'Charging_station' 'Chemist' 'Childcare' 'Clinic' 'Clothes'
 'Community_centre' 'Computer' 'Confectionery' 'Convenience' 'Copyshop'
 'Curtain' 'Dentist' 'Doityourself' 'Driving_school' 'Fast_food' 'Florist'
 'Fountain' 'Fuel' 'Furniture' 'Gift' 'Greengrocer' 'Hairdresser'
 'Hardware' 'Ice_cream' 'Jewelry' 'Kiosk' 'Language_school' 'Laundry'
 'Library' 'Lottery' 'Mobile_phone' 'Nightclub' 'Optician' 'Parking'
 'Pastry' 'Perfumery' 'Pet' 'Pharmacy' 'Photo' 'Place_of_worship'
 'Post_office' 'Pub' 'Restaurant' 'School' 'Seafood' 'Shoes'
 'Social_centre' 'Social_facility' 'Sports' 'Supermarket' 'Taxi' 'Tobacco'
 'Toilets' 'Travel_agency' 'University' 'Variety_store' 'Veterinary']


In [8]:
# Sanity check: print the type of the top-level JSON value of one city file.
# The encoding is explicit so decoding does not depend on the platform locale.
with open(f"{data_path}Q_Burgos.json", "r", encoding="utf-8") as file:
    data = json.load(file)
    print(type(data))


# Set copy of the shared categories: constant-time membership tests instead of
# a linear scan of the numpy array for every entry and every index key.
common_categories = set(interseccion)

# Merged dataset: entries from all cities, restricted to the shared categories
dataset_final = []

for city in cities:
    with open(f"{data_path}Q_{city}.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    for entry in data:
        new_entry = {}
        # Drop entries whose own category is not shared by every city
        if entry["Categoria"] not in common_categories:
            continue
        new_entry["Categoria"] = entry["Categoria"]
        # Keep only the quality indices that refer to shared categories
        new_entry_indices = dict()
        for cat, val in entry["QualityIndices"].items():
            if cat in common_categories:
                new_entry_indices[cat] = val
        new_entry["QualityIndices"] = new_entry_indices
        dataset_final.append(new_entry)



<class 'list'>


In [9]:
# Flatten the nested records into one row per entry; nested keys become
# dot-separated column names (e.g. QualityIndices.<category>.<index>).
df = pd.json_normalize(dataset_final)

print(df.shape)
df.head()

(10400, 285)


Unnamed: 0,Categoria,QualityIndices.Chemist.Qjensen_raw,QualityIndices.Chemist.Qjensen,QualityIndices.Chemist.Qperm,QualityIndices.Chemist.Qperm_raw,QualityIndices.Jewelry.Qjensen_raw,QualityIndices.Jewelry.Qjensen,QualityIndices.Jewelry.Qperm,QualityIndices.Jewelry.Qperm_raw,QualityIndices.Confectionery.Qjensen_raw,...,QualityIndices.Doityourself.Qperm,QualityIndices.Doityourself.Qperm_raw,QualityIndices.Driving_school.Qjensen_raw,QualityIndices.Driving_school.Qjensen,QualityIndices.Driving_school.Qperm,QualityIndices.Driving_school.Qperm_raw,QualityIndices.Fuel.Qjensen_raw,QualityIndices.Fuel.Qjensen,QualityIndices.Fuel.Qperm,QualityIndices.Fuel.Qperm_raw
0,Mobile_phone,35.068196,23.386799,322.537765,552.356647,21.68391,4.536842,252.502075,562.01017,7.758976,...,-106.678484,-111.034128,-27.590111,-32.72631,-35.577908,-8.381273,-117.881834,-118.956658,-145.753964,-153.834858
1,Variety_store,-1.213434,-12.89483,-93.116073,136.702809,0.870369,-16.276699,-163.125833,146.382262,-0.157266,...,-64.203915,-68.559558,-2.966233,-8.102432,18.573371,45.770006,-141.511843,-142.586667,-99.146958,-107.227851
2,Clothes,-6.343462,-18.024858,98.314612,328.133495,34.819324,17.672256,331.074057,640.582152,9.069019,...,-163.262388,-167.618032,-31.56461,-36.70081,-127.83419,-100.637555,-72.998346,-74.07317,-254.133417,-262.21431
3,Supermarket,-5.596865,-17.278262,-109.785012,120.03387,-12.726443,-29.873511,-206.578592,102.929503,0.030916,...,-67.446453,-71.802097,-9.723039,-14.859238,28.472999,55.669634,-161.038041,-162.112865,-116.615563,-124.696456
4,Supermarket,-3.912906,-15.594302,-146.032177,83.786706,-15.984806,-33.131874,-255.922506,53.585589,4.16373,...,-52.317903,-56.673547,-1.259314,-6.395514,24.783156,51.979791,-131.633963,-132.708787,-91.842626,-99.923519


In [10]:
def cross_validate_random_forest(dataframe, k=5, **kwargs):
    """Run k-fold cross-validation of a random forest and report MRR per fold.

    Args:
        dataframe: DataFrame with a ``'Categoria'`` label column; every other
            column is used as an attribute.
        k: Number of folds.
        **kwargs: Extra keyword arguments forwarded to
            ``RandomForestClassifier``. ``n_estimators`` defaults to 100 and
            ``n_jobs`` to -1 when not supplied.

    Returns:
        List with the mean reciprocal rank obtained on each fold.

    Note:
        The shuffle seed of the KFold splitter is drawn with
        ``random.randint`` on every call, so the folds differ between runs.
    """
    # Previously hard-coded classifier settings act as defaults; anything the
    # caller passes in kwargs overrides them instead of being silently ignored.
    classifier_params = {"n_estimators": 100, "n_jobs": -1, **kwargs}

    # Create the KFold object used for cross-validation
    kf = KFold(n_splits=k, shuffle=True, random_state=random.randint(0, 100))

    # Attribute and label dataframes
    X = dataframe.drop('Categoria', axis=1)
    Y = dataframe['Categoria']

    folds_mrr = []

    for n_fold, (train, test) in enumerate(kf.split(X), start=1):
        print(f">>> Fold: {n_fold}")
        classifier = RandomForestClassifier(**classifier_params)
        fold_time = time.time()

        # Training set
        df_train_x = X.iloc[train]
        df_train_y = Y.iloc[train]

        # Test set
        df_test_x = X.iloc[test]
        df_test_y = Y.iloc[test]

        # Train the model
        classifier.fit(df_train_x, df_train_y)

        # Per-class probabilities; columns follow classifier.classes_
        preds = classifier.predict_proba(df_test_x)

        # Compute the MRR for this fold
        mrr = mean_reciprocal_rank(df_test_y, preds, classifier.classes_)
        folds_mrr.append(mrr)
        print(f"> MRR: {mrr}")
        print(f"> Tiempo fold {n_fold}: {time.time() - fold_time :.1f} s")

    return folds_mrr


In [11]:
# NOTE(review): the output recorded below lists only 5 folds although k=10 is
# passed here — it looks stale; re-run the cell to refresh it.
cross_validate_random_forest(df, k=10)

>>> Fold: 1
> MRR: 0.29710331724087136
> Tiempo fold 1: 10.9 s
>>> Fold: 2
> MRR: 0.30650537600267264
> Tiempo fold 2: 10.4 s
>>> Fold: 3
> MRR: 0.29807977959725873
> Tiempo fold 3: 9.2 s
>>> Fold: 4
> MRR: 0.2918661934152525
> Tiempo fold 4: 9.1 s
>>> Fold: 5
> MRR: 0.3020883581193072
> Tiempo fold 5: 11.4 s


[0.29710331724087136,
 0.30650537600267264,
 0.29807977959725873,
 0.2918661934152525,
 0.3020883581193072]

In [12]:
def train_model(df):
    """Train a random forest on the whole dataframe and save it to disk.

    Args:
        df: DataFrame with a ``'Categoria'`` label column; every other column
            is used as an attribute.

    Side effects:
        Prints the fit time and writes the fitted classifier to
        ``f"{models_path}/transfer.gz"`` with joblib (compress=5). The target
        directory is created if it does not exist.
    """
    # Make sure the output directory exists before spending time on training;
    # otherwise joblib.dump would fail only after the fit has finished.
    Path(models_path).mkdir(parents=True, exist_ok=True)

    train_time = time.time()

    # Attribute and label dataframes
    df_attr = df.drop('Categoria', axis=1)
    df_y = df['Categoria']

    classifier = RandomForestClassifier(n_estimators=100, n_jobs=-1)

    classifier.fit(df_attr, df_y)
    print(f"> Tiempo train: {time.time() - train_time :.1f} s")

    joblib.dump(classifier, f"{models_path}/transfer.gz", compress=5)

In [13]:

# Fit the final model on the full merged dataset and write it to models_path
train_model(df)

> Tiempo train: 12.6 s
