In [1]:
from mlquantify.methods import *
import mlquantify as mq
from mlquantify.evaluation.protocol import APP
from sklearn.model_selection import train_test_split
import numpy as np
from joblib import Parallel, delayed
import os

In [2]:
FILES = os.listdir('datasets com score')
FILES

['bourbon.csv',
 'continental.csv',
 'foz_plaza.csv',
 'nadai.csv',
 'taroba.csv',
 'viale_cataratas.csv',
 'viale_tower.csv']

In [None]:
class HotelProtocol(APP):
    def __init__(self,     
                 models, 
                 batch_size,
                 learner = None, 
                 n_prevs = 50,
                 n_iterations = 3,
                 n_jobs = -1,
                 random_state = 32,
                 verbose = False,
                 return_type = "predictions",
                 measures = None):
        
        super().__init__(models=models,
                         batch_size=batch_size,
                         learner=learner,
                         n_jobs=n_jobs,
                         random_state=random_state,
                         verbose=verbose,
                         return_type=return_type,
                         measures=measures)
        self.n_prevs = n_prevs
        self.batch_size = batch_size if isinstance(batch_size, list) else [batch_size]
        self.n_prevs = n_prevs
        self.n_iterations = n_iterations
    
    
    def fit(self, X_train, y_train, train_scores):
        args = ((model, X_train, y_train, train_scores) for model in self.models)
        
        wrapper = tqdm if self.verbose else lambda x, **kwargs: x
    
        self.models = Parallel(n_jobs=self.n_jobs)(  # Parallel processing of models
            delayed(self._delayed_fit)(*arg) for arg in wrapper(args, desc="Fitting models", total=len(self.models))
        )
        return self
    
    
    def _predict(self, iteration, X, y, model, prev, batch_size, verbose):
        model_name = model.__class__.__name__
        if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
            model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"
        
        X_sample, _ = self._new_sample(X, y, prev, batch_size)
          
        posteriors_test = pd.DataFrame({
            '0': abs(1 - X_sample["score"]),
            '1': X_sample["score"]
        })
        
        X_sample = X_sample.drop(columns=["score"])
        
        y_pred = (posteriors_test.iloc[:, 1] > 0.5).astype(int)
        
        
        mq.set_arguments(posteriors_test=posteriors_test,
                      y_pred=y_pred)
        
        prev_pred = np.asarray(list(model.predict(X_sample).values()))
        
        return (iteration+1, model_name, prev, prev_pred, batch_size)
    
    
    
    
    
    def _delayed_fit(self, model, X_train, y_train, train_scores):
        model_name = model.__class__.__name__
        if model_name == "Ensemble" and isinstance(model.base_quantifier, Quantifier):
            model_name = f"{model.__class__.__name__}_{model.base_quantifier.__class__.__name__}_{model.size}"

        y_train_pred = (train_scores.iloc[:, 1] > 0.5).astype(int)

        mq.set_arguments(posteriors_train=train_scores,
                        y_labels=y_train,
                        y_pred_train=y_train_pred
                        )
        model = model.fit(X=X_train, y=y_train)

        return model

In [None]:
final_table = pd.DataFrame()

mq.methods.METHODS.pop("ENSEMBLE", None)
mq.methods.METHODS.pop("DySsyn", None)
# mq.methods.METHODS.pop("HDx", None)
# mq.methods.METHODS.pop("GAC", None)

for file in FILES:
    name = file.split('.')[0]
    dataset = pd.read_csv(f"datasets com score/{file}")
    dataset['date'] = pd.to_datetime(dataset['date'])
    dataset.sort_values('date', inplace=True)
    i = 0
    aspects = dataset["aspect"].unique()
    for aspect in aspects:
        print(f"Processing aspect {i+1}/{len(aspects)} -> {aspect}")
        i = i + 1
        df = dataset[dataset["aspect"] == aspect]
        
        if df['class'].value_counts().min() < 2:
            train_df, test_df = train_test_split(df, test_size=0.3, random_state=32)
        else:
            train_df, test_df = train_test_split(df, test_size=0.3, stratify=df['class'], random_state=32)

        X_train = train_df.drop(['class', 'score', 'date'], axis=1)
        y_train = train_df['class']
        X_test = test_df.drop(['class', 'date'], axis=1)
        y_test = test_df['class']

        print(f"Train samples: {len(y_train)}, Test samples: {len(y_test)}")

        train_scores = pd.DataFrame({
            '0': abs(1 - train_df["score"]),
            '1': train_df["score"]
        })

        batch_sizes = list(np.linspace(25, len(y_test), 5).astype(int))

        mq.ARGUMENTS_SETTED = True
        hotel_protocol = HotelProtocol(models="all",
                                    n_prevs=50,
                                    batch_size=batch_sizes,
                                    return_type="table",
                                    measures=["mae", "rae"])
        
        hotel_protocol.fit(X_train, y_train, train_scores=train_scores)

        table = hotel_protocol.predict(X_test, y_test)
        
        table["hotel"] = name
        table["aspect"] = aspect

        final_table = pd.concat([final_table, table], ignore_index=True)

final_table.to_csv("quantification_results.csv", index=False)
final_table.head()


Processing aspect 1/5 -> Atendimento da equipe
Train samples: 367, Test samples: 158


Running APP:   1%|▏         | 54/4275 [00:00<00:21, 200.40it/s]
  """
  """
  """
  """


TypeError: '>=' not supported between instances of 'str' and 'float'