## The CrossTree

Voting scheme with multiple binary classification trees. The network implements a voting scheme based on three different trees:

* Cultural **Agnostic-Representative** tree
* Cultural **Agnostic-Exclusive** tree
* Cultural **Exclusive-Representative** tree

The most-voted class is the predicted class.

### Training Phase

The training process is quite standard and straightforward: given the n G_features, we want to directly predict the associated class.

### Employment Phase

The trained model will be inserted into a wider model called X and used as a function for the computation of the G_Factor.

## Dataset

Load the dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from CU_Dataset_Factory import Hf_Loader, CU_Dataset_Factory

def onehot_encode(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    cat_cols: list[str]|None = None,
    num_cols: list[str]|None = None,
    sparse: bool = False
) -> tuple[pd.DataFrame, pd.DataFrame, OneHotEncoder]:
    """
    One-hot encode the categorical columns of df_train and df_test.

    The encoder is fitted on the categories of BOTH frames stacked together,
    so the two returned frames always have the same set of columns, even if
    one frame contains categories that are missing from the other.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training DataFrame.
    df_test : pd.DataFrame
        Testing DataFrame.
    cat_cols : list[str], optional
        List of categorical columns to encode.
        If None, all columns of dtype 'object' in df_train are taken.
    num_cols : list[str], optional
        Accepted for backward compatibility only. The returned frames contain
        just the one-hot columns, so this argument does not affect the result.
    sparse : bool, default=False
        Forwarded to the encoder as ``sparse_output``. The returned frames are
        dense in either case.

    Returns
    -------
    df_train_enc : pd.DataFrame
        One-hot columns for df_train, indexed like df_train.
    df_test_enc : pd.DataFrame
        One-hot columns for df_test, indexed like df_test.
    encoder : OneHotEncoder
        The fitted OneHotEncoder object, useful for future transform.
    """

    # 1) Identify the categorical columns (if not given)
    if cat_cols is None:
        cat_cols = df_train.select_dtypes(include="object").columns.tolist()

    # 2) Fit the encoder on all category data (train + test) so that both
    #    outputs share one column layout
    all_cats = pd.concat([df_train[cat_cols], df_test[cat_cols]],
                         axis=0, ignore_index=True)
    encoder = OneHotEncoder(
        sparse_output=sparse
    ).fit(all_cats)

    # 3) Name the new columns
    ohe_cols = encoder.get_feature_names_out(cat_cols).tolist()

    def _encode(frame: pd.DataFrame) -> pd.DataFrame:
        # Transform one frame and wrap the matrix, keeping the original index.
        # Building the frame from the encoded matrix alone keeps a numeric
        # dtype; stacking it next to string columns would turn it into object.
        encoded = encoder.transform(frame[cat_cols])
        if sparse:
            encoded = encoded.toarray()
        return pd.DataFrame(encoded, columns=ohe_cols, index=frame.index)

    # 4) Transform train and test separately
    return _encode(df_train), _encode(df_test), encoder


  from .autonotebook import tqdm as notebook_tqdm


## Produce the Dataset

In [2]:
# Shared inputs for the two factory.produce calls below.
HF_DATASET = "sapienzanlp/nlp2025_hw1_cultural_dataset"
FEATURE_COLUMNS = ['category', 'subcategory','type','languages','reference','num_langs', 'ambiguos', 'G', 'n_mod', 'back_links']

print('Cultural Dataset argumentation start')
factory = CU_Dataset_Factory('.')
train_l = Hf_Loader(HF_DATASET, 'train')
validation_l = Hf_Loader(HF_DATASET, 'validation')

# Same call for both splits; only the loader and the output file name differ.
for loader, out_file in ((train_l, 'train.tsv'), (validation_l, 'validation.tsv')):
    # list(...) hands every call its own list object, as the literals did.
    factory.produce(loader, out_file, list(FEATURE_COLUMNS), 'label', 16, False)
print('End process')

Cultural Dataset argumentation start


100%|██████████| 126/126 [00:00<00:00, 136.84it/s]
copy dataset: 100%|██████████| 10/10 [00:00<00:00, 1335.72it/s]
  prc_result.loc[:, feature] = prc_result[feature].astype('float64').add(delta, fill_value=0)
G:   3%|▎         | 176/6251 [03:36<3:10:16,  1.88s/it, batch=12]         

KeyboardInterrupt: 

In [None]:
# Read the tab-separated train and validation files into DataFrames.
train, validation = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('train.tsv', 'validation.tsv')
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# Preview the first five rows of the validation frame.
validation.head(n=5)

In [None]:
# Column groups used to slice both frames the same way.
label_cols = ['label']
id_cols = ['wiki_name']
plain_feature_cols = ['languages', 'num_langs', 'reference']
string_feature_cols = ['category', 'subcategory', 'type']

# Target column (kept as a one-column DataFrame).
y_train, y_validation = train[label_cols], validation[label_cols]

# Row identifiers.
id_train, id_validation = train[id_cols], validation[id_cols]

# Features used as they are.
fe_train, fe_validation = train[plain_feature_cols], validation[plain_feature_cols]

# String features that are one-hot encoded later.
fe_str_train, fe_str_validation = train[string_feature_cols], validation[string_feature_cols]

In [None]:
# One encoder per string column; each call yields (train frame, validation frame, encoder).
# The encoders are discarded into `_`, which ends up bound to the last one.
(
    (train_cat, validation_cat, _),
    (train_scat, validation_scat, _),
    (train_t, validation_t, _),
) = (
    onehot_encode(fe_str_train, fe_str_validation, [column])
    for column in ('category', 'subcategory', 'type')
)

In [None]:
# Shapes of the three encoded validation frames, one per line.
for encoded in (validation_cat, validation_scat, validation_t):
    print(encoded.shape)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# pca = PCA(n_components=10)
# validation_cat = pd.DataFrame(pca.fit_transform(validation_cat))
# train_cat = pd.DataFrame(pca.fit_transform(train_cat))
# pca = PCA(n_components=50)
# validation_scat = pd.DataFrame(pca.fit_transform(validation_scat))
# train_scat = pd.DataFrame(pca.fit_transform(train_scat))

In [None]:
# Shapes of the three encoded validation frames, one per line.
print(
    validation_cat.shape,
    validation_scat.shape,
    validation_t.shape,
    sep='\n',
)

In [None]:
# Put the plain features, the three one-hot blocks and the label side by side.
train_parts = [fe_train, train_cat, train_scat, train_t, y_train]
validation_parts = [fe_validation, validation_cat, validation_scat, validation_t, y_validation]

train = pd.concat(train_parts, axis='columns')
validation = pd.concat(validation_parts, axis='columns')

print(train.shape)

In [None]:
# Preview the first three rows of the assembled training frame.
train.head(n=3)

In [None]:
# Preview the first three rows of the assembled validation frame.
validation.head(n=3)

## Network

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

### Agnostic-Representative classifier

In [None]:
# Keep only the training rows labelled 0 or 1, then separate target and features.
d = train[(train['label'] == 0) | (train['label'] == 1)]
y = d['label'].astype(int).to_numpy()
x = d.drop(columns='label').astype(float).to_numpy()

In [None]:
# MLP training is stochastic (weight initialisation, shuffling); a fixed seed
# makes the fitted model, and the report below, reproducible across runs.
ar_tree = MLPClassifier(random_state=42).fit(x, y)

In [None]:
# Same 0-vs-1 subset, taken from the validation frame.
d = validation[(validation['label'] == 0) | (validation['label'] == 1)]
y = d['label'].astype(int).to_numpy()
x = d.drop(columns='label').astype(float).to_numpy()

In [None]:
# Score the 0-vs-1 classifier on the validation subset.
y_pred = ar_tree.predict(x)
report = classification_report(y_true=y, y_pred=y_pred)
print(report)

### Agnostic-Exclusive classifier

In [None]:
# Keep only the training rows labelled 0 or 2, then separate target and features.
d = train[(train['label'] == 0) | (train['label'] == 2)]
y = d['label'].astype(int).to_numpy()
x = d.drop(columns='label').astype(float).to_numpy()

In [None]:
# MLP training is stochastic (weight initialisation, shuffling); a fixed seed
# makes the fitted model, and the report below, reproducible across runs.
ae_tree = MLPClassifier(random_state=42).fit(x, y)

In [None]:
# Same 0-vs-2 subset, taken from the validation frame.
d = validation[(validation['label'] == 0) | (validation['label'] == 2)]
y = d['label'].astype(int).to_numpy()
x = d.drop(columns='label').astype(float).to_numpy()

In [None]:
# Score the 0-vs-2 classifier on the validation subset.
y_pred = ae_tree.predict(x)
report = classification_report(y_true=y, y_pred=y_pred)
print(report)

### Representative-Exclusive classifier

In [None]:
# Keep only the training rows labelled 1 or 2, then separate target and features.
d = train[(train['label'] == 1) | (train['label'] == 2)]
y = d['label'].astype(int).to_numpy()
x = d.drop(columns='label').astype(float).to_numpy()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forests bootstrap rows and sample features at random; a fixed seed
# makes the fitted model, and the report below, reproducible across runs.
re_tree = RandomForestClassifier(random_state=42).fit(x, y)

In [None]:
# Same 1-vs-2 subset, taken from the validation frame.
d = validation[(validation['label'] == 1) | (validation['label'] == 2)]
y = d['label'].astype(int).to_numpy()
x = d.drop(columns='label').astype(float).to_numpy()

In [None]:
# Score the 1-vs-2 classifier on the validation subset.
y_pred = re_tree.predict(x)
report = classification_report(y_true=y, y_pred=y_pred)
print(report)

## Voting Scheme


In [None]:
from scipy.stats import mode
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.ensemble import IsolationForest

# Cultural-classification Network
class CABNet(BaseEstimator, ClassifierMixin):
    """Majority-vote ensemble of three pairwise classifiers.

    Each wrapped estimator is trained on the rows of exactly two labels:

    * ``ar_estimator`` -- labels 0 and 1
    * ``ae_estimator`` -- labels 0 and 2
    * ``re_estimator`` -- labels 1 and 2

    ``predict`` asks all three estimators and returns, per sample, the label
    that collects the most votes.

    Parameters
    ----------
    ar_estimator, ae_estimator, re_estimator
        Objects exposing ``fit(X, y)`` and ``predict(X)``. They are fitted in
        place by :meth:`fit`.
    """

    # (attribute holding the estimator, pair of labels it is trained on)
    _PAIRS = (
        ("ar_estimator", (0, 1)),
        ("ae_estimator", (0, 2)),
        ("re_estimator", (1, 2)),
    )

    def __init__(self, ar_estimator, ae_estimator, re_estimator) -> None:
        # BaseEstimator.get_params() (used by repr, clone and model selection)
        # reads one attribute per constructor argument, under the same name.
        # Storing the estimators under other names makes those calls fail.
        self.ar_estimator = ar_estimator
        self.ae_estimator = ae_estimator
        self.re_estimator = re_estimator

    # Short aliases, kept so that code reading or assigning ``model.ar``,
    # ``model.ae`` or ``model.re`` keeps working.
    @property
    def ar(self):
        return self.ar_estimator

    @ar.setter
    def ar(self, estimator):
        self.ar_estimator = estimator

    @property
    def ae(self):
        return self.ae_estimator

    @ae.setter
    def ae(self, estimator):
        self.ae_estimator = estimator

    @property
    def re(self):
        return self.re_estimator

    @re.setter
    def re(self, estimator):
        self.re_estimator = estimator

    @staticmethod
    def _pair_subset(frame, first, second):
        """Return (features, labels) arrays for the rows labelled first or second."""
        rows = frame[(frame['label'] == first) | (frame['label'] == second)]
        labels = rows['label'].astype(int).to_numpy()
        features = rows.drop(columns='label').astype(float).to_numpy()
        return features, labels

    def fit(self, X, y=None):
        """Fit the three pairwise estimators.

        Parameters
        ----------
        X : pd.DataFrame or array-like
            Either a DataFrame that already contains a ``label`` column (then
            ``y`` is ignored), or a feature table without it (then ``y`` must
            hold the labels).
        y : array-like, optional
            Labels, used only when ``X`` has no ``label`` column.

        Returns
        -------
        self
        """
        frame = X
        has_label = isinstance(X, pd.DataFrame) and 'label' in X.columns
        if y is not None and not has_label:
            # Build the labelled table expected below without touching the
            # caller's object.
            frame = pd.DataFrame(X).copy()
            frame['label'] = np.asarray(y).ravel()

        for attr, (first, second) in self._PAIRS:
            features, labels = self._pair_subset(frame, first, second)
            # Store whatever fit() returns, as the original code did.
            setattr(self, attr, getattr(self, attr).fit(features, labels))
        return self

    def predict(self, X):
        """Return the majority label of the three estimators for each row of X."""
        votes = np.vstack([
            self.ar_estimator.predict(X),
            self.ae_estimator.predict(X),
            self.re_estimator.predict(X),
        ])
        # NOTE(review): the three estimators can all disagree (e.g. votes
        # 1, 0, 2). scipy's mode then appears to return the smallest label,
        # i.e. 0 -- confirm that this tie-break is intended.
        majority, _ = mode(votes, axis=0)
        return np.asarray(majority).ravel()

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, IsolationForest
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Random forests are stochastic; fixed seeds make the ensemble, and the final
# report, reproducible across runs.
model = CABNet(
    RandomForestClassifier(n_estimators=100, random_state=42),
    RandomForestClassifier(n_estimators=500, random_state=42),
    RandomForestClassifier(n_estimators=500, random_state=42),
)

In [None]:
# `train` still contains the 'label' column; CABNet.fit splits it internally.
model = model.fit(X=train)

In [None]:
# Evaluate on the whole validation frame (all labels), split into target and features.
d = validation
y = d['label'].astype(int).to_numpy()
x = d.drop(columns='label').astype(float).to_numpy()

In [None]:
# Score the voting ensemble on the full validation set.
y_pred = model.predict(x)
report = classification_report(y_true=y, y_pred=y_pred)
print(report)