## The CrossTree

Voting Schema with multiple binary classification trees. The network implements a voting scheme based on three different trees:

* Cultural **Agnostic-Rappresentative** tree
* Cultural **Agnostic-Exclusive** tree
* Cultural **Exclusive-Rappresentative** tree

the most voted class will be the predicted class.

### Training Phase

The training process is quite standard and straight-forward: given the n G_features we want to directly predict the associated class.

### Employment Phase

The training model will be inserted in a wider model called X and utilized as a function for the computation of the G_Factor

## Dataset

Load the dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from CU_Dataset_Factory import Hf_Loader, CU_Dataset_Factory

def onehot_encode(
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    cat_cols: list[str]|None = None,
    num_cols: list[str]|None = None,
    sparse: bool = False
) -> tuple[pd.DataFrame, pd.DataFrame, OneHotEncoder]:
    
    """
    Applies One-Hot Encoding to df_train and df_test guaranteeing the same
    set of columns, even if train is missing categories who are in the test set.

    Parameters
    ----------
    df_train : pd.DataFrame
        Training DataFrame.
    df_test : pd.DataFrame
        Testing DataFrame.
    cat_cols : list[str], optional
        List of categorical columns to encode.
        If None, all columns of type 'object' are taken.
    num_cols : list[str], optional
        List of numerical (or non-categorical) columns to preserve.
        If None, all columns not in cat_cols are taken. 
    handle_unknown : str, default="ignore"
        Beahavior on unknown values in test (typically "ignore").
    sparse : bool, default=False
        If True, returns sparse matrix, otherwise dense.

    Returns
    -------
    df_train_enc : pd.DataFrame
        Training DataFrame with One-Hot Encoding + original num_cols.
    df_test_enc : pd.DataFrame
        Testing DataFrame with One-Hot Encoding + original num_cols.
    encoder : OneHotEncoder
        The fitted OneHotEncoder object, useful for future transform.
    """
    
    # 1) Identify category and numerical columns (if not given)
    if cat_cols is None:
        cat_cols = df_train.select_dtypes(include="object").columns.tolist()
    if num_cols is None:
        num_cols = [c for c in df_train.columns if c not in cat_cols]

    # 2) Fit encoder on all category data (train + test)
    all_cats = pd.concat([df_train[cat_cols], df_test[cat_cols]], 
                         axis=0, ignore_index=True)
    encoder = OneHotEncoder(
        sparse_output=sparse
    ).fit(all_cats)

    # 3) Transform separatly train and test
    X_train_ohe = encoder.transform(df_train[cat_cols])
    X_test_ohe  = encoder.transform(df_test[cat_cols])

    # 4) Name the new columns
    ohe_cols = encoder.get_feature_names_out(cat_cols).tolist()

    # 5) Compose the final DataFrames
    df_train_enc = pd.DataFrame(
        np.hstack([X_train_ohe.toarray() if sparse else X_train_ohe,
                   df_train[num_cols].values]), # type: ignore
        columns=ohe_cols + num_cols,
        index=df_train.index
    )
    df_test_enc = pd.DataFrame(
        np.hstack([X_test_ohe.toarray() if sparse else X_test_ohe,
                   df_test[num_cols].values]),
        columns=ohe_cols + num_cols,
        index=df_test.index
    )

    return df_train_enc[ohe_cols], df_test_enc[ohe_cols], encoder


  from .autonotebook import tqdm as notebook_tqdm


## Produce the Dataset

In [2]:
print('Cultural Dataset argumentation start')
factory = CU_Dataset_Factory('.')
train_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'train')
validation_l = Hf_Loader("sapienzanlp/nlp2025_hw1_cultural_dataset", 'validation')

factory.produce(train_l, 'train.tsv', ['languages', 'num_langs', 'reference', 'n_mod', 'back_links'], 'label', 16, False)
factory.produce(validation_l, 'validation.tsv', ['languages', 'num_langs', 'reference', 'n_mod', 'back_links'], 'label', 16, False)
print('End process')

Cultural Dataset argumentation start


100%|██████████| 126/126 [00:00<00:00, 460.99it/s]
copy dataset: 100%|██████████| 5/5 [00:00<00:00, 1175.47it/s]
n_mod:   0%|          | 0/6251 [00:06<?, ?it/s, batch=1]        

KeyboardInterrupt: 

In [None]:
train = pd.read_csv('train.tsv', sep='\t')
validation = pd.read_csv('validation.tsv', sep='\t')

In [None]:
train.head(5)

Unnamed: 0.1,Unnamed: 0,back_links,n_mod,G_num_components,G_avg,G_nodes,G_largest_component_size,G_mean_pr,G_density,G_num_cliques,num_langs,reference,languages,type,subcategory,category,wiki_name,qid,label
0,0,36,79,1.0,1.0,1.0,1.0,1.0,0.0,1.0,6,5,2,entity,film,films,916 (film),Q32786,1
1,1,222,614,1.0,1.417582,182.0,182.0,0.004405,0.013903,179.0,30,40,8,entity,musical group,music,!!!,Q371,2
2,2,24,16,1.0,3.2,5.0,5.0,0.244156,0.9,2.0,4,1,3,entity,comics,comics and anime,¡Soborno!,Q3729947,2
3,3,227,2375,1.0,1.455357,224.0,224.0,0.003656,0.011451,231.0,38,67,9,entity,musical group,music,+44 (band),Q158611,2
4,4,85,30,1.0,1.339623,159.0,159.0,0.004751,0.015206,153.0,2,7,1,entity,building,architecture,"1 Monk Street, Monmouth",Q280375,1


In [None]:
validation.head(5)

Unnamed: 0.1,Unnamed: 0,back_links,n_mod,G_num_components,G_avg,G_nodes,G_largest_component_size,G_mean_pr,G_density,G_num_cliques,num_langs,reference,languages,type,subcategory,category,wiki_name,qid,label
0,0,2582,784,1.0,1.0,4.0,4.0,0.264605,0.5,3.0,62,40,10,entity,sports club,sports,1. FC Nürnberg,Q15786,2
1,1,27,25,1.0,1.0,1.0,1.0,1.0,0.0,1.0,7,4,4,entity,record label,music,77 Records,Q268530,1
2,2,841,2698,1.0,1.452888,329.0,329.0,0.002405,0.007525,343.0,68,166,10,entity,animated film,comics and anime,A Bug's Life,Q216153,2
3,3,21,36,1.0,1.0,1.0,1.0,1.0,0.0,1.0,12,3,7,entity,film,films,A Gang Story,Q593,1
4,4,1940,1653,1.0,1.43617,188.0,188.0,0.004044,0.013369,177.0,60,91,10,entity,choreographer,performing arts,Aaron Copland,Q192185,2


In [None]:
y_train = train[['label']]
y_validation = validation[['label']]

id_train = train[['wiki_name']]
id_validation = validation[['wiki_name']]

fe_train = train[['languages', 'num_langs', 'reference','G_mean_pr']]
fe_validation = validation[['languages', 'num_langs', 'reference','G_mean_pr']]

fe_str_train = train[['category', 'subcategory', 'type']]
fe_str_validation = validation[['category', 'subcategory', 'type']]

In [None]:
train_cat, validation_cat, _ =  onehot_encode(fe_str_train, fe_str_validation, ['category'] )
train_scat, validation_scat, _ = onehot_encode(fe_str_train, fe_str_validation, ['subcategory'] )
train_t, validation_t, _ = onehot_encode(fe_str_train, fe_str_validation, ['type'] )

In [None]:
print(validation_cat.shape)
print(validation_scat.shape)
print(validation_t.shape)

(300, 19)
(300, 112)
(300, 2)


In [None]:
train = pd.concat([fe_train, train_cat, train_scat, train_t, y_train], axis=1)
validation = pd.concat([fe_validation, validation_cat, validation_scat, validation_t, y_validation], axis=1) 

print(train.shape)

(6251, 138)


In [None]:
train.head(3)

Unnamed: 0,languages,num_langs,reference,G_mean_pr,category_architecture,category_biology,category_books,category_comics and anime,category_fashion,category_films,category_food,category_geography,category_gestures and habits,category_history,category_literature,category_media,category_music,category_performing arts,category_philosophy and religion,category_politics,category_sports,category_transportation,category_visual arts,subcategory_acting style,subcategory_actor,subcategory_animal,subcategory_animated film,subcategory_animation studio,subcategory_animation technique,subcategory_architect,subcategory_architectural structure,subcategory_architectural style,subcategory_archive,subcategory_art gallery,subcategory_art movement,subcategory_artist,subcategory_athlete,subcategory_automobile manufacturer,subcategory_biologist,subcategory_body language,...,subcategory_philosophical movement,subcategory_philosophy,subcategory_photographer,subcategory_plant,subcategory_poet,subcategory_poetry,subcategory_policy,subcategory_political party,subcategory_politician,subcategory_production company,subcategory_publisher,subcategory_record label,subcategory_recurring sporting event,subcategory_religion,subcategory_religious book,subcategory_religious leader,subcategory_religious movement,subcategory_ritual,subcategory_river,subcategory_sport,subcategory_sports club,subcategory_sports equipment,subcategory_sports team,subcategory_station,subcategory_streaming service,subcategory_television,subcategory_textile,subcategory_theatrical director,subcategory_theatrical genre,subcategory_tradition,subcategory_traditional costume,subcategory_transport,subcategory_transport company,subcategory_tree,subcategory_visual arts,subcategory_writer,subcategory_writing style,type_concept,type_entity,label
0,2,6,5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,8,30,40,0.004405,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2
2,3,4,1,0.244156,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2


## Network

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

### Agnostic-Representative classifier

In [None]:
# prepare the dataset in order to take only two classes and eliminates the labels of the elements
d = train.query("label == 0 or label == 1")
y = d['label'].astype(int).to_numpy()
x = d.drop(['label'], axis=1).astype(float).to_numpy()

In [None]:
ar_tree = MLPClassifier().fit(x, y)



In [None]:
d = validation.query("label == 0 or label == 1")
y = d['label'].astype(int).to_numpy()
x = d.drop(['label'], axis=1).astype(float).to_numpy()

In [None]:
y_pred = ar_tree.predict(x)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90       117
           1       0.83      0.88      0.85        76

    accuracy                           0.88       193
   macro avg       0.87      0.88      0.88       193
weighted avg       0.88      0.88      0.88       193



## Agnostic-Exclusive Classifier

In [None]:
# prepare the dataset in order to take only two classes and eliminates the labels of the elements
d = train.query("label == 0 or label == 2")
y = d['label'].astype(int).to_numpy()
x = d.drop(['label'], axis=1).astype(float).to_numpy()

In [None]:
ae_tree = MLPClassifier().fit(x, y)



In [None]:
d = validation.query("label == 0 or label == 2")
y = d['label'].astype(int).to_numpy()
x = d.drop(['label'], axis=1).astype(float).to_numpy()

In [None]:
y_pred = ae_tree.predict(x)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.82      0.79       117
           2       0.79      0.73      0.76       107

    accuracy                           0.78       224
   macro avg       0.78      0.77      0.78       224
weighted avg       0.78      0.78      0.78       224



## Representative-Exclusive

In [None]:
# prepare the dataset in order to take only two classes and eliminates the labels of the elements
d = train.query("label == 1 or label == 2")
y = d['label'].astype(int).to_numpy()
x = d.drop(['label'], axis=1).astype(float).to_numpy()

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
re_tree = RandomForestClassifier().fit(x, y)

In [None]:
d = validation.query("label == 1 or label == 2")
y = d['label'].astype(int).to_numpy()
x = d.drop(['label'], axis=1).astype(float).to_numpy()

In [None]:
y_pred = re_tree.predict(x)
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           1       0.69      0.80      0.74        76
           2       0.84      0.75      0.79       107

    accuracy                           0.77       183
   macro avg       0.77      0.78      0.77       183
weighted avg       0.78      0.77      0.77       183



# Voting Schema


In [None]:
from scipy.stats import mode
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.ensemble import IsolationForest

# Cultural-classification Network
class CABNet(BaseEstimator, ClassifierMixin):
    
    def __init__(self, ar_estimator, ae_estimator, re_estimator) -> None:
        self.ar = ar_estimator 
        self.ae = ae_estimator 
        self.re = re_estimator
        self.detector = IsolationForest()
        
    
    def __fit(self, e, X, y) -> BaseEstimator:
        return e.fit(X, y)


    def fit(self, X, y=None):

        outliers = self.detector.fit_predict(X)
        print(outliers)

        d = X.query("label == 0 or label == 1 and @ouliers == 0")
        y = d['label'].astype(int).to_numpy()
        x = d.drop(['label'], axis=1).astype(float).to_numpy()
        self.ar = self.__fit(self.ar, x, y)
        d = X.query("label == 0 or label == 2")
        y = d['label'].astype(int).to_numpy()
        x = d.drop(['label'], axis=1).astype(float).to_numpy()
        self.ae = self.__fit(self.ae, x, y)
        d = X.query("label == 1 or label == 2")
        y = d['label'].astype(int).to_numpy()
        x = d.drop(['label'], axis=1).astype(float).to_numpy()
        self.re = self.__fit(self.re, x, y)
        return self
        
    def predict(self, X):
        v1 = self.ar.predict(X)
        v2 = self.ae.predict(X)
        v3 = self.re.predict(X)
     
  
        votes = np.vstack([v1, v2, v3])
        majority, _ = mode(votes, axis=0)
        return majority.ravel()

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, IsolationForest
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
model =  CABNet(RandomForestClassifier(n_estimators=100), RandomForestClassifier(n_estimators=550), RandomForestClassifier(n_estimators=100))

In [None]:
model = model.fit(train)

ValueError: all input arrays must have the same shape

In [None]:
d = validation
y = d['label'].astype(int).to_numpy()
x = d.drop(['label'], axis=1).astype(float).to_numpy()

In [None]:
y_pred = model.predict(x)
print(classification_report(y, y_pred))

[[ 1.          1.          0.         ...  1.          1.
   0.        ]
 [ 2.          2.          2.         ...  0.          2.
   2.        ]
 [ 1.          1.          2.         ...  1.          2.
   1.        ]
 [-0.34112483 -0.34260664 -0.36596736 ... -0.32239063 -0.33204853
  -0.32330366]
 [-0.36214006 -0.33588173 -0.34261531 ... -0.33643126 -0.32600381
  -0.33444503]
 [-0.36063496 -0.31978196 -0.32804783 ... -0.34991134 -0.33613358
  -0.32591378]]


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets