# Proyecto CSI: predecir la palabra root de una oración
Autores: Oriol Catasús Llena, Pablo Arancibia Barahona

Fecha 14 de enero de 2024

In [273]:
# Imports
import csv
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    confusion_matrix,
)
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [255]:
PATH_PUD = './pud26/English-all.heads'
LAL_FILE_PATH = './output_file.csv'

In [256]:
# Funcion para obtener los datos de archivos PUD
def get_data(path):
    f = open(path, "r")
    data = []
    
    for x in f:
        data += [[int(y) for y in x.split(' ')]]

    return data

# Libreria LAL
Primero miramos si el conjunto de arboles es correcto. Luego procesamos el conjunto de arboles y generamos un fichero CSV con sus métricas

In [4]:
import lal

def process_treebank(path, lal_file_path):
    errlist = lal.io.check_correctness_treebank(path)
    for err in errlist:
        print(err)
    err = lal.io.process_treebank(path, lal_file_path)
    if err:
        print(err)

In [5]:
process_treebank(PATH_PUD, LAL_FILE_PATH)




En caso de contar con un archivo de metricas producto de LAL, estas se cargan para su posterior uso

In [257]:
def get_metrics_from_lal(lal_file_path):
    metrics = []
    with open(lal_file_path, 'r') as file:
        csv_reader = csv.DictReader(file, delimiter='\t')
        for row in csv_reader:
            metrics.append(row)
    return metrics

In [347]:
def calcular_grados_hermanos_padre(grafo, vertex):
    padres = set(grafo)
    avg_vertex_sibling = [0] * len(grafo)
    avg_vertex_children = [0] * len(grafo)

    for padre in padres:
        index_hijos = []
        for i in range(len(grafo)):
            if grafo[i] == padre:
                index_hijos.append(i)
 
        vertex_degree = [vertex[hijo] for hijo in index_hijos]

        for hijo in index_hijos:
            avg_vertex_sibling[hijo] = np.mean(vertex_degree)

        if padre != 0:
            avg_vertex_children[padre - 1] = np.mean(vertex_degree)

    return avg_vertex_sibling, avg_vertex_children

# [3, 0, 2] => [0, 1, 1] => [0, 1, 1] => [0, 1, 0]
# [2, 0, 4, 2, 6, 2] => [0, 3, 0, 1, 0, 1] => [0.6, 3, 0, 0.6, 0, 0.6] => [0, 0.6, 0, 0, 0, 0]

In [324]:
def calcular_distancias(grafo):
    n = len(grafo)
    distancias = [[float('inf')] * n for _ in range(n)]

    # Inicializar las distancias conocidas
    for i in range(n):
        distancias[i][i] = 0
        vecino = grafo[i]
        if vecino != 0:
            distancias[i][vecino - 1] = 1  # Peso siempre es 1
            distancias[vecino - 1][i] = 1  # Asegurar bidireccionalidad

    # Calcular las distancias mínimas
    for k in range(n):
        for i in range(n):
            for j in range(n):
                distancias[i][j] = min(distancias[i][j], distancias[i][k] + distancias[k][j])

    return [sum(d)/n for d in distancias], [np.std(d) for d in distancias]

In [350]:
def create_matrix(data, metrics):
    dataframe_dict = {
        'vertex_degree': [],
        'vertex_distance': [],
        'vertex_distance_std': [],
        'sibling_average_degree': [],
        'children_average_degree': [],
        'centre': [],
        'centroid': [],
        'class': [],
    }
    
    for sentence, m in zip(data, metrics):
        dataframe_dict['class'] += [1 if word == 0 else 0 for word in sentence]
        dataframe_dict['centre'] += [1 if int(m['tree_centre1']) == i or int(m['tree_centre2']) == i else 0 for i in range(len(sentence))]
        dataframe_dict['centroid'] += [1 if int(m['tree_centroid1']) == i or int(m['tree_centroid2']) == i else 0 for i in range(len(sentence))]
        
        vertex_degree = [sentence.count(index + 1) for index in range(len(sentence))]
        sibling_degree, children_degree = calcular_grados_hermanos_padre(sentence, vertex_degree)
        dataframe_dict['vertex_degree'] += vertex_degree
        dataframe_dict['sibling_average_degree'] += sibling_degree
        dataframe_dict['children_average_degree'] += children_degree

        average, standar_desviation = calcular_distancias(sentence)
        dataframe_dict['vertex_distance'] += average
        dataframe_dict['vertex_distance_std'] += standar_desviation

    return pd.DataFrame(dataframe_dict)


In [334]:
def normalize_data(dataframe):
    scaler = MinMaxScaler()
    dataframe_normalized = scaler.fit_transform(dataframe)
    dataframe_normalized = pd.DataFrame(dataframe_normalized, columns=dataframe.columns)
    dataframe_normalized['centre'] = dataframe_normalized['centre'].astype(int)
    dataframe_normalized['centroid'] = dataframe_normalized['centroid'].astype(int)
    #dataframe_normalized['class'] = dataframe_normalized['class'].astype(int)

    return dataframe_normalized

# Creación de la matriz
Haciendo uso de las funciones anteriores se crea un matriz normalizada

In [357]:
data = get_data(PATH_PUD)
metrics = get_metrics_from_lal(LAL_FILE_PATH)
dataframe = create_matrix(data, metrics)
dataframe_normalized = dataframe
print(data[1])
dataframe.head(30)

[2, 16, 4, 2, 6, 7, 4, 10, 10, 7, 16, 16, 16, 15, 16, 0]


Unnamed: 0,vertex_degree,vertex_distance,vertex_distance_std,sibling_average_degree,children_average_degree,centre,centroid,class
0,0,3.533333,1.284091,1.0,0.0,0,0,0
1,1,3.266667,1.412641,1.0,3.0,0,0,0
2,0,4.933333,1.948219,0.0,0.0,0,0,0
3,0,4.933333,1.948219,0.0,0.0,0,0,0
4,0,4.933333,1.948219,0.0,0.0,0,0,0
5,3,4.0,1.807392,3.0,0.0,0,0,0
6,0,3.533333,1.284091,1.0,0.0,0,0,0
7,4,2.6,1.143095,2.5,1.0,1,0,0
8,0,4.266667,1.631632,0.0,0.0,0,0,0
9,0,4.266667,1.631632,0.0,0.0,0,0,0


# Balanceo de clases de la matriz

In [358]:
def balancear_clase_oversampling(dataframe, clase_objetivo, target_column='class', random_state=None):
    X = dataframe.loc[:, dataframe.columns != target_column]
    y = dataframe[target_column]
    # Encuentra los índices de la clase objetivo
    indices_clase_objetivo = np.where(y == clase_objetivo)[0]

    # Crea el oversampler
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=random_state)

    # Aplica oversampling solo a la clase objetivo
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # Convierte los resultados a DataFrames si X y y son de tipo DataFrame
    if isinstance(X, pd.DataFrame):
        X_balanced = pd.DataFrame(X_resampled, columns=X.columns)
    else:
        X_balanced = np.array(X_resampled)

    if isinstance(y, pd.Series):
        y_balanced = pd.Series(y_resampled, name=y.name)
    else:
        y_balanced = np.array(y_resampled)

    X_balanced[target_column] = y_balanced
    return X_balanced


In [359]:
dataframe_normalized_balanced = balancear_clase_oversampling(dataframe_normalized, 1)
dataframe_normalized_balanced.head(30)

Unnamed: 0,vertex_degree,vertex_distance,vertex_distance_std,sibling_average_degree,children_average_degree,centre,centroid,class
0,0,3.533333,1.284091,1.0,0.0,0,0,0
1,1,3.266667,1.412641,1.0,3.0,0,0,0
2,0,4.933333,1.948219,0.0,0.0,0,0,0
3,0,4.933333,1.948219,0.0,0.0,0,0,0
4,0,4.933333,1.948219,0.0,0.0,0,0,0
5,3,4.0,1.807392,3.0,0.0,0,0,0
6,0,3.533333,1.284091,1.0,0.0,0,0,0
7,4,2.6,1.143095,2.5,1.0,1,0,0
8,0,4.266667,1.631632,0.0,0.0,0,0,0
9,0,4.266667,1.631632,0.0,0.0,0,0,0


In [360]:
print(dataframe_normalized['class'].value_counts())
print(dataframe_normalized_balanced['class'].value_counts())

class
0    17711
1      995
Name: count, dtype: int64
class
0    17711
1    17711
Name: count, dtype: int64


# Creación de pipeline de entrenamiento
Se crea una clase de evaluación de multiples modelos para una mejor escalabilidad

In [361]:
def division(numerador, denominador, valor_predeterminado=0):
    resultado = valor_predeterminado if denominador == 0 else numerador / denominador
    return resultado

class EvaluateModel:

    def __init__(self, models, verbose = 0):
        self._models = list(models.values())
        self._model_names = list(models.keys())
        self._verbose = verbose

    def _fit_cross_validation(self, model, X, y, num_folds=5):
        cv_scores = cross_validate(
            model,
            X,
            y,
            cv=num_folds,
            scoring=('accuracy', 'precision', 'recall', 'f1'),
        )
        model_fit = model.fit(X, y)

        return model_fit, cv_scores

    def _evaluate_with_dataframe(self, model, dataframe, target_column = 'class', iterations = 10):
        accuracies = []
        precisions = []
        recalls = []
        fscores = []

        for x in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(
                dataframe.loc[:, dataframe.columns != target_column],
                dataframe[target_column]
            )
            X_train = normalize_data(X_train)
            X_test = normalize_data(X_test)
            y_test = y_test.values.astype(int)

            model_fit, cv_scores = self._fit_cross_validation(model, X_train, y_train)
            predictions = model_fit.predict(X_test)

            cm = confusion_matrix(y_test, predictions)
            accuracy, precision, recall, fscore = self.get_metrics(cm)
    
            if self._verbose == 1:
                print(f'Metrics Iteration {x}')
                print(confusion_matrix(y_test, predictions))
                print(f"Accuracy = {accuracy}; Precision = {precision}; Recall = {recall}; fscore = {fscore}\n")
    
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            fscores.append(fscore)

        mean_accuracy = np.mean(accuracies)
        mean_precision = np.mean(precisions)
        mean_recall = np.mean(recalls)
        mean_fscore = np.mean(fscores)
        
        print('Average')
        print(f"Accuracy = {mean_accuracy}; Precision = {mean_precision}; Recall = {mean_recall}; fscore = {mean_fscore}\n")

        return mean_accuracy, mean_precision, mean_recall, mean_fscore

    def _accuracy_custom(self, TN, FP, FN, TP):
        return division((TP + TN),(TP + TN + FP + FN))
    
    def _recall_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FN))
    
    def _precision_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FP))
    
    def _fscore_custom(self, recall_metric, precision_metric):
        return division(2, division(1, recall_metric) + (division(1, precision_metric)))

    def get_metrics(self, confusion_matrix):
        TN, FP, FN, TP = confusion_matrix.ravel()
        
        accuracy_metric = self._accuracy_custom(TN, FP, FN, TP)
        precision_metric = self._precision_custom(TN, FP, FN, TP)
        recall_metric = self._recall_custom(TN, FP, FN, TP)
        fscore_metric = self._fscore_custom(recall_metric, precision_metric)
        
        return accuracy_metric, precision_metric, recall_metric, fscore_metric
        
    def evaluate_models_with_dataframe(self, dataframe, target_column = 'class', iterations = 10):
        for i, model in enumerate(self._models):
            print(f'######### {self._model_names[i]} #########')
            self._evaluate_with_dataframe(model, dataframe, target_column, iterations)


# Evaluación de los modelos con datos NO balanceados

In [362]:
models = {
    'LOGISTIC REGRESION': LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),
    'DECISION TREE': DecisionTreeClassifier(random_state=42),
    'KNN 3': KNeighborsClassifier(n_neighbors=3),
    'KNN 5': KNeighborsClassifier(n_neighbors=5),
    'KNN 7': KNeighborsClassifier(n_neighbors=7),
    'KNN 9': KNeighborsClassifier(n_neighbors=9),
    'MOST FREQUEST CLASS': DummyClassifier(strategy='most_frequent'),
    'RANDOM CLASS': DummyClassifier(strategy='uniform'),
}

evaluator = EvaluateModel(models, verbose = 0)
evaluator.evaluate_models_with_dataframe(dataframe_normalized)

######### LOGISTIC REGRESION #########
Average
Accuracy = 0.9823818687192645; Precision = 0.8347512383869761; Recall = 0.8510055048632909; fscore = 0.8372925955006794

######### DECISION TREE #########
Average
Accuracy = 0.9739790463972632; Precision = 0.8626662924096502; Recall = 0.608965813046195; fscore = 0.6893857025474074

######### KNN 3 #########
Average
Accuracy = 0.9849048535385933; Precision = 0.8247106671428714; Recall = 0.9048598971001779; fscore = 0.8617290515598892

######### KNN 5 #########
Average
Accuracy = 0.9845841351293563; Precision = 0.8195938285121407; Recall = 0.92518732833614; fscore = 0.8658076455883972

######### KNN 7 #########
Average
Accuracy = 0.9850331409022877; Precision = 0.8241432044823955; Recall = 0.9163683913538512; fscore = 0.8657848914492426

######### KNN 9 #########
Average
Accuracy = 0.9844986102202267; Precision = 0.821477831869965; Recall = 0.911777570550816; fscore = 0.862805141319031

######### MOST FREQUEST CLASS #########
Average
Accurac

# Evaluación de los modelos con datos balanceados

In [363]:
evaluator.evaluate_models_with_dataframe(dataframe_normalized_balanced)

######### LOGISTIC REGRESION #########
Average
Accuracy = 0.9729110207768745; Precision = 0.9630820240241125; Recall = 0.9836456375593958; fscore = 0.9732160730781866

######### DECISION TREE #########
Average
Accuracy = 0.9698622402890695; Precision = 0.9909130045857802; Recall = 0.9486172933228412; fscore = 0.9692350317391446

######### KNN 3 #########
Average
Accuracy = 0.9900406504065039; Precision = 0.9847083017885065; Recall = 0.9954576360820255; fscore = 0.9900513744233068

######### KNN 5 #########
Average
Accuracy = 0.9907746160794939; Precision = 0.9824487410626359; Recall = 0.9994353660826445; fscore = 0.9908673543759343

######### KNN 7 #########
Average
Accuracy = 0.9900745257452576; Precision = 0.9812749129939693; Recall = 0.9992011560937903; fscore = 0.990154478045428

######### KNN 9 #########
Average
Accuracy = 0.9874548328816621; Precision = 0.9758858042334939; Recall = 0.9996187485983405; fscore = 0.9876071214725959

######### MOST FREQUEST CLASS #########
Average
Ac

# Conclusiones generales preliminares
1. Modelo con mejor performance KNN con K = 5 dado que accuracy es 0.99 y fscore es 0.9908
2. Los features asociados a degree de los hermanos y padres hicieron que los modelos mejoraran bastantes