# Proyecto CSI: predecir la palabra root de una oración
Autores: Oriol Catasús Llena, Pablo Arancibia Barahona

Fecha: 14 de enero de 2024

In [1]:
# Imports
import csv
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    confusion_matrix,
)
from sklearn.exceptions import UndefinedMetricWarning

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [2]:
# Input treebank file: parsed by get_data and checked/processed by LAL in process_treebank.
PATH_PUD = './pud26/English-all.heads'
# File written by lal.io.process_treebank and read back by get_metrics_from_lal.
# Despite the .csv extension it is read with a tab delimiter.
LAL_FILE_PATH = './output_file.csv'

In [3]:
def get_data(path):
    """Read a PUD treebank file into a list of integer lists.

    Every non-blank line of the file is parsed as one sentence: a
    whitespace-separated sequence of integers.

    Args:
        path: Path of the text file to read.

    Returns:
        A list with one list of ints per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a token cannot be parsed as an integer.
    """
    data = []
    # The context manager closes the handle even if parsing fails midway.
    with open(path, "r") as f:
        for line in f:
            # split() with no argument tolerates tabs, repeated spaces and
            # trailing whitespace, and yields [] for blank lines.
            tokens = line.split()
            if tokens:
                data.append([int(token) for token in tokens])

    return data

# Librería LAL
Primero comprobamos que el conjunto de árboles es correcto. Luego procesamos el conjunto de árboles y generamos un fichero CSV con sus métricas.

In [4]:
import lal

def process_treebank(path, lal_file_path):
    """Check a treebank with LAL, then process it into an output file.

    Every item reported by ``lal.io.check_correctness_treebank`` is printed.
    Afterwards ``lal.io.process_treebank`` is called and its return value is
    printed when it is truthy.

    Args:
        path: Path of the treebank file handed to LAL.
        lal_file_path: Output path handed to ``lal.io.process_treebank``.
    """
    for problem in lal.io.check_correctness_treebank(path):
        print(problem)

    outcome = lal.io.process_treebank(path, lal_file_path)
    if not outcome:
        return
    print(outcome)

In [5]:
# Check the treebank and write the LAL output file that later cells load.
process_treebank(PATH_PUD, LAL_FILE_PATH)




En caso de contar con un archivo de métricas generado por LAL, estas se cargan para su uso posterior.

In [6]:
def get_metrics_from_lal(lal_file_path):
    """Load a tab-separated file with a header row into a list of rows.

    Args:
        lal_file_path: Path of the tab-delimited file to read.

    Returns:
        A list with one mapping per data row, keyed by the header names;
        all values are strings as produced by ``csv.DictReader``.
    """
    with open(lal_file_path, 'r') as handle:
        reader = csv.DictReader(handle, delimiter='\t')
        return [record for record in reader]

In [7]:
def calcular_distancias(grafo):
    """Return, for every vertex, its mean shortest-path distance to all vertices.

    ``grafo`` is a head vector: ``grafo[i]`` is the 1-based position of the
    vertex linked to vertex ``i``, or 0 when vertex ``i`` has no head. Links
    are treated as undirected edges of weight 1.

    A breadth-first search is run from each vertex (O(n^2) overall for a tree)
    instead of an O(n^3) all-pairs relaxation. Unreachable vertices keep an
    infinite distance, so the mean of a vertex in a disconnected input is
    ``inf``. The input is assumed to be a valid head vector (no vertex is its
    own head).

    Args:
        grafo: List of ints, one head position per vertex.

    Returns:
        A list of ``len(grafo)`` numbers: the sum of distances from each
        vertex (its own distance 0 included) divided by the vertex count.

    Raises:
        IndexError: If a head position is larger than ``len(grafo)``.
    """
    n = len(grafo)
    sin_alcanzar = float('inf')

    # Undirected adjacency lists built from the head vector.
    vecinos = [[] for _ in range(n)]
    for i, cabeza in enumerate(grafo):
        if cabeza != 0:
            vecinos[i].append(cabeza - 1)
            vecinos[cabeza - 1].append(i)

    promedios = []
    for origen in range(n):
        distancias = [sin_alcanzar] * n
        distancias[origen] = 0

        # Level-by-level BFS: every vertex found at step k is at distance k.
        frontera = [origen]
        nivel = 0
        while frontera:
            nivel += 1
            siguiente = []
            for u in frontera:
                for v in vecinos[u]:
                    if distancias[v] == sin_alcanzar:
                        distancias[v] = nivel
                        siguiente.append(v)
            frontera = siguiente

        promedios.append(sum(distancias) / n)

    return promedios

In [8]:
def create_matrix(data, metrics):
    """Build the per-token feature table from head vectors and LAL metrics.

    One row is produced per token of every sentence, with these columns:

    * ``vertex_degree``: how many tokens of the sentence have this token's
      1-based position as head (the token's own head link is not counted).
    * ``vertex_distance``: value returned by ``calcular_distancias`` for the
      token.
    * ``centre`` / ``centroid``: 1 when the token's 0-based position equals
      ``tree_centre1``/``tree_centre2`` (resp. ``tree_centroid1``/
      ``tree_centroid2``) of the sentence's metrics row, else 0.
      NOTE(review): this assumes the LAL columns are 0-based vertex indices;
      confirm against the LAL output format.
    * ``class``: 1 when the token's head is 0, else 0.

    Args:
        data: Sequence of sentences, each a list of int heads.
        metrics: Sequence of mappings, one per sentence, holding the
            ``tree_centre*`` and ``tree_centroid*`` keys.

    Returns:
        A ``pandas.DataFrame`` with the five columns above.

    Raises:
        ValueError: If ``data`` and ``metrics`` have different lengths, or a
            centre/centroid value is not an integer string.
    """
    data = list(data)
    metrics = list(metrics)
    # zip() would silently drop the tail of the longer input and hide a
    # sentence/metrics misalignment, so fail loudly instead.
    if len(data) != len(metrics):
        raise ValueError(
            f'data has {len(data)} sentences but metrics has {len(metrics)} rows'
        )

    dataframe_dict = {
        'vertex_degree': [],
        'vertex_distance': [],
        'centre': [],
        'centroid': [],
        'class': [],
    }

    for sentence, m in zip(data, metrics):
        n = len(sentence)

        # Parse the per-sentence metrics once instead of once per token.
        centres = {int(m['tree_centre1']), int(m['tree_centre2'])}
        centroids = {int(m['tree_centroid1']), int(m['tree_centroid2'])}

        # Single pass over the heads instead of one list.count() per token.
        dependents = [0] * n
        for head in sentence:
            if 1 <= head <= n:
                dependents[head - 1] += 1

        dataframe_dict['class'] += [1 if head == 0 else 0 for head in sentence]
        dataframe_dict['centre'] += [1 if i in centres else 0 for i in range(n)]
        dataframe_dict['centroid'] += [1 if i in centroids else 0 for i in range(n)]
        dataframe_dict['vertex_degree'] += dependents
        dataframe_dict['vertex_distance'] += calcular_distancias(sentence)

    return pd.DataFrame(dataframe_dict)


In [9]:
def normalize_data(dataframe):
    """Scale every column with ``MinMaxScaler`` and restore integer flags.

    The scaler is fitted on the whole frame (all columns, including
    ``class``). The result is rebuilt as a DataFrame with the original column
    names and a fresh default index, and the ``centre``, ``centroid`` and
    ``class`` columns are cast back to ``int``.

    Args:
        dataframe: Numeric DataFrame containing at least the ``centre``,
            ``centroid`` and ``class`` columns.

    Returns:
        A new DataFrame with the scaled values.
    """
    scaled_values = MinMaxScaler().fit_transform(dataframe)
    result = pd.DataFrame(scaled_values, columns=dataframe.columns)

    for flag_column in ('centre', 'centroid', 'class'):
        result[flag_column] = result[flag_column].astype(int)

    return result

# Creación de la matriz
Haciendo uso de las funciones anteriores se crea una matriz normalizada.

In [10]:
# Load the sentences and the LAL metrics, build the per-token feature table,
# then min-max scale it.
data = get_data(PATH_PUD)
metrics = get_metrics_from_lal(LAL_FILE_PATH)
dataframe = create_matrix(data, metrics)  # one row per token of every sentence
dataframe_normalized = normalize_data(dataframe)

In [11]:
# Data preview: the raw head list of the second sentence, then the first
# 30 rows of the scaled feature table.
print(data[1])
dataframe_normalized.head(30)

[2, 16, 4, 2, 6, 7, 4, 10, 10, 7, 16, 16, 16, 15, 16, 0]


Unnamed: 0,vertex_degree,vertex_distance,centre,centroid,class
0,0.0,0.375931,0,0,0
1,0.111111,0.339913,0,0,0
2,0.0,0.565022,0,0,0
3,0.0,0.565022,0,0,0
4,0.0,0.565022,0,0,0
5,0.333333,0.438961,0,0,0
6,0.0,0.375931,0,0,0
7,0.444444,0.24987,1,0,0
8,0.0,0.474978,0,0,0
9,0.0,0.474978,0,0,0


# Balanceo de clases de la matriz

In [12]:
# NOTE(review): no resampling happens here. This only binds a second name to
# the same DataFrame object, so anything run on the "balanced" frame uses
# exactly the same data as the unbalanced one.
dataframe_normalized_balanced = dataframe_normalized

# Creación de pipeline de entrenamiento
Se crea una clase de evaluación de múltiples modelos para una mejor escalabilidad.

In [13]:
def division(numerador, denominador, valor_predeterminado=0):
    """Divide two numbers, returning a fallback when the denominator is zero.

    Args:
        numerador: Dividend.
        denominador: Divisor.
        valor_predeterminado: Value returned when ``denominador == 0``.

    Returns:
        ``numerador / denominador``, or ``valor_predeterminado`` when the
        divisor is zero.
    """
    resultado = valor_predeterminado if denominador == 0 else numerador / denominador
    return resultado

class EvaluateModel:
    """Evaluate several classifiers on repeated random train/test splits.

    ``models`` maps a display name to an estimator exposing ``fit`` and
    ``predict``. For each model, ``evaluate_models_with_dataframe`` prints the
    accuracy, precision, recall and F-score averaged over the iterations.
    The metrics assume a binary target (a 2x2 confusion matrix).
    """

    def __init__(self, models, verbose = 0):
        """Store the models and the verbosity level.

        Args:
            models: Mapping of display name to estimator.
            verbose: When equal to 1, per-iteration metrics are printed.
        """
        self._models = list(models.values())
        self._model_names = list(models.keys())
        self._verbose = verbose

    def _fit_cross_validation(self, model, X, y, num_folds=5):
        """Cross-validate ``model`` on (X, y), then fit it on all of (X, y).

        Returns:
            A tuple ``(fitted_model, cv_scores)`` where ``cv_scores`` is the
            dictionary returned by ``cross_validate``.
        """
        cv_scores = cross_validate(
            model,
            X,
            y,
            cv=num_folds,
            scoring=('accuracy', 'precision', 'recall', 'f1'),
        )
        model_fit = model.fit(X, y)

        return model_fit, cv_scores

    def _evaluate_with_dataframe(self, model, dataframe, target_column = 'class', iterations = 10):
        """Average hold-out metrics of ``model`` over several random splits.

        Each iteration draws a new ``train_test_split`` (no seed is passed, so
        results differ between runs), fits the model on the training part and
        scores its predictions on the test part.

        Args:
            model: Estimator to evaluate.
            dataframe: Feature table that also contains ``target_column``.
            target_column: Name of the label column.
            iterations: Number of random splits to average over.

        Returns:
            A tuple ``(mean_accuracy, mean_precision, mean_recall, mean_fscore)``.
        """
        features = dataframe.loc[:, dataframe.columns != target_column]
        target = dataframe[target_column]
        # Fix the label set from the whole target column so the confusion
        # matrix keeps the same shape even when a split happens to contain a
        # single class; otherwise the four-way unpack in get_metrics fails.
        labels = np.unique(target.values.astype(int))

        accuracies = []
        precisions = []
        recalls = []
        fscores = []

        for x in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(features, target)
            y_test = y_test.values.astype(int)

            # The cross-validation scores are computed but not used here.
            model_fit, _cv_scores = self._fit_cross_validation(model, X_train, y_train)
            predictions = model_fit.predict(X_test)

            cm = confusion_matrix(y_test, predictions, labels=labels)
            accuracy, precision, recall, fscore = self.get_metrics(cm)

            if self._verbose == 1:
                print(f'Metrics Iteration {x}')
                print(cm)
                print(f"Accuracy = {accuracy}; Precision = {precision}; Recall = {recall}; fscore = {fscore}\n")

            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            fscores.append(fscore)

        mean_accuracy = np.mean(accuracies)
        mean_precision = np.mean(precisions)
        mean_recall = np.mean(recalls)
        mean_fscore = np.mean(fscores)

        print('Average')
        print(f"Accuracy = {mean_accuracy}; Precision = {mean_precision}; Recall = {mean_recall}; fscore = {mean_fscore}\n")

        return mean_accuracy, mean_precision, mean_recall, mean_fscore

    def _accuracy_custom(self, TN, FP, FN, TP):
        """Return (TP + TN) / total, or 0 when the total is 0."""
        return division((TP + TN),(TP + TN + FP + FN))

    def _recall_custom(self, TN, FP, FN, TP):
        """Return TP / (TP + FN), or 0 when there are no positive samples."""
        return division((TP),(TP + FN))

    def _precision_custom(self, TN, FP, FN, TP):
        """Return TP / (TP + FP), or 0 when nothing was predicted positive."""
        return division((TP),(TP + FP))

    def _fscore_custom(self, recall_metric, precision_metric):
        """Return the harmonic mean of recall and precision.

        The score is 0 when either input is 0. Substituting 0 for an undefined
        reciprocal would instead yield a non-zero score when only one of the
        two is 0.
        """
        if recall_metric == 0 or precision_metric == 0:
            return 0
        return 2 / (1 / recall_metric + 1 / precision_metric)

    def get_metrics(self, confusion_matrix):
        """Compute accuracy, precision, recall and F-score from a 2x2 matrix.

        Args:
            confusion_matrix: Object whose ``ravel()`` yields exactly
                ``TN, FP, FN, TP`` in that order. The parameter name shadows
                the imported ``confusion_matrix`` function inside this method
                only.

        Returns:
            A tuple ``(accuracy, precision, recall, fscore)``.
        """
        TN, FP, FN, TP = confusion_matrix.ravel()

        accuracy_metric = self._accuracy_custom(TN, FP, FN, TP)
        precision_metric = self._precision_custom(TN, FP, FN, TP)
        recall_metric = self._recall_custom(TN, FP, FN, TP)
        fscore_metric = self._fscore_custom(recall_metric, precision_metric)

        return accuracy_metric, precision_metric, recall_metric, fscore_metric

    def evaluate_models_with_dataframe(self, dataframe, target_column = 'class', iterations = 10):
        """Evaluate every stored model on ``dataframe`` and print its averages.

        Args:
            dataframe: Feature table that also contains ``target_column``.
            target_column: Name of the label column.
            iterations: Number of random splits per model.
        """
        for i, model in enumerate(self._models):
            print(f'######### {self._model_names[i]} #########')
            self._evaluate_with_dataframe(model, dataframe, target_column, iterations)


# Evaluación de los modelos con datos NO balanceados

In [14]:
# Candidate models, keyed by the display name printed in the evaluation report.
# NOTE(review): the saved output of this cell shows
# "NameError: name 'DummyClassifier' is not defined" although DummyClassifier
# is imported in the first code cell — the output looks stale; re-run the
# notebook from a fresh kernel.
models = {
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the installed version.
    'LOGISTIC REGRESION': LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),
    # Baselines for comparison with the logistic regression.
    'MOST FREQUEST CLASS': DummyClassifier(strategy='most_frequent'),
    'RANDOM CLASS': DummyClassifier(strategy='uniform'),
}

evaluator = EvaluateModel(models, verbose = 0)
evaluator.evaluate_models_with_dataframe(dataframe_normalized)

NameError: name 'DummyClassifier' is not defined

# Evaluación de los modelos con datos balanceados

In [None]:
# NOTE(review): dataframe_normalized_balanced is assigned directly from
# dataframe_normalized earlier in the notebook, so this evaluates the same
# data as the unbalanced run (only the random splits differ).
evaluator.evaluate_models_with_dataframe(dataframe_normalized_balanced)

# Conclusiones generales preliminares
1. bla
2. bla bla