# Proyecto CSI: predecir la palabra root de una oración
Autores: Oriol Catasús Llena, Pablo Arancibia Barahona

Fecha 14 de enero de 2024

In [30]:
# Imports
import os
import csv
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    confusion_matrix,
)
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [2]:
PATH_PUD = './pud26/'
PATH_METRICS_LAL = './metrics/'

In [3]:
# Funcion para obtener los datos de archivos PUD
def get_data(path):
    f = open(path, "r")
    data = []
    
    for x in f:
        data += [[int(y) for y in x.split(' ')]]

    return data

# Libreria LAL
Primero miramos si el conjunto de arboles es correcto. Luego procesamos el conjunto de arboles y generamos un fichero CSV con sus métricas

In [4]:
import lal

def process_treebank(path, lal_file_path):
    errlist = lal.io.check_correctness_treebank(path)
    for err in errlist:
        print(err)
    lal.io.process_treebank(path, lal_file_path)

In [5]:
files = os.listdir(PATH_PUD)
for file_name in files:
    file_path = os.path.join(PATH_PUD, file_name)
    lang = file_name.split('.')[0]
    metrics_file_name = lang + '.csv'
    if os.path.isfile(file_path):
        metrics_file_path = os.path.join(PATH_METRICS_LAL, metrics_file_name)
        process_treebank(file_path, metrics_file_path)

En caso de contar con un archivo de metricas producto de LAL, estas se cargan para su posterior uso

In [6]:
def get_metrics_from_lal(lal_file_path):
    metrics = []
    with open(lal_file_path, 'r') as file:
        csv_reader = csv.DictReader(file, delimiter='\t')
        for row in csv_reader:
            metrics.append(row)
    return metrics

In [7]:
def calcular_grados_hermanos_padre(grafo, vertex):
    padres = set(grafo)
    avg_vertex_sibling = [0] * len(grafo)
    avg_vertex_children = [0] * len(grafo)

    for padre in padres:
        index_hijos = []
        for i in range(len(grafo)):
            if grafo[i] == padre:
                index_hijos.append(i)
 
        vertex_degree = [vertex[hijo] for hijo in index_hijos]

        for hijo in index_hijos:
            avg_vertex_sibling[hijo] = np.mean(vertex_degree)

        if padre != 0:
            avg_vertex_children[padre - 1] = np.mean(vertex_degree)

    return avg_vertex_sibling, avg_vertex_children

# [3, 0, 2] => [0, 1, 1] => [0, 1, 1] => [0, 1, 0]
# [2, 0, 4, 2, 6, 2] => [0, 3, 0, 1, 0, 1] => [0.6, 3, 0, 0.6, 0, 0.6] => [0, 0.6, 0, 0, 0, 0]

In [8]:
def calcular_distancias(grafo):
    n = len(grafo)
    distancias = [[float('inf')] * n for _ in range(n)]

    # Inicializar las distancias conocidas
    for i in range(n):
        distancias[i][i] = 0
        vecino = grafo[i]
        if vecino != 0:
            distancias[i][vecino - 1] = 1  # Peso siempre es 1
            distancias[vecino - 1][i] = 1  # Asegurar bidireccionalidad

    # Calcular las distancias mínimas
    for k in range(n):
        for i in range(n):
            for j in range(n):
                distancias[i][j] = min(distancias[i][j], distancias[i][k] + distancias[k][j])

    return [sum(d)/n for d in distancias], [np.std(d) for d in distancias]

In [9]:
def create_matrix(data, metrics):
    dataframe_dict = {
        'vertex_degree': [],
        'vertex_distance': [],
        'vertex_distance_std': [],
        'sibling_average_degree': [],
        'children_average_degree': [],
        'centre': [],
        'centroid': [],
        'class': [],
    }
    
    for sentence, m in zip(data, metrics):
        dataframe_dict['class'] += [1 if word == 0 else 0 for word in sentence]
        dataframe_dict['centre'] += [1 if int(m['tree_centre1']) == i or int(m['tree_centre2']) == i else 0 for i in range(len(sentence))]
        dataframe_dict['centroid'] += [1 if int(m['tree_centroid1']) == i or int(m['tree_centroid2']) == i else 0 for i in range(len(sentence))]
        
        vertex_degree = [sentence.count(index + 1) for index in range(len(sentence))]
        sibling_degree, children_degree = calcular_grados_hermanos_padre(sentence, vertex_degree)
        dataframe_dict['vertex_degree'] += vertex_degree
        dataframe_dict['sibling_average_degree'] += sibling_degree
        dataframe_dict['children_average_degree'] += children_degree

        average, standar_desviation = calcular_distancias(sentence)
        dataframe_dict['vertex_distance'] += average
        dataframe_dict['vertex_distance_std'] += standar_desviation

    return pd.DataFrame(dataframe_dict)


In [10]:
def normalize_data(dataframe):
    scaler = MinMaxScaler()
    dataframe_normalized = scaler.fit_transform(dataframe)
    dataframe_normalized = pd.DataFrame(dataframe_normalized, columns=dataframe.columns)
    dataframe_normalized['centre'] = dataframe_normalized['centre'].astype(int)
    dataframe_normalized['centroid'] = dataframe_normalized['centroid'].astype(int)
    #dataframe_normalized['class'] = dataframe_normalized['class'].astype(int)

    return dataframe_normalized

# Creación de la matriz
Haciendo uso de las funciones anteriores se crea un matriz normalizada

In [14]:
data = {}
files = os.listdir(PATH_PUD)
for file_name in files:
    file_path = os.path.join(PATH_PUD, file_name)
    if os.path.isfile(file_path):
        lang = file_name.split('.')[0]
        metrics_file_name = lang + '.csv'
        metrics_file_path = os.path.join(PATH_METRICS_LAL, metrics_file_name)
        lang_data = get_data(file_path)
        metrics = get_metrics_from_lal(metrics_file_path)
        data[lang] = create_matrix(lang_data, metrics)
        
len(data.keys())
print(data['Portuguese-all'].head())

{'Portuguese-all':        vertex_degree  vertex_distance  vertex_distance_std  \
0                  0         3.923077             1.421170   
1                  0         4.358974             1.860344   
2                  0         4.358974             1.860344   
3                  4         3.410256             1.764580   
4                  0         5.153846             2.142829   
...              ...              ...                  ...   
20798              0         3.320000             1.617900   
20799              6         2.400000             1.496663   
20800              0         4.080000             1.874460   
20801              0         4.080000             1.874460   
20802              2         3.160000             1.736203   

       sibling_average_degree  children_average_degree  centre  centroid  \
0                    1.666667                 0.000000       0         0   
1                    1.500000                 0.000000       0         0   
2       

# Balanceo de clases de la matriz

In [27]:
def balancear_clase_oversampling(dataframe, clase_objetivo, target_column='class', random_state=None):
    X = dataframe.loc[:, dataframe.columns != target_column]
    y = dataframe[target_column]
    # Encuentra los índices de la clase objetivo
    indices_clase_objetivo = np.where(y == clase_objetivo)[0]

    # Crea el oversampler
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=random_state)

    # Aplica oversampling solo a la clase objetivo
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # Convierte los resultados a DataFrames si X y y son de tipo DataFrame
    if isinstance(X, pd.DataFrame):
        X_balanced = pd.DataFrame(X_resampled, columns=X.columns)
    else:
        X_balanced = np.array(X_resampled)

    if isinstance(y, pd.Series):
        y_balanced = pd.Series(y_resampled, name=y.name)
    else:
        y_balanced = np.array(y_resampled)

    X_balanced[target_column] = y_balanced
    return X_balanced


In [28]:
dataframe_normalized_balanced = balancear_clase_oversampling(dataframe_normalized, 1)
dataframe_normalized_balanced.head(30)

NameError: name 'dataframe_normalized' is not defined

In [None]:
print(dataframe_normalized['class'].value_counts())
print(dataframe_normalized_balanced['class'].value_counts())

# Creación de pipeline de entrenamiento
Se crea una clase de evaluación de multiples modelos para una mejor escalabilidad

In [22]:
def division(numerador, denominador, valor_predeterminado=0):
    resultado = valor_predeterminado if denominador == 0 else numerador / denominador
    return resultado

class EvaluateModel:

    def __init__(self, models, verbose = 0):
        self._models = list(models.values())
        self._model_names = list(models.keys())
        self._verbose = verbose

    def _fit_cross_validation(self, model, X, y, num_folds=5):
        cv_scores = cross_validate(
            model,
            X,
            y,
            cv=num_folds,
            scoring=('accuracy', 'precision', 'recall', 'f1'),
        )
        model_fit = model.fit(X, y)

        return model_fit, cv_scores

    def _evaluate_with_dataframe(self, model, dataframe, target_column = 'class', iterations = 10):
        accuracies = []
        precisions = []
        recalls = []
        fscores = []

        for x in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(
                dataframe.loc[:, dataframe.columns != target_column],
                dataframe[target_column]
            )
            X_train = normalize_data(X_train)
            X_test = normalize_data(X_test)
            y_test = y_test.values.astype(int)

            model_fit, cv_scores = self._fit_cross_validation(model, X_train, y_train)
            predictions = model_fit.predict(X_test)

            cm = confusion_matrix(y_test, predictions)
            accuracy, precision, recall, fscore = self.get_metrics(cm)
    
            if self._verbose == 1:
                print(f'Metrics Iteration {x}')
                print(confusion_matrix(y_test, predictions))
                print(f"Accuracy = {accuracy}; Precision = {precision}; Recall = {recall}; fscore = {fscore}\n")
    
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            fscores.append(fscore)

        mean_accuracy = np.mean(accuracies)
        mean_precision = np.mean(precisions)
        mean_recall = np.mean(recalls)
        mean_fscore = np.mean(fscores)
        
        print(f"Accuracy = {mean_accuracy}; Precision = {mean_precision}; Recall = {mean_recall}; fscore = {mean_fscore}\n")

        return mean_accuracy, mean_precision, mean_recall, mean_fscore

    def _accuracy_custom(self, TN, FP, FN, TP):
        return division((TP + TN),(TP + TN + FP + FN))
    
    def _recall_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FN))
    
    def _precision_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FP))
    
    def _fscore_custom(self, recall_metric, precision_metric):
        return division(2, division(1, recall_metric) + (division(1, precision_metric)))

    def get_metrics(self, confusion_matrix):
        TN, FP, FN, TP = confusion_matrix.ravel()
        
        accuracy_metric = self._accuracy_custom(TN, FP, FN, TP)
        precision_metric = self._precision_custom(TN, FP, FN, TP)
        recall_metric = self._recall_custom(TN, FP, FN, TP)
        fscore_metric = self._fscore_custom(recall_metric, precision_metric)
        
        return accuracy_metric, precision_metric, recall_metric, fscore_metric
        
    def evaluate_models_with_dataframe(self, dataframe, target_column = 'class', iterations = 10):
        for i, model in enumerate(self._models):
            print(f'######### {self._model_names[i]} #########')
            self._evaluate_with_dataframe(model, dataframe, target_column, iterations)


# Evaluación de los modelos con datos NO balanceados

In [23]:
models = {
    'LOGISTIC REGRESION': LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),
    'DECISION TREE': DecisionTreeClassifier(random_state=42),
    'KNN 3': KNeighborsClassifier(n_neighbors=3),
    'KNN 5': KNeighborsClassifier(n_neighbors=5),
    'KNN 7': KNeighborsClassifier(n_neighbors=7),
    'KNN 9': KNeighborsClassifier(n_neighbors=9),
    'MOST FREQUEST CLASS': DummyClassifier(strategy='most_frequent'),
    'RANDOM CLASS': DummyClassifier(strategy='uniform'),
}

evaluator = EvaluateModel(models, verbose = 0)
#evaluator.evaluate_models_with_dataframe(dataframe_normalized)

# Evaluación de los modelos con datos balanceados

In [None]:
evaluator.evaluate_models_with_dataframe(dataframe_normalized_balanced)

In [32]:
def pipeline_model(dataframe, evaluator):
    dataframe_balanced = balancear_clase_oversampling(dataframe, 1)
    print(dataframe['class'].value_counts())
    print(dataframe_balanced['class'].value_counts())
    print('STARTING WITH NOT BALANCED')
    evaluator.evaluate_models_with_dataframe(dataframe)
    print('STARTING WITH BALANCED')
    evaluator.evaluate_models_with_dataframe(dataframe_balanced)
    print()

In [None]:
for lang in data.keys():
    print(f"Evaluating with: {lang}")
    pipeline_model(data[lang], evaluator)

Evaluating with: Portuguese-all
class
0    19808
1      995
Name: count, dtype: int64
class
0    19808
1    19808
Name: count, dtype: int64
STARTING WITH NOT BALANCED
######### LOGISTIC REGRESION #########
Accuracy = 0.9755047106325707; Precision = 0.7571920647236781; Recall = 0.7801513573035079; fscore = 0.7482354525584434

######### DECISION TREE #########
Accuracy = 0.9717938857911939; Precision = 0.7840238412152172; Recall = 0.5653730083742281; fscore = 0.6423232185430048

######### KNN 3 #########
Accuracy = 0.9800038454143433; Precision = 0.7728731105078164; Recall = 0.8516288439012717; fscore = 0.8077116867215468

######### KNN 5 #########
Accuracy = 0.9785425879638531; Precision = 0.7312594090252423; Recall = 0.8979628345565471; fscore = 0.8027236869322115

######### KNN 7 #########
Accuracy = 0.9801576619880791; Precision = 0.7686761897585108; Recall = 0.8832738756331164; fscore = 0.8139787266036848

######### KNN 9 #########
Accuracy = 0.9791194001153624; Precision = 0.743973

# Conclusiones generales preliminares
1. Modelo con mejor performance KNN con K = 5 dado que accuracy es 0.99 y fscore es 0.9908
2. Los features asociados a degree de los hermanos y padres hicieron que los modelos mejoraran bastantes