# Proyecto CSI: predecir la palabra root de una oración
Autores: Oriol Catasús Llena, Pablo Arancibia Barahona

Fecha 14 de enero de 2024

In [365]:
# Imports
import os
import csv
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    confusion_matrix,
)
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [366]:
PATH_PUD = './pud26/'
PATH_METRICS_LAL = './metrics/'

In [367]:
# Funcion para obtener los datos de archivos PUD
def get_data(path):
    f = open(path, "r")
    data = []
    
    for x in f:
        data += [[int(y) for y in x.split(' ')]]

    return data

# Libreria LAL
Primero miramos si el conjunto de arboles es correcto. Luego procesamos el conjunto de arboles y generamos un fichero CSV con sus métricas

In [4]:
import lal

def process_treebank(path, lal_file_path):
    errlist = lal.io.check_correctness_treebank(path)
    for err in errlist:
        print(err)
    lal.io.process_treebank(path, lal_file_path)

In [5]:
files = os.listdir(PATH_PUD)
for file_name in files:
    file_path = os.path.join(PATH_PUD, file_name)
    lang = file_name.split('.')[0]
    metrics_file_name = lang + '.csv'
    if os.path.isfile(file_path):
        metrics_file_path = os.path.join(PATH_METRICS_LAL, metrics_file_name)
        process_treebank(file_path, metrics_file_path)

En caso de contar con un archivo de metricas producto de LAL, estas se cargan para su posterior uso

In [368]:
def get_metrics_from_lal(lal_file_path):
    metrics = []
    with open(lal_file_path, 'r') as file:
        csv_reader = csv.DictReader(file, delimiter='\t')
        for row in csv_reader:
            metrics.append(row)
    return metrics

In [369]:
def calcular_grados_hermanos_padre(grafo, vertex):
    padres = set(grafo)
    avg_vertex_sibling = [0] * len(grafo)
    avg_vertex_children = [0] * len(grafo)

    for padre in padres:
        index_hijos = []
        for i in range(len(grafo)):
            if grafo[i] == padre:
                index_hijos.append(i)
 
        vertex_degree = [vertex[hijo] for hijo in index_hijos]

        for hijo in index_hijos:
            avg_vertex_sibling[hijo] = np.mean(vertex_degree)

        if padre != 0:
            avg_vertex_children[padre - 1] = np.mean(vertex_degree)

    return avg_vertex_sibling, avg_vertex_children

In [370]:
def calcular_distancias(grafo):
    n = len(grafo)
    distancias = [[float('inf')] * n for _ in range(n)]

    # Inicializar las distancias conocidas
    for i in range(n):
        distancias[i][i] = 0
        vecino = grafo[i]
        if vecino != 0:
            distancias[i][vecino - 1] = 1  # Peso siempre es 1
            distancias[vecino - 1][i] = 1  # Asegurar bidireccionalidad

    # Calcular las distancias mínimas
    for k in range(n):
        for i in range(n):
            for j in range(n):
                distancias[i][j] = min(distancias[i][j], distancias[i][k] + distancias[k][j])

    return [sum(d)/n for d in distancias], [np.std(d) for d in distancias]

In [371]:
def create_matrix(data, metrics):
    dataframe_dict = {
        'vertex_degree': [],
        'vertex_distance': [],
        'vertex_distance_std': [],
        'sibling_average_degree': [],
        'children_average_degree': [],
        'centre': [],
        'centroid': [],
        'class': [],
    }
    
    for sentence, m in zip(data, metrics):
        dataframe_dict['class'] += [1 if word == 0 else 0 for word in sentence]
        dataframe_dict['centre'] += [1 if int(m['tree_centre1']) == i or int(m['tree_centre2']) == i else 0 for i in range(len(sentence))]
        dataframe_dict['centroid'] += [1 if int(m['tree_centroid1']) == i or int(m['tree_centroid2']) == i else 0 for i in range(len(sentence))]
        
        vertex_degree = [sentence.count(index + 1) for index in range(len(sentence))]
        sibling_degree, children_degree = calcular_grados_hermanos_padre(sentence, vertex_degree)
        dataframe_dict['vertex_degree'] += vertex_degree
        dataframe_dict['sibling_average_degree'] += sibling_degree
        dataframe_dict['children_average_degree'] += children_degree

        average, standar_desviation = calcular_distancias(sentence)
        dataframe_dict['vertex_distance'] += average
        dataframe_dict['vertex_distance_std'] += standar_desviation

    return pd.DataFrame(dataframe_dict)


In [373]:
def normalize_data(dataframe):
    scaler = MinMaxScaler()
    dataframe_normalized = scaler.fit_transform(dataframe)
    dataframe_normalized = pd.DataFrame(dataframe_normalized, columns=dataframe.columns)
    dataframe_normalized['centre'] = dataframe_normalized['centre'].astype(int)
    dataframe_normalized['centroid'] = dataframe_normalized['centroid'].astype(int)

    return dataframe_normalized

# Creación de la matriz
Haciendo uso de las funciones anteriores se crea un matriz normalizada

In [375]:
data = {}
files = os.listdir(PATH_PUD)
for file_name in files:
    file_path = os.path.join(PATH_PUD, file_name)
    if os.path.isfile(file_path):
        lang = file_name.split('.')[0]
        metrics_file_name = lang + '.csv'
        metrics_file_path = os.path.join(PATH_METRICS_LAL, metrics_file_name)
        lang_data = get_data(file_path)
        metrics = get_metrics_from_lal(metrics_file_path)
        data[lang] = create_matrix(lang_data, metrics)
        
print(len(data.keys()))
data['English-all'].head()

20


Unnamed: 0,vertex_degree,vertex_distance,vertex_distance_std,sibling_average_degree,children_average_degree,centre,centroid,class
0,0,3.533333,1.284091,1.0,0.0,0,0,0
1,1,3.266667,1.412641,1.0,3.0,0,0,0
2,0,4.933333,1.948219,0.0,0.0,0,0,0
3,0,4.933333,1.948219,0.0,0.0,0,0,0
4,0,4.933333,1.948219,0.0,0.0,0,0,0


# Balanceo de clases de la matriz

In [376]:
def balancear_clase_oversampling(dataframe, clase_objetivo, target_column='class', random_state=None):
    X = dataframe.loc[:, dataframe.columns != target_column]
    y = dataframe[target_column]
    # Encuentra los índices de la clase objetivo
    indices_clase_objetivo = np.where(y == clase_objetivo)[0]

    # Crea el oversampler
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=random_state)

    # Aplica oversampling solo a la clase objetivo
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # Convierte los resultados a DataFrames si X y y son de tipo DataFrame
    if isinstance(X, pd.DataFrame):
        X_balanced = pd.DataFrame(X_resampled, columns=X.columns)
    else:
        X_balanced = np.array(X_resampled)

    if isinstance(y, pd.Series):
        y_balanced = pd.Series(y_resampled, name=y.name)
    else:
        y_balanced = np.array(y_resampled)

    X_balanced[target_column] = y_balanced
    return X_balanced


In [377]:
dataframe_normalized = data['English-all']
dataframe_normalized_balanced = balancear_clase_oversampling(dataframe_normalized, 1)
dataframe_normalized_balanced.head(30)

Unnamed: 0,vertex_degree,vertex_distance,vertex_distance_std,sibling_average_degree,children_average_degree,centre,centroid,class
0,0,3.533333,1.284091,1.0,0.0,0,0,0
1,1,3.266667,1.412641,1.0,3.0,0,0,0
2,0,4.933333,1.948219,0.0,0.0,0,0,0
3,0,4.933333,1.948219,0.0,0.0,0,0,0
4,0,4.933333,1.948219,0.0,0.0,0,0,0
5,3,4.0,1.807392,3.0,0.0,0,0,0
6,0,3.533333,1.284091,1.0,0.0,0,0,0
7,4,2.6,1.143095,2.5,1.0,1,0,0
8,0,4.266667,1.631632,0.0,0.0,0,0,0
9,0,4.266667,1.631632,0.0,0.0,0,0,0


In [378]:
print(dataframe_normalized['class'].value_counts())
print(dataframe_normalized_balanced['class'].value_counts())

class
0    17711
1      995
Name: count, dtype: int64
class
0    17711
1    17711
Name: count, dtype: int64


# Creación de pipeline de entrenamiento
Se crea una clase de evaluación de multiples modelos para una mejor escalabilidad

In [401]:
def division(numerador, denominador, valor_predeterminado=0):
    resultado = valor_predeterminado if denominador == 0 else numerador / denominador
    return resultado

class EvaluateModel:

    def __init__(self, models, verbose = 0):
        self._models = list(models.values())
        self._model_names = list(models.keys())
        self._verbose = verbose

    def _fit_cross_validation(self, model, X, y, num_folds=5):
        cv_scores = cross_validate(
            model,
            X,
            y,
            cv=num_folds,
            scoring=('accuracy', 'precision', 'recall', 'f1'),
        )
        model_fit = model.fit(X, y)

        return model_fit, cv_scores

    def _evaluate_with_dataframe(self, model, dataframe, target_column = 'class', iterations = 10):
        accuracies = []
        precisions = []
        recalls = []
        fscores = []

        for x in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(
                dataframe.loc[:, dataframe.columns != target_column],
                dataframe[target_column]
            )
            X_train = normalize_data(X_train)
            X_test = normalize_data(X_test)
            y_test = y_test.values.astype(int)

            model_fit, cv_scores = self._fit_cross_validation(model, X_train, y_train)
            predictions = model_fit.predict(X_test)

            cm = confusion_matrix(y_test, predictions)
            accuracy, precision, recall, fscore = self.get_metrics(cm)
    
            if self._verbose == 1:
                print(f'Metrics Iteration {x}')
                print(confusion_matrix(y_test, predictions))
                print(f"Accuracy = {accuracy}; Precision = {precision}; Recall = {recall}; fscore = {fscore}\n")
    
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            fscores.append(fscore)

        mean_accuracy = np.mean(accuracies)
        mean_precision = np.mean(precisions)
        mean_recall = np.mean(recalls)
        mean_fscore = np.mean(fscores)
        
        print(f"Accuracy = {mean_accuracy}; Precision = {mean_precision}; Recall = {mean_recall}; fscore = {mean_fscore}\n")

        return mean_accuracy, mean_precision, mean_recall, mean_fscore

    def _accuracy_custom(self, TN, FP, FN, TP):
        return division((TP + TN),(TP + TN + FP + FN))
    
    def _recall_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FN))
    
    def _precision_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FP))
    
    def _fscore_custom(self, recall_metric, precision_metric):
        return division(2, division(1, recall_metric) + (division(1, precision_metric)))

    def get_metrics(self, confusion_matrix):
        TN, FP, FN, TP = confusion_matrix.ravel()
        
        accuracy_metric = self._accuracy_custom(TN, FP, FN, TP)
        precision_metric = self._precision_custom(TN, FP, FN, TP)
        recall_metric = self._recall_custom(TN, FP, FN, TP)
        fscore_metric = self._fscore_custom(recall_metric, precision_metric)
        
        return accuracy_metric, precision_metric, recall_metric, fscore_metric
        
    def evaluate_models_with_dataframe(self, dataframe, target_column = 'class', iterations = 10):
        metrics_model = {}
        for i, model in enumerate(self._models):
            print(f'######### {self._model_names[i]} #########')
            accuracy, precision, recall, fscore = self._evaluate_with_dataframe(model, dataframe, target_column, iterations)

            metrics_model[self._model_names[i]] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'fscore': fscore,
            }
        return metrics_model


# Evaluación de los modelos con datos NO balanceados

In [403]:
models = {
    'LOGISTIC REGRESION': LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),
    'DECISION TREE': DecisionTreeClassifier(random_state=42),
    'KNN 3': KNeighborsClassifier(n_neighbors=3),
    'KNN 5': KNeighborsClassifier(n_neighbors=5),
    'KNN 7': KNeighborsClassifier(n_neighbors=7),
    'KNN 9': KNeighborsClassifier(n_neighbors=9),
    'MOST FREQUEST CLASS': DummyClassifier(strategy='most_frequent'),
    'RANDOM CLASS': DummyClassifier(strategy='uniform'),
}

evaluator = EvaluateModel(models, verbose = 0)

In [404]:
evaluator.evaluate_models_with_dataframe(dataframe_normalized)

######### LOGISTIC REGRESION #########
Accuracy = 0.9824887748556768; Precision = 0.8089854768730198; Recall = 0.8874998322345119; fscore = 0.8430278610879386

######### DECISION TREE #########
Accuracy = 0.9657900363480862; Precision = 0.8835250942425859; Recall = 0.44027920217531646; fscore = 0.5604140438308265

######### KNN 3 #########
Accuracy = 0.984327560401967; Precision = 0.8197106811085468; Recall = 0.9060047535462694; fscore = 0.8593661960719329

######### KNN 5 #########
Accuracy = 0.983793029719906; Precision = 0.8039419431062924; Recall = 0.9253942795323209; fscore = 0.8591022305247222

######### KNN 7 #########
Accuracy = 0.9827667308103486; Precision = 0.7832296195305173; Recall = 0.9366786076731259; fscore = 0.8517027186214176

######### KNN 9 #########
Accuracy = 0.9860594398118453; Precision = 0.841809444518527; Recall = 0.9173064810664254; fscore = 0.8763177090137896

######### MOST FREQUEST CLASS #########
Accuracy = 0.9459696386572588; Precision = 0.0; Recall = 0.

# Evaluación de los modelos con datos balanceados

In [382]:
evaluator.evaluate_models_with_dataframe(dataframe_normalized_balanced)

######### LOGISTIC REGRESION #########
Accuracy = 0.9726400180668472; Precision = 0.9647287224681804; Recall = 0.9810504133293669; fscore = 0.9727804864280344

######### DECISION TREE #########
Accuracy = 0.9732497741644082; Precision = 0.9924453205004504; Recall = 0.953981596915753; fscore = 0.9726864203843503

######### KNN 3 #########
Accuracy = 0.9905149051490515; Precision = 0.9850967177427703; Recall = 0.996160450445621; fscore = 0.9905824220553061

######### KNN 5 #########
Accuracy = 0.991576332429991; Precision = 0.9839250745639646; Recall = 0.9995021517262922; fscore = 0.9916493231915606

######### KNN 7 #########
Accuracy = 0.9903116531165311; Precision = 0.9812398466577175; Recall = 0.9997980538233696; fscore = 0.9904304733199982

######### KNN 9 #########
Accuracy = 0.9891937669376694; Precision = 0.9790511272676727; Recall = 0.9997751345344927; fscore = 0.9893031910555783

######### MOST FREQUEST CLASS #########
Accuracy = 0.49620596205962064; Precision = 0.19911924119241

In [405]:
def pipeline_model(dataframe, evaluator):
    dataframe_balanced = balancear_clase_oversampling(dataframe, 1)
    print(dataframe['class'].value_counts())
    print(dataframe_balanced['class'].value_counts())
    
    print('\n STARTING WITH NOT BALANCED')
    metrics_no_balanced = evaluator.evaluate_models_with_dataframe(dataframe)
    
    print('STARTING WITH BALANCED')
    metrics_balanced = evaluator.evaluate_models_with_dataframe(dataframe_balanced)
    print()

    return {
        'no_balanced': metrics_no_balanced,
        'balanced': metrics_balanced,
    }

In [415]:
langs = [
    'Arabic-all',
    'Chinese-all',
    'Czech-all',
    'English-all',
    'Finnish-all',
    'French-all',
    'German-all',
    'Hindi-all',
    'Icelandic-all',
    'Indonesian-all',
    'Italian-all',
    'Japanese-all',
    'Korean-all',
    'Polish-all',
    'Portuguese-all',
    'Russian-all',
    'Spanish-all',
    'Swedish-all',
    'Thai-all',
    'Turkish-all'
]

In [416]:
languages_metrics = {}
for lang in langs:
    print(f"EVALUATING WITH: {lang} \n")
    languages_metrics[lang] = pipeline_model(data[lang], evaluator)

EVALUATING WITH: Arabic-all 

class
0    17514
1      995
Name: count, dtype: int64
class
1    17514
0    17514
Name: count, dtype: int64

 STARTING WITH NOT BALANCED
######### LOGISTIC REGRESION #########
Accuracy = 0.9676966292134832; Precision = 0.6911524312273389; Recall = 0.8063001949581944; fscore = 0.7282976802778509

######### DECISION TREE #########
Accuracy = 0.9624459809853068; Precision = 0.7258586032892305; Recall = 0.4663581803754021; fscore = 0.5460023399935429

######### KNN 3 #########
Accuracy = 0.9716508210890232; Precision = 0.7278451923992841; Recall = 0.7651513451444995; fscore = 0.7340196226835253

######### KNN 5 #########
Accuracy = 0.9741356957649092; Precision = 0.7429060651062424; Recall = 0.8112219641936006; fscore = 0.7646889456382389

######### KNN 7 #########
Accuracy = 0.9751512532411409; Precision = 0.7629389952524583; Recall = 0.8086061456714372; fscore = 0.7739048165579469

######### KNN 9 #########
Accuracy = 0.9696629213483146; Precision = 0.667605

In [417]:
languages_metrics

{'Arabic-all': {'no_balanced': {'LOGISTIC REGRESION': {'accuracy': 0.9676966292134832,
    'precision': 0.6911524312273389,
    'recall': 0.8063001949581944,
    'fscore': 0.7282976802778509},
   'DECISION TREE': {'accuracy': 0.9624459809853068,
    'precision': 0.7258586032892305,
    'recall': 0.4663581803754021,
    'fscore': 0.5460023399935429},
   'KNN 3': {'accuracy': 0.9716508210890232,
    'precision': 0.7278451923992841,
    'recall': 0.7651513451444995,
    'fscore': 0.7340196226835253},
   'KNN 5': {'accuracy': 0.9741356957649092,
    'precision': 0.7429060651062424,
    'recall': 0.8112219641936006,
    'fscore': 0.7646889456382389},
   'KNN 7': {'accuracy': 0.9751512532411409,
    'precision': 0.7629389952524583,
    'recall': 0.8086061456714372,
    'fscore': 0.7739048165579469},
   'KNN 9': {'accuracy': 0.9696629213483146,
    'precision': 0.6676055199341759,
    'recall': 0.884206402936838,
    'fscore': 0.7595837191108503},
   'MOST FREQUEST CLASS': {'accuracy': 0.9454

In [430]:
dataframe_metrics = pd.DataFrame.from_dict(
    {(i, j, k): 
        languages_metrics[i][j][k]
        for i in languages_metrics.keys()
        for j in languages_metrics[i].keys()
        for k in languages_metrics[i][j].keys()
    },
    orient='index'
)

dataframe_metrics.to_csv('evaluation_metrics.csv')
dataframe_metrics.head(20)

Unnamed: 0,Unnamed: 1,Unnamed: 2,accuracy,precision,recall,fscore
Arabic-all,no_balanced,LOGISTIC REGRESION,0.967697,0.691152,0.8063,0.728298
Arabic-all,no_balanced,DECISION TREE,0.962446,0.725859,0.466358,0.546002
Arabic-all,no_balanced,KNN 3,0.971651,0.727845,0.765151,0.73402
Arabic-all,no_balanced,KNN 5,0.974136,0.742906,0.811222,0.764689
Arabic-all,no_balanced,KNN 7,0.975151,0.762939,0.808606,0.773905
Arabic-all,no_balanced,KNN 9,0.969663,0.667606,0.884206,0.759584
Arabic-all,no_balanced,MOST FREQUEST CLASS,0.945484,0.0,0.0,0.0
Arabic-all,no_balanced,RANDOM CLASS,0.499957,0.054466,0.49689,0.098115
Arabic-all,balanced,LOGISTIC REGRESION,0.945301,0.933903,0.9587,0.945994
Arabic-all,balanced,DECISION TREE,0.906109,0.983903,0.824792,0.896406


# Conclusiones generales preliminares
1. Modelo con mejor performance KNN con K = 5 dado que accuracy es 0.99 y fscore es 0.9908
2. Los features asociados a degree de los hermanos y padres hicieron que los modelos mejoraran bastantes