# Proyecto CSI: predecir la palabra root de una oración
Autores: Oriol Catasús Llena, Pablo Arancibia Barahona

Fecha 14 de enero de 2024

In [1]:
# Imports
import os
import csv
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import (
    confusion_matrix,
)
from sklearn.exceptions import UndefinedMetricWarning
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler

warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

In [2]:
PATH_PUD = './pud26/'
PATH_METRICS_LAL = './metrics/'

In [3]:
# Funcion para obtener los datos de archivos PUD
def get_data(path):
    f = open(path, "r")
    data = []
    
    for x in f:
        data += [[int(y) for y in x.split(' ')]]

    return data

# Libreria LAL
Primero miramos si el conjunto de arboles es correcto. Luego procesamos el conjunto de arboles y generamos un fichero CSV con sus métricas

In [4]:
import lal

def process_treebank(path, lal_file_path):
    errlist = lal.io.check_correctness_treebank(path)
    for err in errlist:
        print(err)
    lal.io.process_treebank(path, lal_file_path)

In [5]:
files = os.listdir(PATH_PUD)
for file_name in files:
    file_path = os.path.join(PATH_PUD, file_name)
    lang = file_name.split('.')[0]
    metrics_file_name = lang + '.csv'
    if os.path.isfile(file_path):
        metrics_file_path = os.path.join(PATH_METRICS_LAL, metrics_file_name)
        process_treebank(file_path, metrics_file_path)

En caso de contar con un archivo de metricas producto de LAL, estas se cargan para su posterior uso

In [4]:
def get_metrics_from_lal(lal_file_path):
    metrics = []
    with open(lal_file_path, 'r') as file:
        csv_reader = csv.DictReader(file, delimiter='\t')
        for row in csv_reader:
            metrics.append(row)
    return metrics

In [5]:
def calcular_grados_hermanos_padre(grafo, vertex):
    padres = set(grafo)
    avg_vertex_sibling = [0] * len(grafo)
    avg_vertex_children = [0] * len(grafo)

    for padre in padres:
        index_hijos = []
        for i in range(len(grafo)):
            if grafo[i] == padre:
                index_hijos.append(i)
 
        vertex_degree = [vertex[hijo] for hijo in index_hijos]

        for hijo in index_hijos:
            avg_vertex_sibling[hijo] = np.mean(vertex_degree)

        if padre != 0:
            avg_vertex_children[padre - 1] = np.mean(vertex_degree)

    return avg_vertex_sibling, avg_vertex_children

In [6]:
def calcular_distancias(grafo):
    n = len(grafo)
    distancias = [[float('inf')] * n for _ in range(n)]

    # Inicializar las distancias conocidas
    for i in range(n):
        distancias[i][i] = 0
        vecino = grafo[i]
        if vecino != 0:
            distancias[i][vecino - 1] = 1  # Peso siempre es 1
            distancias[vecino - 1][i] = 1  # Asegurar bidireccionalidad

    # Calcular las distancias mínimas
    for k in range(n):
        for i in range(n):
            for j in range(n):
                distancias[i][j] = min(distancias[i][j], distancias[i][k] + distancias[k][j])

    return [sum(d)/n for d in distancias], [np.std(d) for d in distancias]

In [73]:
def create_matrix(data, metrics):
    dataframe_dict = {
        'vertex_degree': [],
        'vertex_distance': [],
        'centre': [],
        'centroid': [],
        'vertex_distance_std': [],
        'sibling_average_degree': [],
        'children_average_degree': [],
        'class': [],
    }
    
    for sentence, m in zip(data, metrics):
        dataframe_dict['class'] += [1 if word == 0 else 0 for word in sentence]
        dataframe_dict['centre'] += [1 if int(m['tree_centre1']) == i or int(m['tree_centre2']) == i else 0 for i in range(len(sentence))]
        dataframe_dict['centroid'] += [1 if int(m['tree_centroid1']) == i or int(m['tree_centroid2']) == i else 0 for i in range(len(sentence))]
        
        vertex_degree = [sentence.count(index + 1) for index in range(len(sentence))]
        sibling_degree, children_degree = calcular_grados_hermanos_padre(sentence, vertex_degree)
        dataframe_dict['vertex_degree'] += vertex_degree
        dataframe_dict['sibling_average_degree'] += sibling_degree
        dataframe_dict['children_average_degree'] += children_degree

        average, standar_desviation = calcular_distancias(sentence)
        dataframe_dict['vertex_distance'] += average
        dataframe_dict['vertex_distance_std'] += standar_desviation

    return pd.DataFrame(dataframe_dict)


In [52]:
def normalize_data(dataframe):
    scaler = MinMaxScaler()
    dataframe_normalized = scaler.fit_transform(dataframe)
    dataframe_normalized = pd.DataFrame(dataframe_normalized, columns=dataframe.columns)
    dataframe_normalized['centre'] = dataframe_normalized['centre'].astype(int)
    dataframe_normalized['centroid'] = dataframe_normalized['centroid'].astype(int)

    return dataframe_normalized

# Creación de la matriz
Haciendo uso de las funciones anteriores se crea un matriz normalizada

In [74]:
data = {}
files = os.listdir(PATH_PUD)
use_only_english = True # En caso de solo usar los archivos de ingles

for file_name in files:
    if use_only_english:
        file_name = 'English-all.heads'
    file_path = os.path.join(PATH_PUD, file_name)
    if os.path.isfile(file_path):
        lang = file_name.split('.')[0]
        metrics_file_name = lang + '.csv'
        metrics_file_path = os.path.join(PATH_METRICS_LAL, metrics_file_name)
        lang_data = get_data(file_path)
        metrics = get_metrics_from_lal(metrics_file_path)
        data[lang] = create_matrix(lang_data, metrics)
    if use_only_english:
        break
        
print(f'Se cargaron {len(data.keys())} lenguajes')
data['English-all'].head()

Se cargaron 1 lenguajes


Unnamed: 0,vertex_degree,vertex_distance,centre,centroid,vertex_distance_std,sibling_average_degree,children_average_degree,class
0,0,3.533333,0,0,1.284091,1.0,0.0,0
1,1,3.266667,0,0,1.412641,1.0,3.0,0
2,0,4.933333,0,0,1.948219,0.0,0.0,0
3,0,4.933333,0,0,1.948219,0.0,0.0,0
4,0,4.933333,0,0,1.948219,0.0,0.0,0


In [75]:
data['English-all'].to_csv('dataframe_with_7_features.csv')

# Balanceo de clases de la matriz

In [54]:
def balancear_clase_oversampling(dataframe, clase_objetivo, target_column='class', random_state=None):
    X = dataframe.loc[:, dataframe.columns != target_column]
    y = dataframe[target_column]
    # Encuentra los índices de la clase objetivo
    indices_clase_objetivo = np.where(y == clase_objetivo)[0]

    # Crea el oversampler
    oversampler = RandomOverSampler(sampling_strategy='auto', random_state=random_state)

    # Aplica oversampling solo a la clase objetivo
    X_resampled, y_resampled = oversampler.fit_resample(X, y)

    # Convierte los resultados a DataFrames si X y y son de tipo DataFrame
    if isinstance(X, pd.DataFrame):
        X_balanced = pd.DataFrame(X_resampled, columns=X.columns)
    else:
        X_balanced = np.array(X_resampled)

    if isinstance(y, pd.Series):
        y_balanced = pd.Series(y_resampled, name=y.name)
    else:
        y_balanced = np.array(y_resampled)

    X_balanced[target_column] = y_balanced
    return X_balanced


In [55]:
dataframe_normalized = data['English-all']
dataframe_normalized_balanced = balancear_clase_oversampling(dataframe_normalized, 1)
dataframe_normalized_balanced.head(30)

Unnamed: 0,vertex_degree,vertex_distance,centre,centroid,vertex_distance_std,sibling_average_degree,children_average_degree,class
0,0,3.533333,0,0,1.284091,1.0,0.0,0
1,1,3.266667,0,0,1.412641,1.0,3.0,0
2,0,4.933333,0,0,1.948219,0.0,0.0,0
3,0,4.933333,0,0,1.948219,0.0,0.0,0
4,0,4.933333,0,0,1.948219,0.0,0.0,0
5,3,4.0,0,0,1.807392,3.0,0.0,0
6,0,3.533333,0,0,1.284091,1.0,0.0,0
7,4,2.6,1,0,1.143095,2.5,1.0,0
8,0,4.266667,0,0,1.631632,0.0,0.0,0
9,0,4.266667,0,0,1.631632,0.0,0.0,0


In [56]:
print(dataframe_normalized['class'].value_counts())
print(dataframe_normalized_balanced['class'].value_counts())

class
0    17711
1      995
Name: count, dtype: int64
class
0    17711
1    17711
Name: count, dtype: int64


# Creación de pipeline de entrenamiento
Se crea una clase de evaluación de multiples modelos para una mejor escalabilidad

In [57]:
def division(numerador, denominador, valor_predeterminado=0):
    resultado = valor_predeterminado if denominador == 0 else numerador / denominador
    return resultado

class EvaluateModel:

    def __init__(self, models, verbose = 0):
        self._models = list(models.values())
        self._model_names = list(models.keys())
        self._verbose = verbose

    def _fit_cross_validation(self, model, X, y, num_folds=5):
        cv_scores = cross_validate(
            model,
            X,
            y,
            cv=num_folds,
            scoring=('accuracy', 'precision', 'recall', 'f1'),
        )
        model_fit = model.fit(X, y)

        return model_fit, cv_scores

    def _evaluate_with_dataframe(self, model, dataframe, target_column = 'class', iterations = 10):
        accuracies = []
        precisions = []
        recalls = []
        fscores = []

        for x in range(iterations):
            X_train, X_test, y_train, y_test = train_test_split(
                dataframe.loc[:, dataframe.columns != target_column],
                dataframe[target_column]
            )
            X_train = normalize_data(X_train)
            X_test = normalize_data(X_test)
            y_test = y_test.values.astype(int)

            model_fit, cv_scores = self._fit_cross_validation(model, X_train, y_train)
            predictions = model_fit.predict(X_test)

            cm = confusion_matrix(y_test, predictions)
            accuracy, precision, recall, fscore = self.get_metrics(cm)
    
            if self._verbose == 1:
                print(f'Metrics Iteration {x}')
                print(confusion_matrix(y_test, predictions))
                print(f"Accuracy = {accuracy}; Precision = {precision}; Recall = {recall}; fscore = {fscore}\n")
    
            accuracies.append(accuracy)
            precisions.append(precision)
            recalls.append(recall)
            fscores.append(fscore)

        mean_accuracy = np.mean(accuracies)
        mean_precision = np.mean(precisions)
        mean_recall = np.mean(recalls)
        mean_fscore = np.mean(fscores)
        
        print(f"Accuracy = {mean_accuracy}; Precision = {mean_precision}; Recall = {mean_recall}; fscore = {mean_fscore}\n")

        return mean_accuracy, mean_precision, mean_recall, mean_fscore

    def _accuracy_custom(self, TN, FP, FN, TP):
        return division((TP + TN),(TP + TN + FP + FN))
    
    def _recall_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FN))
    
    def _precision_custom(self, TN, FP, FN, TP):
        return division((TP),(TP + FP))
    
    def _fscore_custom(self, recall_metric, precision_metric):
        return division(2, division(1, recall_metric) + (division(1, precision_metric)))

    def get_metrics(self, confusion_matrix):
        TN, FP, FN, TP = confusion_matrix.ravel()
        
        accuracy_metric = self._accuracy_custom(TN, FP, FN, TP)
        precision_metric = self._precision_custom(TN, FP, FN, TP)
        recall_metric = self._recall_custom(TN, FP, FN, TP)
        fscore_metric = self._fscore_custom(recall_metric, precision_metric)
        
        return accuracy_metric, precision_metric, recall_metric, fscore_metric
        
    def evaluate_models_with_dataframe(self, dataframe, target_column = 'class', iterations = 10):
        metrics_model = {}
        for i, model in enumerate(self._models):
            print(f'######### {self._model_names[i]} #########')
            accuracy, precision, recall, fscore = self._evaluate_with_dataframe(model, dataframe, target_column, iterations)

            metrics_model[self._model_names[i]] = {
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'fscore': fscore,
            }
        return metrics_model


# Evaluación de los modelos con datos NO balanceados

In [58]:
models = {
    'LOGISTIC REGRESION': LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial'),
    'DECISION TREE': DecisionTreeClassifier(random_state=42),
    'KNN 3': KNeighborsClassifier(n_neighbors=3),
    'KNN 5': KNeighborsClassifier(n_neighbors=5),
    'KNN 7': KNeighborsClassifier(n_neighbors=7),
    'KNN 9': KNeighborsClassifier(n_neighbors=9),
    'MOST FREQUEST CLASS': DummyClassifier(strategy='most_frequent'),
    'RANDOM CLASS': DummyClassifier(strategy='uniform'),
}

evaluator = EvaluateModel(models, verbose = 0)

In [21]:
evaluator.evaluate_models_with_dataframe(dataframe_normalized)

######### LOGISTIC REGRESION #########
Accuracy = 0.9465469317938849; Precision = 0.5430030135963971; Recall = 0.3082642362054151; fscore = 0.3711024482628394

######### DECISION TREE #########
Accuracy = 0.9359846055163568; Precision = 0.380156913836601; Recall = 0.3197304375734148; fscore = 0.34663998285392833

######### KNN 3 #########
Accuracy = 0.9379944408809064; Precision = 0.4129039168285856; Recall = 0.34953020144997904; fscore = 0.37554160364292477

######### KNN 5 #########
Accuracy = 0.945969638657259; Precision = 0.46505485176506667; Recall = 0.30492925611838373; fscore = 0.3669914849967673

######### KNN 7 #########
Accuracy = 0.9442591404746633; Precision = 0.482363731663279; Recall = 0.28694832036613416; fscore = 0.3567985582283237

######### KNN 9 #########
Accuracy = 0.9472525122942057; Precision = 0.48973114149240127; Recall = 0.2719977786514248; fscore = 0.3435455317563457

######### MOST FREQUEST CLASS #########
Accuracy = 0.9472097498396408; Precision = 0.0; Recal

{'LOGISTIC REGRESION': {'accuracy': 0.9465469317938849,
  'precision': 0.5430030135963971,
  'recall': 0.3082642362054151,
  'fscore': 0.3711024482628394},
 'DECISION TREE': {'accuracy': 0.9359846055163568,
  'precision': 0.380156913836601,
  'recall': 0.3197304375734148,
  'fscore': 0.34663998285392833},
 'KNN 3': {'accuracy': 0.9379944408809064,
  'precision': 0.4129039168285856,
  'recall': 0.34953020144997904,
  'fscore': 0.37554160364292477},
 'KNN 5': {'accuracy': 0.945969638657259,
  'precision': 0.46505485176506667,
  'recall': 0.30492925611838373,
  'fscore': 0.3669914849967673},
 'KNN 7': {'accuracy': 0.9442591404746633,
  'precision': 0.482363731663279,
  'recall': 0.28694832036613416,
  'fscore': 0.3567985582283237},
 'KNN 9': {'accuracy': 0.9472525122942057,
  'precision': 0.48973114149240127,
  'recall': 0.2719977786514248,
  'fscore': 0.3435455317563457},
 'MOST FREQUEST CLASS': {'accuracy': 0.9472097498396408,
  'precision': 0.0,
  'recall': 0.0,
  'fscore': 0.0},
 'RAN

# Evaluación de los modelos con datos balanceados

In [22]:
evaluator.evaluate_models_with_dataframe(dataframe_normalized_balanced)

######### LOGISTIC REGRESION #########
Accuracy = 0.8764001806684734; Precision = 0.8674992935626419; Recall = 0.887797716416614; fscore = 0.877500504281912

######### DECISION TREE #########
Accuracy = 0.7217140921409213; Precision = 0.8786673095224586; Recall = 0.512915490387406; fscore = 0.6473609291196045

######### KNN 3 #########
Accuracy = 0.7965560072267389; Precision = 0.8850229084089577; Recall = 0.6829622796949321; fscore = 0.7705339904980771

######### KNN 5 #########
Accuracy = 0.8316508581752485; Precision = 0.8774731217857965; Recall = 0.7713347083779484; fscore = 0.8209206847382801

######### KNN 7 #########
Accuracy = 0.8492321589882564; Precision = 0.8704906541800197; Recall = 0.821882701392226; fscore = 0.8454201668326207

######### KNN 9 #########
Accuracy = 0.8536924119241192; Precision = 0.859706840797176; Recall = 0.8450894522304608; fscore = 0.8522056142980119

######### MOST FREQUEST CLASS #########
Accuracy = 0.4955736224028907; Precision = 0.09901761517615176

{'LOGISTIC REGRESION': {'accuracy': 0.8764001806684734,
  'precision': 0.8674992935626419,
  'recall': 0.887797716416614,
  'fscore': 0.877500504281912},
 'DECISION TREE': {'accuracy': 0.7217140921409213,
  'precision': 0.8786673095224586,
  'recall': 0.512915490387406,
  'fscore': 0.6473609291196045},
 'KNN 3': {'accuracy': 0.7965560072267389,
  'precision': 0.8850229084089577,
  'recall': 0.6829622796949321,
  'fscore': 0.7705339904980771},
 'KNN 5': {'accuracy': 0.8316508581752485,
  'precision': 0.8774731217857965,
  'recall': 0.7713347083779484,
  'fscore': 0.8209206847382801},
 'KNN 7': {'accuracy': 0.8492321589882564,
  'precision': 0.8704906541800197,
  'recall': 0.821882701392226,
  'fscore': 0.8454201668326207},
 'KNN 9': {'accuracy': 0.8536924119241192,
  'precision': 0.859706840797176,
  'recall': 0.8450894522304608,
  'fscore': 0.8522056142980119},
 'MOST FREQUEST CLASS': {'accuracy': 0.4955736224028907,
  'precision': 0.09901761517615176,
  'recall': 0.2,
  'fscore': 0.13

In [59]:
def pipeline_model(dataframe, evaluator):
    dataframe_balanced = balancear_clase_oversampling(dataframe, 1)
    print(dataframe['class'].value_counts())
    print(dataframe_balanced['class'].value_counts())
    
    print('\n STARTING WITH NOT BALANCED')
    metrics_no_balanced = evaluator.evaluate_models_with_dataframe(dataframe)
    
    print('STARTING WITH BALANCED')
    metrics_balanced = evaluator.evaluate_models_with_dataframe(dataframe_balanced)
    print()

    return {
        'no_balanced': metrics_no_balanced,
        'balanced': metrics_balanced,
    }

In [36]:
langs = [
    'Arabic-all',
    'Chinese-all',
    'Czech-all',
    'English-all',
    'Finnish-all',
    'French-all',
    'German-all',
    'Hindi-all',
    'Icelandic-all',
    'Indonesian-all',
    'Italian-all',
    'Japanese-all',
    'Korean-all',
    'Polish-all',
    'Portuguese-all',
    'Russian-all',
    'Spanish-all',
    'Swedish-all',
    'Thai-all',
    'Turkish-all'
]

In [60]:
languages_metrics = {}
if use_only_english: # variable definida más arriba
    langs = ['English-all']

for lang in langs:
    print(f"EVALUATING WITH: {lang} \n")
    languages_metrics[lang] = pipeline_model(data[lang], evaluator)

EVALUATING WITH: English-all 

class
0    17711
1      995
Name: count, dtype: int64
class
0    17711
1    17711
Name: count, dtype: int64

 STARTING WITH NOT BALANCED
######### LOGISTIC REGRESION #########
Accuracy = 0.9824032499465469; Precision = 0.8192437902620924; Recall = 0.8706401869385498; fscore = 0.8388637576660777

######### DECISION TREE #########
Accuracy = 0.9757750694889886; Precision = 0.8723575698236543; Recall = 0.646505173487735; fscore = 0.7235621970240323

######### KNN 3 #########
Accuracy = 0.9856318152661963; Precision = 0.8463731619393275; Recall = 0.8976876258832259; fscore = 0.8691698848339326

######### KNN 5 #########
Accuracy = 0.9856531964934788; Precision = 0.8249005795633542; Recall = 0.9250378536317297; fscore = 0.8696660863396876

######### KNN 7 #########
Accuracy = 0.986572589266624; Precision = 0.8500621587136596; Recall = 0.915486577319162; fscore = 0.8799921701739575

######### KNN 9 #########
Accuracy = 0.9830660679923028; Precision = 0.80090227

In [61]:
languages_metrics

{'English-all': {'no_balanced': {'LOGISTIC REGRESION': {'accuracy': 0.9824032499465469,
    'precision': 0.8192437902620924,
    'recall': 0.8706401869385498,
    'fscore': 0.8388637576660777},
   'DECISION TREE': {'accuracy': 0.9757750694889886,
    'precision': 0.8723575698236543,
    'recall': 0.646505173487735,
    'fscore': 0.7235621970240323},
   'KNN 3': {'accuracy': 0.9856318152661963,
    'precision': 0.8463731619393275,
    'recall': 0.8976876258832259,
    'fscore': 0.8691698848339326},
   'KNN 5': {'accuracy': 0.9856531964934788,
    'precision': 0.8249005795633542,
    'recall': 0.9250378536317297,
    'fscore': 0.8696660863396876},
   'KNN 7': {'accuracy': 0.986572589266624,
    'precision': 0.8500621587136596,
    'recall': 0.915486577319162,
    'fscore': 0.8799921701739575},
   'KNN 9': {'accuracy': 0.9830660679923028,
    'precision': 0.800902276290689,
    'recall': 0.919200701394425,
    'fscore': 0.8529658161160321},
   'MOST FREQUEST CLASS': {'accuracy': 0.9481291

In [62]:
dataframe_metrics = pd.DataFrame.from_dict(
    {(i, j, k): 
        languages_metrics[i][j][k]
        for i in languages_metrics.keys()
        for j in languages_metrics[i].keys()
        for k in languages_metrics[i][j].keys()
    },
    orient='index'
)

dataframe_metrics.to_csv('evaluation_metrics_english7.csv')
dataframe_metrics.head(20)

Unnamed: 0,Unnamed: 1,Unnamed: 2,accuracy,precision,recall,fscore
English-all,no_balanced,LOGISTIC REGRESION,0.982403,0.819244,0.87064,0.838864
English-all,no_balanced,DECISION TREE,0.975775,0.872358,0.646505,0.723562
English-all,no_balanced,KNN 3,0.985632,0.846373,0.897688,0.86917
English-all,no_balanced,KNN 5,0.985653,0.824901,0.925038,0.869666
English-all,no_balanced,KNN 7,0.986573,0.850062,0.915487,0.879992
English-all,no_balanced,KNN 9,0.983066,0.800902,0.919201,0.852966
English-all,no_balanced,MOST FREQUEST CLASS,0.948129,0.0,0.0,0.0
English-all,no_balanced,RANDOM CLASS,0.498781,0.053058,0.497437,0.09584
English-all,balanced,LOGISTIC REGRESION,0.973092,0.967256,0.979401,0.973267
English-all,balanced,DECISION TREE,0.971003,0.992091,0.949673,0.970358


# Conclusiones generales preliminares
1. Modelo con mejor performance KNN con K = 5 dado que accuracy es 0.99 y fscore es 0.9908
2. Los features asociados a degree de los hermanos y padres hicieron que los modelos mejoraran bastantes