## Practical Work 3
### Session 2: Validation of the results
#### Lluis Pellicer Juan y Jorge De la Cruz Martínez

In [1]:
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import cross_val_predict, KFold, StratifiedKFold
from sklearn.metrics import f1_score

In [2]:
# Cargar datos
carpeta_datos = "C:/Users/JUAN LENOVO/Desktop/3 GCD/2º cuatrimestre/LNR/PRÁCRICA 7/Dataset-Oppositional"

# Rutas de archivos
ruta_entrenamiento_en = os.path.join(carpeta_datos, "training", "dataset_oppositional", "dataset_en_train.json")
ruta_entrenamiento_es = os.path.join(carpeta_datos, "training", "dataset_oppositional", "dataset_es_train.json")
ruta_test_en = os.path.join(carpeta_datos, "test", "dataset_oppositional_test_nolabels", "dataset_en_official_test_nolabels.json")
ruta_test_es = os.path.join(carpeta_datos, "test", "dataset_oppositional_test_nolabels", "dataset_es_official_test_nolabels.json")

In [3]:
# Función para cargar datos de un archivo JSON
def cargar_datos(ruta):
    with open(ruta, "r", encoding="utf-8") as archivo:
        datos = json.load(archivo)
    return datos

# Cargar datos
datos_entrenamiento_en = cargar_datos(ruta_entrenamiento_en)
datos_entrenamiento_es = cargar_datos(ruta_entrenamiento_es)
datos_test_en = cargar_datos(ruta_test_en)
datos_test_es = cargar_datos(ruta_test_es)


In [4]:
# Extraer textos y etiquetas
texts_es_train = [dato["text"] for dato in datos_entrenamiento_es]
labels_es_train = [dato["category"] for dato in datos_entrenamiento_es]

texts_en_train = [dato["text"] for dato in datos_entrenamiento_en]
labels_en_train = [dato["category"] for dato in datos_entrenamiento_en]

texts_es_test = [dato["text"] for dato in datos_test_es]
texts_en_test = [dato["text"] for dato in datos_test_en]

In [5]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# Preprocesamiento para inglés
preprocessed_texts_en_train = [preprocess_text(text) for text in texts_en_train]
preprocessed_texts_en_test = [preprocess_text(text) for text in texts_en_test]

vectorizer_en = CountVectorizer(analyzer='word', max_features=4000, lowercase=True)
X_en_train = vectorizer_en.fit_transform(preprocessed_texts_en_train)
X_en_test = vectorizer_en.transform(preprocessed_texts_en_test)

X_en_train = X_en_train.toarray()
X_en_test = X_en_test.toarray()

Y_en_train = np.array(labels_en_train)

# Preprocesamiento para español
preprocessed_texts_es_train = [preprocess_text(text) for text in texts_es_train]
preprocessed_texts_es_test = [preprocess_text(text) for text in texts_es_test]

vectorizer_es = CountVectorizer(analyzer='word', max_features=4000, lowercase=True)
X_es_train = vectorizer_es.fit_transform(preprocessed_texts_es_train)
X_es_test = vectorizer_es.transform(preprocessed_texts_es_test)

X_es_train = X_es_train.toarray()
X_es_test = X_es_test.toarray()

Y_es_train = np.array(labels_es_train)

**Best model for english texts -> Stacking**

In [6]:
# Mejor modelo para textos en inglés -> Stacking
X_train_en, X_test_en, y_train_en, y_test_en = train_test_split(X_en_train, Y_en_train, test_size=0.1, random_state=1234)

svc_params_en = {'C': 0.1, 'kernel': 'linear'}
lr_params_en = {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
dt_params_en = {'criterion': 'gini', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'}

base_models_en = [('svc', SVC(**svc_params_en)), ('lr', LogisticRegression(**lr_params_en)), ('dt', DecisionTreeClassifier(**dt_params_en))]

meta_model_en = LogisticRegression()
ensemble_en = StackingClassifier(estimators=base_models_en, final_estimator=meta_model_en)


**Best model for spanish texts -> Logistic regression**

In [7]:
# Mejor modelo para textos en español -> Regresión Logística
X_train_es, X_test_es, y_train_es, y_test_es = train_test_split(X_es_train, Y_es_train, test_size=0.1, random_state=1234)

mejores_parametros_es = {'penalty': 'l2', 'C': 10, 'solver': 'lbfgs'}
clf_es = LogisticRegression(**mejores_parametros_es)

## Validation of the models

**K-fold validation**

In [8]:
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import matthews_corrcoef, f1_score
import numpy as np

def evaluate_with_kfold(model, X, y, n_folds=10):
    kf = KFold(n_splits=n_folds)
    mcc_scores = []
    f1_scores = []
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        mcc = matthews_corrcoef(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        mcc_scores.append(mcc)
        f1_scores.append(f1)
    
    avg_mcc = np.mean(mcc_scores)
    avg_f1 = np.mean(f1_scores)
    
    return avg_mcc, avg_f1

avg_mcc_en, avg_f1_en = evaluate_with_kfold(ensemble_en, X_en_train, Y_en_train)
print("MCC promedio para inglés (validación 10-fold):", avg_mcc_en)
print("F1-score promedio para inglés (validación 10-fold):", avg_f1_en)

avg_mcc_es, avg_f1_es = evaluate_with_kfold(clf_es, X_es_train, Y_es_train)
print("MCC promedio para español (validación 10-fold):", avg_mcc_es)
print("F1-score promedio para español (validación 10-fold):", avg_f1_es)

MCC promedio para inglés (validación 10-fold): 0.6875396667673901
F1-score promedio para inglés (validación 10-fold): 0.8589294100616099


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

MCC promedio para español (validación 10-fold): 0.5796499673844874
F1-score promedio para español (validación 10-fold): 0.8060910345686896


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


**Stratified K-Fold Validation**

In [9]:
from sklearn.model_selection import StratifiedKFold
import numpy as np

def evaluate_with_stratified_kfold(model, X, y, n_folds=10):
    skf = StratifiedKFold(n_splits=n_folds)
    mcc_scores = []
    f1_scores = []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        mcc = matthews_corrcoef(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        
        mcc_scores.append(mcc)
        f1_scores.append(f1)
    
    avg_mcc = np.mean(mcc_scores)
    avg_f1 = np.mean(f1_scores)
    
    return avg_mcc, avg_f1

avg_mcc_en_stratified, avg_f1_en_stratified = evaluate_with_stratified_kfold(ensemble_en, X_en_train, Y_en_train)
print("MCC promedio para inglés (validación estratificada 10-fold):", avg_mcc_en_stratified)
print("F1-score promedio para inglés (validación estratificada 10-fold):", avg_f1_en_stratified)

avg_mcc_es_stratified, avg_f1_es_stratified = evaluate_with_stratified_kfold(clf_es, X_es_train, Y_es_train)
print("MCC promedio para español (validación estratificada 10-fold):", avg_mcc_es_stratified)
print("F1-score promedio para español (validación estratificada 10-fold):", avg_f1_es_stratified)

MCC promedio para inglés (validación estratificada 10-fold): 0.6888953371936559
F1-score promedio para inglés (validación estratificada 10-fold): 0.8594829661249328


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

MCC promedio para español (validación estratificada 10-fold): 0.565969129186883
F1-score promedio para español (validación estratificada 10-fold): 0.7992309750079963


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
