In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier
import matplotlib.pyplot as plt

# Realización de prueba y modelado para la base de datos: **Arbol de decisión**


## Iniciamos extrayendo la base de datos

In [2]:
file_path = "../Bases/base_cvs/baseCVsIngVar.csv"  # Ajusta el path si es necesario
df = pd.read_csv(file_path)
df = df.drop(columns=["Unnamed: 0", "Fuente principal"], errors="ignore")  # errors="ignore" evita fallos si ya no existe


In [3]:
df

Unnamed: 0,Páginas,professional_profile,education,work_experience,skills,certifications,achievements,languages,projects,training_courses,...,tiene_perfil,tiene_educacion,tiene_certificaciones,tiene_proyectos,tiene_links_relevantes,tiene_skills,tiene_achievements,tiene_languages,tiene_projects,secciones_completas
0,2,39,28,72,98,69,0,0,13,48,...,1,1,1,1,0,1,0,0,1,7
1,6,306,369,80,0,57,9,4,182,389,...,1,1,1,1,0,0,1,1,1,8
2,1,26,56,10,99,0,0,65,0,23,...,1,1,0,0,0,1,0,1,0,6
3,1,71,14,80,28,0,0,2,0,0,...,1,1,0,0,1,1,0,1,0,6
4,4,70,79,0,4,0,0,229,0,123,...,1,1,0,0,0,1,0,1,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,6,30,50,7,0,0,18,56,395,367,...,1,1,0,1,1,0,1,1,1,8
618,10,126,329,36,24,928,130,0,409,279,...,1,1,1,1,0,1,1,0,1,8
619,3,12,15,82,23,0,228,16,34,27,...,1,1,0,1,0,1,1,1,1,8
620,1,20,22,33,19,0,0,30,0,14,...,1,1,0,0,0,1,0,1,0,6


### Información descriptiva

In [4]:
print(df.info())
print(df["Passed"].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622 entries, 0 to 621
Data columns (total 52 columns):
 #   Column                                            Non-Null Count  Dtype  
---  ------                                            --------------  -----  
 0   Páginas                                           622 non-null    int64  
 1   professional_profile                              622 non-null    int64  
 2   education                                         622 non-null    int64  
 3   work_experience                                   622 non-null    int64  
 4   skills                                            622 non-null    int64  
 5   certifications                                    622 non-null    int64  
 6   achievements                                      622 non-null    int64  
 7   languages                                         622 non-null    int64  
 8   projects                                          622 non-null    int64  
 9   training_courses     

### Separar las bases en variables X y Y

In [5]:
X = df.drop(columns=["Passed"])
y = df["Passed"]

In [6]:
X

Unnamed: 0,Páginas,professional_profile,education,work_experience,skills,certifications,achievements,languages,projects,training_courses,...,tiene_perfil,tiene_educacion,tiene_certificaciones,tiene_proyectos,tiene_links_relevantes,tiene_skills,tiene_achievements,tiene_languages,tiene_projects,secciones_completas
0,2,39,28,72,98,69,0,0,13,48,...,1,1,1,1,0,1,0,0,1,7
1,6,306,369,80,0,57,9,4,182,389,...,1,1,1,1,0,0,1,1,1,8
2,1,26,56,10,99,0,0,65,0,23,...,1,1,0,0,0,1,0,1,0,6
3,1,71,14,80,28,0,0,2,0,0,...,1,1,0,0,1,1,0,1,0,6
4,4,70,79,0,4,0,0,229,0,123,...,1,1,0,0,0,1,0,1,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
617,6,30,50,7,0,0,18,56,395,367,...,1,1,0,1,1,0,1,1,1,8
618,10,126,329,36,24,928,130,0,409,279,...,1,1,1,1,0,1,1,0,1,8
619,3,12,15,82,23,0,228,16,34,27,...,1,1,0,1,0,1,1,1,1,8
620,1,20,22,33,19,0,0,30,0,14,...,1,1,0,0,0,1,0,1,0,6


In [7]:
y

0      1
1      1
2      1
3      1
4      1
      ..
617    0
618    0
619    0
620    0
621    0
Name: Passed, Length: 622, dtype: int64

### Separar características (X) y variable objetivo (y)

### Dividimos para entrenar

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69420, stratify=y)

# Revisar la nueva estructura de los datos
print("Forma de x_train:", X_train.shape)
print("Forma de x_test:", X_test.shape)
print("Forma de y_train:", y_train.shape)
print("Forma de y:", y_test.shape)

Forma de x_train: (497, 51)
Forma de x_test: (125, 51)
Forma de y_train: (497,)
Forma de y: (125,)


## Entrenamiento del modelo

### Árbol de decisión

In [9]:
def train_decision_tree(X_train, y_train, X_test, y_test):
    model = DecisionTreeClassifier(max_depth=5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC": roc_auc_score(y_test, y_pred)
    }

resultados_ab = {
    "Arbol de decisión": train_decision_tree(X_train, y_train, X_test, y_test),
}

In [10]:
resultados_ab

{'Arbol de decisión': {'Accuracy': 0.504,
  'Precision': 0.3902439024390244,
  'Recall': 0.3018867924528302,
  'F1-Score': 0.3404255319148936,
  'ROC': np.float64(0.477332285115304)}}

#### Ajuste hiperparametros arbol

In [11]:
tree = DecisionTreeClassifier(class_weight="balanced", random_state=42)

param_grid = {
    "max_depth": [3, 5, 10, None],
    "min_samples_leaf": [1, 5, 10],
    "max_features": ["sqrt", "log2", None]
}

grid_search = GridSearchCV(tree, param_grid, cv=5, scoring="f1", n_jobs=-1)
grid_search.fit(X_train, y_train)

best_tree = grid_search.best_estimator_
print("Mejores hiperparámetros:", grid_search.best_params_)

Mejores hiperparámetros: {'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 10}


In [12]:
def train_decision_tree(X_train, y_train, X_test, y_test):
    model = DecisionTreeClassifier(max_depth=3, max_features='log2', min_samples_leaf=10)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC": roc_auc_score(y_test, y_pred)
    }

resultados_ab = {
    "Arbol de decisión": train_decision_tree(X_train, y_train, X_test, y_test),
}

In [13]:
resultados_ab

{'Arbol de decisión': {'Accuracy': 0.552,
  'Precision': 0.36363636363636365,
  'Recall': 0.07547169811320754,
  'F1-Score': 0.125,
  'ROC': np.float64(0.48912473794549266)}}

### Maquina de soporte vectorial

In [14]:
def train_svm(X_train, y_train, X_test, y_test):
    model = SVC(kernel="linear")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "ROC": roc_auc_score(y_test, y_pred)
    }

resultados_msv = {
    "Maquina de soporte vectorial": train_svm(X_train, y_train, X_test, y_test),
}

In [15]:
resultados_msv

{'Maquina de soporte vectorial': {'Accuracy': 0.496,
  'Precision': 0.3684210526315789,
  'Recall': 0.2641509433962264,
  'F1-Score': 0.3076923076923077,
  'ROC': np.float64(0.46540880503144655)}}

### Random forest

In [16]:
def train_random_forest(X_train, y_train, X_test, y_test):
    model = RandomForestClassifier(n_estimators=50, max_depth=5)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    }

resultados_rf = {
    "Random Forest": train_logistic_regression(X_train_scaled, y_train, X_test_scaled, y_test),
}

NameError: name 'train_logistic_regression' is not defined

In [91]:
resultados_rf

{'Random Forest': {'Accuracy': 0.6,
  'Precision': 0.5517241379310345,
  'Recall': 0.3018867924528302,
  'F1-Score': 0.3902439024390244,
  'ROC': np.float64(0.5606656184486373)}}

### XGBoost

In [92]:
# Función para entrenar y evaluar XGBoost
def train_xgboost(X_train, y_train, X_test, y_test):
    model = XGBClassifier(n_estimators=50, max_depth=3, use_label_encoder=False, eval_metric="logloss")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred)
    }

resultados_xgb = {
    "Random Forest": train_logistic_regression(X_train_scaled, y_train, X_test_scaled, y_test),
}

In [93]:
resultados_xgb

{'Random Forest': {'Accuracy': 0.6,
  'Precision': 0.5517241379310345,
  'Recall': 0.3018867924528302,
  'F1-Score': 0.3902439024390244,
  'ROC': np.float64(0.5606656184486373)}}