In [1]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [2]:
# Paths of the JSON files holding the erroneous and the correct records.
errorPath = "./data/error_data.json"
correctPath = "./data/correct_data.json"

# Read both files as UTF-8 explicitly: JSON text is UTF-8 by specification,
# and the records contain non-ASCII characters (e.g. "ñ") that would be
# mis-decoded if the platform's default encoding were used instead.
with open(errorPath, "r", encoding="utf-8") as file:
    errorData = json.load(file)

with open(correctPath, "r", encoding="utf-8") as file:
    correctData = json.load(file)


In [3]:
def convert_to_new_format(input_data, value):
    """Wrap a raw record together with its label.

    Args:
        input_data: The record to wrap; stored as-is (not copied) under
            the ``'body'`` key.
        value: The label to attach; stored under the ``'response'`` key.

    Returns:
        A new dict with the keys ``'response'`` and ``'body'``, in that order.
    """
    return dict(response=value, body=input_data)

In [4]:
# Label every correct record with True and every erroneous record with False,
# then merge both groups into one list (correct records first).
data_correct = [convert_to_new_format(record, True) for record in correctData]
data_false = [convert_to_new_format(record, False) for record in errorData]
data = [*data_correct, *data_false]

In [5]:
# Separate the feature records from their labels.
X = [record["body"] for record in data]
y = [record["response"] for record in data]

# Hold out 20% of the records; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the vectorizer on the training split only, then reuse the fitted
# vectorizer to encode the held-out split.
vectorizer = DictVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a logistic-regression classifier on the vectorized training data.
model = LogisticRegression().fit(X_train_vec, y_train)

# Keep the feature names and the learned coefficients for inspection.
nombres_caracteristicas = vectorizer.get_feature_names_out()
coeficientes = model.coef_


In [6]:
# Pair each feature name with its coefficient (first row of `coef_`) and
# order the table by ascending coefficient.
df_coeficientes = (
    pd.DataFrame(
        zip(vectorizer.get_feature_names_out(), model.coef_[0]),
        columns=['Característica', 'Coeficiente'],
    )
    .sort_values(by='Coeficiente')
)
df_coeficientes

Unnamed: 0,Característica,Coeficiente
4,company=,-0.469437
29,name=Robertñ,-0.417236
27,name=Pauño,-0.401127
18,lastName=Choque,-0.401127
16,email=p@12.com,-0.401127
25,name=Cristiano,-0.309627
22,lastName=Ronaldo,-0.309627
3,birthday=2023-09-06,-0.115248
9,country=Bolivia,-0.115248
12,country=Portugal,-0.052169
