# Modulo - NPL

In [None]:
# Módulo de Procesamiento de NLP separado en pasos
%pip install spacy
import spacy

In [3]:
# Paso 1: Carga y preprocesamiento de texto
class TextPreprocessor:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')

    def preprocess_text(self, text):
        """Limpieza y normalización del texto."""
        doc = self.nlp(text)
        return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

In [4]:
# Paso 2: Extracción de términos clave
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

class KeyTermExtractor:
    def __init__(self):
        self.vectorizer = None

    def extract_key_terms(self, documents, max_features=100):
        """Extraer términos clave usando TF-IDF."""
        self.vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
        tfidf_matrix = self.vectorizer.fit_transform(documents)
        terms = self.vectorizer.get_feature_names_out()
        scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
        term_scores = list(zip(terms, scores))
        return sorted(term_scores, key=lambda x: x[1], reverse=True)



In [5]:
# Paso 3: Análisis de co-ocurrencias
class CooccurrenceAnalyzer:
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def analyze_cooccurrences(self, documents, threshold=0.1):
        """Analizar co-ocurrencias de términos clave."""
        doc_terms = [set(self.preprocessor.preprocess_text(doc).split()) for doc in documents]
        cooccurrence_matrix = {}

        for i, terms1 in enumerate(doc_terms):
            for j, terms2 in enumerate(doc_terms):
                if i != j:
                    common_terms = terms1.intersection(terms2)
                    for term in common_terms:
                        cooccurrence_matrix[term] = cooccurrence_matrix.get(term, 0) + 1

        return {k: v for k, v in cooccurrence_matrix.items() if v > threshold}



In [6]:
# Paso 4: Actualización de la ontología
class OntologyUpdater:
    def update_ontology(self, extracted_terms, ontology_terms):
        """Actualizar la ontología con nuevos términos."""
        new_terms = set(term for term, score in extracted_terms) - set(ontology_terms)
        updated_ontology = list(set(ontology_terms).union(new_terms))
        return updated_ontology, new_terms



In [17]:
# Paso 5: Cálculo de métricas
class OntologyMetrics:
    def calculate_semantic_precision(self, ontology_terms, extracted_terms):
        """Calcular la precisión semántica."""
        extracted_set = set(term for term, score in extracted_terms)
        correct_terms = extracted_set.intersection(set(ontology_terms))
        precision = len(correct_terms) / len(extracted_set) if extracted_set else 0
        return precision

    def calculate_coverage(self, ontology_terms, extracted_terms):
        """Calcular la cobertura ontológica."""
        extracted_set = set(term for term, score in extracted_terms)
        coverage = len(extracted_set.intersection(set(ontology_terms))) / len(ontology_terms) if ontology_terms else 0
        return coverage

    def calculate_consistency(self, ontology_terms):
        """Calcular la consistencia ontológica."""
        return 1.0  # Placeholder, dependería de reglas de consistencia específicas

In [13]:
import pandas as pd

def main():
    # Descargar el modelo de spaCy
    #!python -m spacy download en_core_web_sm

    # Datos de prueba
    sample_texts = [
        "The customer applied for a car loan with an interest rate of 5.2%.",
        "Home loans typically have lower interest rates than personal loans.",
        "Education loans require verification of enrollment in a recognized institution.",
    ]

    # Ontología inicial
    initial_ontology = ["car", "loan", "interest rate", "home", "education"]

    # Instanciar los componentes
    preprocessor = TextPreprocessor()
    extractor = KeyTermExtractor()
    analyzer = CooccurrenceAnalyzer(preprocessor)
    updater = OntologyUpdater()

    # Preprocesar textos
    preprocessed_texts = [preprocessor.preprocess_text(text) for text in sample_texts]

    # Extraer términos clave
    key_terms = extractor.extract_key_terms(preprocessed_texts)
    print("Términos clave:", key_terms)

    # Analizar co-ocurrencias
    cooccurrences = analyzer.analyze_cooccurrences(sample_texts)
    print("Co-ocurrencias:", cooccurrences)

    # Actualizar ontología
    updated_ontology, new_terms = updater.update_ontology(key_terms, initial_ontology)
    print("Ontología actualizada:", updated_ontology)
    print("Nuevos términos agregados:", new_terms)

    # Crear DataFrame para mostrar los resultados
    key_terms_df = pd.DataFrame(key_terms, columns=['Term', 'Score'])
    cooccurrences_df = pd.DataFrame(list(cooccurrences.items()), columns=['Term', 'Cooccurrence'])

    # Mostrar los resultados en tablas
    print("\nTérminos clave en tabla:")
    display(key_terms_df)

    print("\nCo-ocurrencias en tabla:")
    display(cooccurrences_df)

    print("\nOntología actualizada en tabla:")
    updated_ontology_df = pd.DataFrame(updated_ontology, columns=['Term'])
    display(updated_ontology_df)

if __name__ == '__main__':
    main()

Términos clave: [('loan', 1.0157281407543406), ('rate', 0.6949357650865606), ('apply', 0.5046113401371842), ('car', 0.5046113401371842), ('customer', 0.5046113401371842), ('home', 0.40914567838389126), ('low', 0.40914567838389126), ('personal', 0.40914567838389126), ('typically', 0.40914567838389126), ('education', 0.3968745408286403), ('enrollment', 0.3968745408286403), ('institution', 0.3968745408286403), ('recognize', 0.3968745408286403), ('require', 0.3968745408286403), ('verification', 0.3968745408286403)]
Co-ocurrencias: {'rate': 2, 'interest': 2, 'loan': 6}
Ontología actualizada: ['rate', 'personal', 'customer', 'education', 'institution', 'home', 'enrollment', 'typically', 'recognize', 'low', 'verification', 'apply', 'require', 'interest rate', 'loan', 'car']
Nuevos términos agregados: {'typically', 'recognize', 'rate', 'personal', 'customer', 'low', 'institution', 'verification', 'apply', 'require', 'enrollment'}

Términos clave en tabla:


Unnamed: 0,Term,Score
0,loan,1.015728
1,rate,0.694936
2,apply,0.504611
3,car,0.504611
4,customer,0.504611
5,home,0.409146
6,low,0.409146
7,personal,0.409146
8,typically,0.409146
9,education,0.396875



Co-ocurrencias en tabla:


Unnamed: 0,Term,Cooccurrence
0,rate,2
1,interest,2
2,loan,6



Ontología actualizada en tabla:


Unnamed: 0,Term
0,rate
1,personal
2,customer
3,education
4,institution
5,home
6,enrollment
7,typically
8,recognize
9,low


In [16]:
# Extraer textos del dataset para probar el módulo de NLP
file_path = "D:/datasets/loan_dataset.csv"
loan_data = pd.read_csv(file_path)

sample_texts_from_data = loan_data['loan_purpose'].unique()  # Tomar los propósitos de préstamo como ejemplos de texto

# Ontología inicial simulada
initial_ontology = ['loan', 'interest', 'credit', 'car', 'education', 'home']

# Instanciar los componentes del módulo de NLP
preprocessor = TextPreprocessor()
extractor = KeyTermExtractor()
analyzer = CooccurrenceAnalyzer(preprocessor)
updater = OntologyUpdater()

# Preprocesar los textos
preprocessed_texts = [preprocessor.preprocess_text(text) for text in sample_texts_from_data]

# Extraer términos clave
key_terms = extractor.extract_key_terms(preprocessed_texts)

# Analizar co-ocurrencias
cooccurrences = analyzer.analyze_cooccurrences(sample_texts_from_data)

# Actualizar la ontología
updated_ontology, new_terms = updater.update_ontology(key_terms, initial_ontology)

key_terms, cooccurrences, updated_ontology, new_terms


([('business', 1.0), ('car', 1.0), ('education', 1.0), ('home', 1.0)],
 {},
 ['business', 'home', 'credit', 'loan', 'education', 'interest', 'car'],
 {'business'})

In [18]:
# Redefinir las clases del módulo para ajustarlas al análisis actual
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

class TextPreprocessor:
    def __init__(self):
        self.nlp = spacy.load('en_core_web_sm')

    def preprocess_text(self, text):
        """Limpieza y normalización del texto."""
        doc = self.nlp(text)
        return ' '.join([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])

class KeyTermExtractor:
    def __init__(self):
        self.vectorizer = None

    def extract_key_terms(self, documents, max_features=100):
        """Extraer términos clave usando TF-IDF."""
        self.vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
        tfidf_matrix = self.vectorizer.fit_transform(documents)
        terms = self.vectorizer.get_feature_names_out()
        scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
        term_scores = list(zip(terms, scores))
        return sorted(term_scores, key=lambda x: x[1], reverse=True)

class CooccurrenceAnalyzer:
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def analyze_cooccurrences(self, documents, threshold=0.1):
        """Analizar co-ocurrencias de términos clave."""
        doc_terms = [set(self.preprocessor.preprocess_text(doc).split()) for doc in documents]
        cooccurrence_matrix = {}

        for i, terms1 in enumerate(doc_terms):
            for j, terms2 in enumerate(doc_terms):
                if i != j:
                    common_terms = terms1.intersection(terms2)
                    for term in common_terms:
                        cooccurrence_matrix[term] = cooccurrence_matrix.get(term, 0) + 1

        return {k: v for k, v in cooccurrence_matrix.items() if v > threshold}

class OntologyUpdater:
    def update_ontology(self, extracted_terms, ontology_terms):
        """Actualizar la ontología con nuevos términos."""
        new_terms = set(term for term, score in extracted_terms) - set(ontology_terms)
        updated_ontology = list(set(ontology_terms).union(new_terms))
        return updated_ontology, new_terms

class OntologyMetrics:
    def calculate_semantic_precision(self, ontology_terms, extracted_terms):
        """Calcular la precisión semántica."""
        extracted_set = set(term for term, score in extracted_terms)
        correct_terms = extracted_set.intersection(set(ontology_terms))
        precision = len(correct_terms) / len(extracted_set) if extracted_set else 0
        return precision

    def calculate_coverage(self, ontology_terms, extracted_terms):
        """Calcular la cobertura ontológica."""
        extracted_set = set(term for term, score in extracted_terms)
        coverage = len(extracted_set.intersection(set(ontology_terms))) / len(ontology_terms) if ontology_terms else 0
        return coverage

    def calculate_consistency(self, ontology_terms):
        """Calcular la consistencia ontológica."""
        return 1.0  # Placeholder para reglas específicas



In [19]:
# Probar el módulo con el dataset cargado
preprocessor = TextPreprocessor()
extractor = KeyTermExtractor()
analyzer = CooccurrenceAnalyzer(preprocessor)
updater = OntologyUpdater()
metrics = OntologyMetrics()

# Preprocesar textos del dataset
preprocessed_texts = [preprocessor.preprocess_text(text) for text in sample_texts_from_data]

# Extraer términos clave
key_terms = extractor.extract_key_terms(preprocessed_texts)

# Analizar co-ocurrencias
cooccurrences = analyzer.analyze_cooccurrences(sample_texts_from_data)

# Actualizar ontología
updated_ontology, new_terms = updater.update_ontology(key_terms, initial_ontology)

# Calcular métricas
precision = metrics.calculate_semantic_precision(initial_ontology, key_terms)
coverage = metrics.calculate_coverage(initial_ontology, key_terms)
consistency = metrics.calculate_consistency(initial_ontology)

# Mostrar resultados
key_terms[:10], cooccurrences, updated_ontology, new_terms, precision, coverage, consistency


([('business', 1.0), ('car', 1.0), ('education', 1.0), ('home', 1.0)],
 {},
 ['business', 'home', 'credit', 'loan', 'education', 'interest', 'car'],
 {'business'},
 0.75,
 0.5,
 1.0)