In [1]:
import re
import razdel
import pandas as pd
from sklearn.model_selection import train_test_split
import torch  # Importar torchfrom torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from isanlp.pipeline_common import PipelineCommon
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import AutoModelForCausalLM, AutoTokenizer, BertForSequenceClassification, pipeline
from tqdm import tqdm
import gc
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from razdel import sentenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from transformers import logging
from transformers import AutoTokenizer
logging.set_verbosity_error()

import matplotlib.pyplot as plt
import numpy as np


nltk.download('punkt_tab')
nltk.download('punkt')


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
class TrainingConfig:
    def __init__(self,
                 model_name,
                 max_length=128,
                 batch_size=64,
                 epochs=3,
                 learning_rate=2e-5,
                 num_repeats=6,
                 test_size=0.2,
                 threshold=0.5,
                 model_type='bert'):
        """
        Configuración para el entrenamiento del modelo.
        
        Args:
            model_name (str): Nombre del modelo preentrenado
            max_length (int): Longitud máxima de las secuencias
            batch_size (int): Tamaño del batch
            epochs (int): Número de épocas
            learning_rate (float): Tasa de aprendizaje
            num_repeats (int): Número de repeticiones con diferentes seeds
            test_size (float): Proporción del conjunto de prueba
            threshold (float): Umbral de similitud
        """
        self.model_name = model_name
        self.max_length = max_length
        self.batch_size = batch_size
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.num_repeats = num_repeats
        self.test_size = test_size
        self.threshold = threshold
        self.model_type = model_type
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        #self.tokenizer = BertTokenizer.from_pretrained(model_name)#creo que hay que usar el tokenizer de ruso
        self.tokenizer = AutoTokenizer.from_pretrained(model_name) 
        if self.model_type == 'gpt':
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.tokenizer.padding_side = 'left'

In [3]:
def cargar_datos(archivo, etiqueta):
    with open(archivo, 'r', encoding='utf-8') as f:
        textos = f.readlines()
    return [(texto.strip(), etiqueta) for texto in textos]

In [4]:
def tokenizar_fuction(texto):
    """Tokeniza el texto eliminando puntuación y convirtiendo a minúsculas."""
    texto_preprocesado = re.sub(r'[^\w\s]', '', texto.lower())
    tokens = nltk.word_tokenize(texto_preprocesado)
    return tokens

In [5]:
def calcular_similitud_mayoria_optimizado(textos1, textos2, threshold):
    """
        Calcula similitudes entre dos listas de textos usando GPU.
        Args:
            textos1: Lista de textos (original_datos_procesados).
            textos2: Lista de textos (train_data).
        Returns:
            Matriz booleana [len(textos1), len(textos2)] indicando similitudes.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    #print(f"Usando dispositivo: {device}")

    # Preprocesar todos los textos en batch
    textos1_preprocesados = [' '.join(tokenizar_fuction(t)) for t in textos1]
    textos2_preprocesados = [' '.join(tokenizar_fuction(t)) for t in textos2]

    # Crear representación binaria de tokens
    vectorizer = CountVectorizer(binary=True, tokenizer=nltk.word_tokenize)
    vectorizer.fit(textos1_preprocesados + textos2_preprocesados)

    # Convertir a matrices de tokens
    matriz_tokens1 = vectorizer.transform(textos1_preprocesados).toarray()
    matriz_tokens2 = vectorizer.transform(textos2_preprocesados).toarray()

    # Mover a GPU
    matriz_tokens1 = torch.from_numpy(matriz_tokens1).to(device)
    matriz_tokens2 = torch.from_numpy(matriz_tokens2).to(device)

     # Calcular intersecciones
    interseccion = torch.matmul(matriz_tokens1.float(), matriz_tokens2.T.float())

    # Tamaños de los conjuntos de tokens
    tamano_tokens1 = matriz_tokens1.sum(dim=1).unsqueeze(1)
    tamano_tokens2 = matriz_tokens2.sum(dim=1).unsqueeze(0)

    # Evitar división por cero
    tamano_tokens1 = tamano_tokens1.clamp(min=1)
    tamano_tokens2 = tamano_tokens2.clamp(min=1)

     # Calcular proporciones
    prop_1_en_2 = interseccion / tamano_tokens2
    prop_2_en_1 = interseccion / tamano_tokens1

    similitud = (prop_1_en_2 > threshold) | (prop_2_en_1 > threshold)





    return similitud

In [6]:
class DataLoader_raw:
    def __init__(self, path1, path2):
        self.path1 = path1
        self.path2 = path2
        self.path_original1 = '../dataset/Первый_жанр_исходная.txt'
        self.path_original2 = '../dataset/Второй_жанр_исходная.txt'    
    
    def __call__(self):
        datos_genero1 = cargar_datos(self.path1, 0)
        datos_genero2 = cargar_datos(self.path2, 1)
        datos = datos_genero1 + datos_genero2
        
        datos_original1 = cargar_datos(self.path_original1, 0)
        datos_original2 = cargar_datos(self.path_original2, 1)
        datos_originales = datos_original1 + datos_original2
        
        return {
            'datos_raw': datos,
            'original_datos_raw': datos_originales
        }

In [7]:
class SentenceSplitterAndCleaner:
    def __init__(self, tokenizer, min_length_threshold=6):
        """Inicializa el procesador con un tokenizador y un umbral de longitud mínima."""
        self.tokenizer = tokenizer
        self.min_length_threshold = min_length_threshold
    
    def _clean_and_split_text(self, texto):
        """
        Limpia y divide un texto en oraciones procesadas.
        
        Args:
            texto (str): Texto crudo a procesar.
        
        Returns:
            list: Lista de oraciones limpias.
        """
        # Paso 1: Marcar puntos para evitar fusiones no deseadas
        texto_limpio = re.sub(r'\.,', '. Ok999999999 ,', texto)  # Caso 1: Punto seguido de coma
        texto_limpio = re.sub(r'\.;', '. Ok999999999 ', texto_limpio)  # Caso 2: Punto seguido de punto y coma
        texto_limpio = re.sub(r'\. ([a-zа-я])', r'. Ok999999999 \1', texto_limpio)  # Caso 3: Punto seguido de minúscula
        texto_limpio = re.sub(r'(\w)([А-Я])', r'\1. \2', texto_limpio)  # Caso 4: Minúscula seguida de mayúscula
        
        #creo que hay que agregar una nueva condivion  дл.,0 
        #", 2-3 см шир.",0 
        #", 1 см шир.",0 si hay un punto uego numeros quitar el punto
        
        # Paso 2: Eliminar corchetes y dividir en oraciones
        texto_sin_corchetes = re.sub(r'\[.*?\]', '', texto_limpio).strip()
        oraciones = [oracion.text for oracion in sentenize(texto_sin_corchetes)]
        
        # Paso 3: Restaurar espacios y limpiar marcadores temporales
        oraciones_limpias = [re.sub(r'\s*Ok999999999', ' ', oracion).strip() for oracion in oraciones]
        
        return oraciones_limpias
    
    def _process_sentences(self, datos, target_list):
        """
        Procesa un conjunto de datos y agrega oraciones válidas a la lista objetivo.
        
        Args:
            datos (list): Lista de tuplas (texto, etiqueta).
            target_list (list): Lista donde se almacenarán las oraciones procesadas.
        """
        for texto, etiqueta in datos:
            oraciones = self._clean_and_split_text(texto)
            for oracion in oraciones:
                # Filtrar oraciones cortas basadas en el número de tokens
                if len(self.tokenizer.encode(oracion, truncation=False)) >= self.min_length_threshold:
                    target_list.append((oracion, etiqueta))
    
    def __call__(self, data_raw, original_data_raw):
        """
        Procesa los datos crudos y originales, devolviendo dos conjuntos limpios.
        
        Args:
            data_raw (list): Datos modificados crudos.
            original_data_raw (list): Datos originales crudos.
        
        Returns:
            dict: Diccionario con datos procesados y originales procesados.
        """
        datos_procesados = []
        original_datos_procesados = []
        
        # Procesar datos modificados
        self._process_sentences(data_raw, datos_procesados)
        
        # Procesar datos originales
        self._process_sentences(original_data_raw, original_datos_procesados)
        
        return {
            'datos_procesados': datos_procesados,
            'original_datos_procesados': original_datos_procesados
        }

In [8]:
class DataProcessor:
    def __init__(self,
                 tokenizer,
                 max_length,
                 random_state,
                 test_size=0.2,
                 name='data_set_name',
                 threshold=0.5,
                 model_type='bert'):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.test_size = test_size
        self.random_state = random_state
        self.name = name
        self.threshold = threshold
        self.model_type = model_type
        
    def __call__(self, datos_procesados, original_datos_procesados):
        df = pd.DataFrame(datos_procesados, columns=["text", "label"])
        train_data, test_data = train_test_split(df,
                                                 test_size=self.test_size,
                                                 random_state=self.random_state)
        
        if self.model_type == 'gpt':
            train_texts = train_data["text"].tolist()
            train_labels = train_data["label"].tolist()
            
            original_texts = [oracion for oracion, _ in original_datos_procesados]
            original_labels = [etiqueta for _, etiqueta in original_datos_procesados]
            
            # Calcular similitudes para filtrar el conjunto de prueba
            similitudes = calcular_similitud_mayoria_optimizado(original_texts, train_texts, self.threshold)
            interseccion = similitudes.any(dim=1)
            mask = ~interseccion.cpu().numpy()
            nuevo_test = [(texto, etiqueta) for texto, etiqueta, keep in zip(original_texts, original_labels, mask) if keep]
            
            # Tokenizar entrenamiento
            train_encodings = self.tokenizer(
                train_texts,
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            train_labels = torch.tensor(train_labels)
            
            # Tokenizar prueba
            test_encodings = self.tokenizer(
                [texto for texto, _ in nuevo_test],
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            test_labels = torch.tensor([etiqueta for _, etiqueta in nuevo_test])
            
            # Guardar datos si random_state == 0
            if self.random_state == 0:
                train_data.to_csv(f'сокращение по частотности/train_{self.name}.csv', index=False)
                print(f"train_{self.name}.csv")
                nuevo_test_df = pd.DataFrame(nuevo_test, columns=["text", "label"])
                nuevo_test_df.to_csv(f'сокращение по частотности/test_{self.name}.csv', index=False)
                print(f"test_{self.name}.csv")
            
        else:
            train_texts = train_data["text"].tolist()
            original_texts = [oracion for oracion, _ in original_datos_procesados]
            original_labels = [etiqueta for _, etiqueta in original_datos_procesados]

            similitudes = calcular_similitud_mayoria_optimizado(original_texts,
                                                                train_texts,
                                                                self.threshold)
            interseccion = similitudes.any(dim=1)
            mask = ~interseccion.cpu().numpy()
            nuevo_test = [(texto, etiqueta) for texto, etiqueta, keep in zip(original_texts, original_labels, mask) if keep]

            if self.random_state == 0:
                train_data.to_csv(f'сокращение по частотности/train_{self.name}.csv', index=False)
                print(f"train_{self.name}.csv")
                nuevo_test_df = pd.DataFrame(nuevo_test, columns=["text", "label"])
                nuevo_test_df.to_csv(f'сокращение по частотности/test_{self.name}.csv', index=False)
                print(f"test_{self.name}.csv")

            # Tokenizar entrenamiento
            train_encodings = self.tokenizer(
                train_data["text"].tolist(),
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            train_labels = torch.tensor(train_data["label"].values)
            test_encodings = self.tokenizer(
                [texto for texto, _ in nuevo_test],
                padding="max_length",
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            )
            test_labels = torch.tensor([etiqueta for _, etiqueta in nuevo_test])
        
        # Depuración: Verificar forma y contenido de las etiquetas
        print(f"Train labels sample: {train_labels[:5]}, Shape: {train_labels.shape}")
        print(f"Test labels sample: {test_labels[:5]}, Shape: {test_labels.shape}")
        
        return {
            'train_encodings': train_encodings,
            'train_labels': train_labels,
            'test_encodings': test_encodings,
            'test_labels': test_labels
        }

In [9]:
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [10]:
class DatasetCreator:
    def __init__(self, batch_size=8):
        self.batch_size = batch_size
    
    def __call__(self, train_encodings, train_labels, test_encodings, test_labels):
        train_dataset = TextDataset(train_encodings, train_labels)
        test_dataset = TextDataset(test_encodings, test_labels)
        train_loader = DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=self.batch_size, shuffle=False)
        return {'train_loader': train_loader, 'test_loader': test_loader}

In [11]:
class ZeroShotConfig:
    def __init__(self,
                 model_name,
                 candidate_labels=["жанр0", "жанр1"],
                 hypothesis_template="Este texto es sobre {}."):
        self.model_name = model_name
        self.candidate_labels = candidate_labels
        self.hypothesis_template = hypothesis_template
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [12]:
def zero_shot_evaluate(config, test_loader, tokenizer):
    # Cargar el pipeline de zero-shot
    classifier = pipeline(
        "zero-shot-classification",
        model=config.model_name,
        device=config.device
    )
    
    all_true_labels = []
    all_preds = []
    all_probs = []
    
    # Procesar el test_loader
    for batch in test_loader:
        texts = [tokenizer.decode(ids, skip_special_tokens=True) 
                for ids in batch['input_ids']]
        true_labels = batch['labels'].cpu().numpy()
        
        # Clasificación zero-shot
        results = classifier(
            texts,
            candidate_labels=config.candidate_labels,
            hypothesis_template=config.hypothesis_template
        )
        
        # Procesar resultados
        for result, true_label in zip(results, true_labels):
            pred_label = config.candidate_labels.index(result['labels'][0])
            prob = result['scores'][0]
            
            all_true_labels.append(true_label)
            all_preds.append(pred_label)
            all_probs.append(prob)
    
    # Convertir a arrays numpy
    all_true_labels = np.array(all_true_labels)
    all_preds = np.array(all_preds)
    all_probs = np.array(all_probs)
    
    # Calcular métricas (igual que en tu evaluate_model)
    accuracy = np.mean(all_preds == all_true_labels)
    report = classification_report(all_true_labels, all_preds, 
                                  target_names=config.candidate_labels, output_dict=True)
    conf_matrix = confusion_matrix(all_true_labels, all_preds)
    roc_auc = roc_auc_score(all_true_labels, all_probs)
    pr_auc = average_precision_score(all_true_labels, all_probs)
    log_loss_val = log_loss(all_true_labels, all_probs)
    
    return {
        "accuracy": accuracy,
        "f1_weighted": report['weighted avg']['f1-score'],
        "f1_macro": report['macro avg']['f1-score'],
        "f1_class0": report[config.candidate_labels[0]]['f1-score'],
        "f1_class1": report[config.candidate_labels[1]]['f1-score'],
        "roc_auc": roc_auc,
        "pr_auc": pr_auc,
        "log_loss": log_loss_val,
        "report": report,
        "conf_matrix": conf_matrix,
        "predictions": all_preds.tolist(),
        "true_labels": all_true_labels.tolist(),
        "probs": all_probs.tolist()
    }

In [13]:
def train_and_evaluate_zero_shot(path1, path2, config, dataset_name, dataset_type):
    seeds = list(range(config.num_repeats))
    metrics = {
        'accuracies': [],
        'f1_weighteds': [],
        'f1_macros': [],
        'f1_class0s': [],
        'f1_class1s': [],
        'roc_aucs': [],
        'pr_aucs': [],
        'log_losses': [],
        'confusion_matrices': []
    }
    
    for seed in seeds:
        # Pipeline común (igual que antes para preparar los datos)
        ppl = PipelineCommon([
            (DataLoader_raw(path1, path2), [], {'datos_raw': 'datos_raw', 'original_datos_raw': 'original_datos_raw'}),
            (SentenceSplitterAndCleaner(config.tokenizer), ['datos_raw', 'original_datos_raw'], 
             {'datos_procesados': 'datos_procesados', 'original_datos_procesados': 'original_datos_procesados'}),
            (DataProcessor(config.tokenizer, config.max_length, seed, name=dataset_name, 
                           threshold=config.threshold, model_type=config.model_type), 
             ['datos_procesados', 'original_datos_procesados'], 
             {'train_encodings': 'train_encodings', 'train_labels': 'train_labels',
              'test_encodings': 'test_encodings', 'test_labels': 'test_labels'}),
            (DatasetCreator(batch_size=config.batch_size), 
             ['train_encodings', 'train_labels', 'test_encodings', 'test_labels'], 
             {'train_loader': 'train_loader', 'test_loader': 'test_loader'})
        ])
        
        result = ppl()
        test_loader = result['test_loader']
        
        # Configuración Zero-Shot
        zs_config = ZeroShotConfig(
            model_name=config.model_name,
            candidate_labels=["жанр0", "жанр1"],
            hypothesis_template="Este fragmento literario pertenece al género {}."
        )
        
        # Evaluación Zero-Shot
        results = zero_shot_evaluate(zs_config, test_loader, config.tokenizer)
        
        # Acumular métricas
        metrics['accuracies'].append(results['accuracy'])
        metrics['f1_weighteds'].append(results['f1_weighted'])
        metrics['f1_macros'].append(results['f1_macro'])
        metrics['f1_class0s'].append(results['f1_class0'])
        metrics['f1_class1s'].append(results['f1_class1'])
        metrics['roc_aucs'].append(results['roc_auc'])
        metrics['pr_aucs'].append(results['pr_auc'])
        metrics['log_losses'].append(results['log_loss'])
        metrics['confusion_matrices'].append(results['conf_matrix'])
    
    # Calcular promedios
    avg_conf_matrix = np.mean(metrics['confusion_matrices'], axis=0)
    
    return {
        'dataset_name': dataset_name,
        'dataset_type': dataset_type,
        'model_name': config.model_name,
        'avg_accuracy': float(np.mean(metrics['accuracies'])),
        'std_accuracy': float(np.std(metrics['accuracies'])),
        'avg_f1_weighted': float(np.mean(metrics['f1_weighteds'])),
        'std_f1_weighted': float(np.std(metrics['f1_weighteds'])),
        'avg_f1_macro': float(np.mean(metrics['f1_macros'])),
        'std_f1_macro': float(np.std(metrics['f1_macros'])),
        'avg_f1_class0': float(np.mean(metrics['f1_class0s'])),
        'std_f1_class0': float(np.std(metrics['f1_class0s'])),
        'avg_f1_class1': float(np.mean(metrics['f1_class1s'])),
        'std_f1_class1': float(np.std(metrics['f1_class1s'])),
        'avg_roc_auc': float(np.mean(metrics['roc_aucs'])),
        'std_roc_auc': float(np.std(metrics['roc_aucs'])),
        'avg_pr_auc': float(np.mean(metrics['pr_aucs'])),
        'std_pr_auc': float(np.std(metrics['pr_aucs'])),
        'avg_log_loss': float(np.mean(metrics['log_losses'])),
        'std_log_loss': float(np.std(metrics['log_losses'])),
        'avg_confusion_matrix': avg_conf_matrix.tolist(),
        'confusion_matrices': [cm.tolist() for cm in metrics['confusion_matrices']],
        'type': 'zero-shot',
        'true_labels': results['true_labels'],
        'predictions': results['predictions']
    }

In [14]:
zero_shot_models = [
    {'model': 'facebook/bart-large-mnli', 'name': 'BART-MNLI'},
    {'model': 'MoritzLaurer/mDeBERTa-v3-base-mnli-xnli', 'name': 'mDeBERTa-XNLI'},
    {'model': 'joeddav/xlm-roberta-large-xnli', 'name': 'XLM-RoBERTa-XNLI'}
]

In [15]:
def main():
    results = []
    
    # Modo Zero-Shot
    for model in zero_shot_models:
        for dataset in datasets:
            config = create_custom_config(model['model'], 'zero-shot', dataset)
            result = train_and_evaluate_zero_shot(
                dataset['path1'],
                dataset['path2'],
                config,
                dataset['name'],
                dataset['type']
            )
            results.append(result)

    
    save_results(results, 'model_results_complete.json')

In [16]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def plot_combined_metrics(results, save_path=None):
    # Convertir resultados a DataFrame
    df = pd.DataFrame(results)
    
    # Configurar estilo
    sns.set(style="whitegrid")
    plt.figure(figsize=(15, 10))
    
    # Gráfico de accuracy por modelo y tipo de dataset
    plt.subplot(2, 2, 1)
    sns.barplot(data=df, x='model_name', y='avg_accuracy', hue='dataset_type', 
                palette='viridis', errorbar='sd')
    plt.title('Accuracy por Modelo y Tipo de Dataset')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Accuracy')
    plt.xlabel('Modelo')
    plt.legend(title='Tipo de Dataset')
    
    # Gráfico de F1-macro por tipo de modelo
    plt.subplot(2, 2, 2)
    sns.boxplot(data=df, x='type', y='avg_f1_macro', palette='Set2')
    plt.title('Distribución de F1-Macro por Tipo de Modelo')
    plt.ylabel('F1 Macro')
    plt.xlabel('Tipo de Modelo')
    
    # Gráfico de ROC-AUC comparando enfoques
    plt.subplot(2, 2, 3)
    sns.scatterplot(data=df, x='model_name', y='avg_roc_auc', hue='type', 
                   style='dataset_type', s=150, palette='dark')
    plt.title('ROC-AUC por Modelo y Enfoque')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('ROC-AUC')
    plt.xlabel('Modelo')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path, bbox_inches='tight', dpi=300)
    plt.show()

In [17]:
def plot_confusion_matrices(results, ncols=3, figsize=(20, 15), save_path=None):
    # Filtrar solo resultados con matrices de confusión
    valid_results = [r for r in results if 'avg_confusion_matrix' in r]
    
    nrows = int(np.ceil(len(valid_results) / ncols))
    plt.figure(figsize=figsize)
    
    for i, result in enumerate(valid_results, 1):
        plt.subplot(nrows, ncols, i)
        
        # Obtener etiquetas según el tipo de modelo
        labels = ["жанр0", "жанр1"]
        if result['type'] == 'zero-shot':
            labels = result.get('candidate_labels', labels)
        
        sns.heatmap(result['avg_confusion_matrix'], annot=True, fmt='.1f',
                   xticklabels=labels, yticklabels=labels,
                   cmap='Blues', cbar=False)
        
        title = f"{result['model_name']}\n{result['dataset_name']}"
        if result['type'] == 'zero-shot':
            title += " (Zero-Shot)"
        plt.title(title)
        plt.xlabel('Predicho')
        plt.ylabel('Real')
    
    plt.tight_layout()
    
    if save_path:
        plt.savefig(save_path, bbox_inches='tight', dpi=300)
    plt.show()

In [2]:
import requests

def classify_text_gpt(text):
    prompt = f"""
    Ты — эксперт по классификации текста на русском языке. Тебе дан текст, и ты должен определить, является ли он **Описанием** (описывает что-то, содержит детали, примеры) или **Определением** (дает краткое и формальное определение термина). Ответь только с меткой: "Описание" или "Определение".

    Текст: {text}
    Метка:
    """
    #deberia ser 0 
    # Ejemplo con una API hipotética
    response = requests.post("https://api.x.ai/grok", json={"prompt": prompt})
    return response.json()["completion"].strip()

text = "Молекула — это наименьшая частица вещества, которая сохраняет его химические свойства."
print(classify_text_gpt(text))  # Salida: Определение

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
from scipy.spatial.distance import cosine

# Cargar modelo y tokenizador
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Plantillas
template_def = "Это краткое и формальное определение термина или понятия."
template_desc = "Это подробное описание с деталями, примерами или характеристиками."

# Texto a clasificar
# text = "Молекула — это наименьшая частица вещества, которая сохраняет его химические свойства."
text = "Термин образ автора предложен В.В. Виноградовым как важнейший из инструментов стилистического анализа художественной речи;"
# Obtener embeddings
emb_text = get_embedding(text)
emb_def = get_embedding(template_def)
emb_desc = get_embedding(template_desc)

# Calcular similitud coseno
sim_def = 1 - cosine(emb_text, emb_def)
sim_desc = 1 - cosine(emb_text, emb_desc)

# Clasificar
label = "Определение" if sim_def > sim_desc else "Описание"
print(f"Метка: {label}, Similitud Определение: {sim_def:.3f}, Similitud Описание: {sim_desc:.3f}")

Метка: Определение, Similitud Определение: 0.998, Similitud Описание: 0.998


In [9]:
from transformers import pipeline

model_name = "meta-llama/Llama-3-8b"  # Ejemplo, requiere acceso
generator = pipeline("text-generation", model=model_name)
prompt = f"""
Ты — эксперт по классификации текста на русском языке. Тебе дан текст, и ты должен определить, является ли он **Описанием** (текст, который описывает что-то с деталями, примерами или характеристиками) или **Определением** (текст, который дает краткое и формальное определение термина или понятия). Ответь только с меткой: "Описание" или "Определение".

Текст: Молекула — это наименьшая частица вещества, которая сохраняет его химические свойства.
Метка:
"""
result = generator(prompt, max_length=50)[0]["generated_text"].strip()
label = result.split("Метка:")[-1].strip()
label_num = 1 if label == "Описание" else 0
print(f"Метка: {label}, Label numérico: {label_num}")

OSError: meta-llama/Llama-3-8b is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`