In [1]:
# Shell escape: print the directory tree of ../dataset so the corpus files used below are visible.
!tree ../dataset

[01;34m../dataset[00m
├── [01;34men_espanol[00m
│   ├── docx2txt.py
│   ├── Второй_жанр_исходная.txt
│   └── Первый_жанр_исходная.txt
├── Второй_жанр_исходная.txt
├── Первый_жанр_исходная.txt
├── [01;34mСокращение по частям речи[00m
│   ├── 1.Первый жанр исходная выборка.txt
│   ├── 2.Первый жанр без клауз, включающих наречия.txt
│   ├── 3.Первый жанр без клауз, включающих глаголы.txt
│   ├── 4. Первый жанр без клауз, включающих глаголы и наречия.txt
│   ├── Без прилагательных второй жанр.txt
│   ├── Без прилагательных первый жанр.txt
│   └── Случайные выборки.txt
└── [01;34mсокращение по частотности[00m
    ├── 1а_ без сокращений.txt
    ├── 1б_Изъяты лексемы с частотой выше 100.txt
    ├── 1в_Изъяты лексемы с частотой выше 49.txt
    ├── 1г_Изъяты лексемы с частотой выше 29.txt
    ├── 1д_Изъяты лексемы с частотой выше 9.txt
    ├── 2а_ без сокращений.txt
    ├── 2б_Изъяты лексемы с частотой выше 100.txt
    ├── 2в_Изъяты лексемы с частотой выше 49.txt

In [2]:
#load data
#preprocessing
#train/test
#tokenization
#dataset-dataloader
#model-settings
#training
#evaluation


In [4]:
import re
import razdel
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from isanlp.pipeline_common import PipelineCommon
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
import gc
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk


class DataLoader_raw:
    """Load two UTF-8 text files and label every line with its genre.

    Each non-blank line of ``file1`` becomes a ``(text, 0)`` sample and each
    non-blank line of ``file2`` becomes a ``(text, 1)`` sample. Lines are
    stripped of surrounding whitespace.

    Args:
        file1: Path of the text file whose lines get label 0.
        file2: Path of the text file whose lines get label 1.
    """

    def __init__(self, file1, file2):
        self.file1 = file1
        self.file2 = file2

    @staticmethod
    def _read_labeled(path, label):
        """Return ``(stripped_line, label)`` for every non-blank line of ``path``."""
        with open(path, 'r', encoding='utf-8') as f:
            # Blank / whitespace-only lines carry no text; keeping them would
            # create empty samples that still carry a genre label.
            stripped_lines = (line.strip() for line in f)
            return [(text, label) for text in stripped_lines if text]

    def __call__(self):
        """Read both files.

        Returns:
            dict: ``{'datos_raw': [(text, label), ...]}`` with all samples of
            ``file1`` (label 0) followed by all samples of ``file2`` (label 1).

        Raises:
            OSError: If either file cannot be opened.
        """
        genre_0 = self._read_labeled(self.file1, 0)
        genre_1 = self._read_labeled(self.file2, 1)
        return {'datos_raw': genre_0 + genre_1}

    
class SentenceSplitterAndCleaner:
    """Split labelled texts into sentences and drop the short ones.

    Args:
        tokenizer: Object exposing ``encode(text, truncation=False)``; the
            length of its return value is used as the sentence length.
        min_length_threshold: Minimum encoded length a sentence needs in
            order to be kept (default 6).
    """

    def __init__(self, tokenizer, min_length_threshold=6):
        self.tokenizer = tokenizer
        self.min_length_threshold = min_length_threshold

    def _insert_markers(self, texto):
        """Rewrite ``texto`` with the placeholder token and extra periods.

        The placeholder ``Ok999999999`` is inserted after a period in three
        situations and removed again after sentence splitting -- presumably
        to make the splitter see a sentence boundary there (TODO confirm).
        """
        # Case 1: period directly followed by a comma (the comma is kept).
        marked = re.sub(r'\.,', '. Ok999999999 ,', texto)
        # Case 2: period directly followed by a semicolon (the semicolon is dropped).
        marked = re.sub(r'\.;', '. Ok999999999 ', marked)
        # Case 3: period + space + lowercase Latin/Cyrillic letter.
        # NOTE(review): the range а-я does not include 'ё'.
        marked = re.sub(r'\. ([a-zа-я])', r'. Ok999999999 \1', marked)
        # Case 4: a word character immediately followed by an uppercase
        # Cyrillic letter gets ". " inserted between them.
        # NOTE(review): this also matches inside all-caps words.
        return re.sub(r'(\w)([А-Я])', r'\1. \2', marked)

    def _long_enough(self, sentence):
        """Tell whether the encoded ``sentence`` reaches the length threshold."""
        encoded = self.tokenizer.encode(sentence, truncation=False)
        return len(encoded) >= self.min_length_threshold

    def __call__(self, data_raw):
        """Process ``(text, label)`` pairs into ``(sentence, label)`` pairs.

        Returns:
            dict: ``{'datos_procesados': [(sentence, label), ...]}``.
        """
        kept = []
        for raw_text, label in data_raw:
            # Bracketed spans ([...]) are removed after the markers are placed.
            prepared = re.sub(r'\[.*?\]', '', self._insert_markers(raw_text)).strip()
            for span in razdel.sentenize(prepared):
                # Strip the placeholder (and whitespace before it) back out.
                sentence = re.sub(r'\s*Ok999999999', ' ', span.text).strip()
                if self._long_enough(sentence):
                    kept.append((sentence, label))
        return {'datos_procesados': kept}
    
    
class DataProcessor:
    """Split labelled sentences into train/test parts and tokenize both.

    Args:
        tokenizer: Callable invoked as ``tokenizer(texts, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``.
        max_length: Value forwarded as ``max_length`` to the tokenizer.
        random_state: Seed forwarded to ``train_test_split``.
        test_size: Test fraction forwarded to ``train_test_split`` (default 0.2).
    """

    def __init__(self, tokenizer, max_length, random_state, test_size=0.2):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.test_size = test_size
        self.random_state = random_state

    def _encode(self, frame):
        """Tokenize the ``text`` column and tensorize the ``label`` column."""
        encodings = self.tokenizer(
            frame["text"].tolist(),
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return encodings, torch.tensor(frame["label"].values)

    def __call__(self, datos_procesados):
        """Build the train/test encodings and label tensors.

        Args:
            datos_procesados: Iterable of ``(text, label)`` pairs.

        Returns:
            dict: keys ``train_encodings``, ``train_labels``,
            ``test_encodings`` and ``test_labels``.
        """
        table = pd.DataFrame(datos_procesados, columns=["text", "label"])
        train_part, test_part = train_test_split(
            table, test_size=self.test_size, random_state=self.random_state
        )
        train_encodings, train_labels = self._encode(train_part)
        test_encodings, test_labels = self._encode(test_part)
        return {
            'train_encodings': train_encodings,
            'train_labels': train_labels,
            'test_encodings': test_encodings,
            'test_labels': test_labels,
        }
    
    
class TextDataset(Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable container; item
            ``idx`` of every field is returned for sample ``idx``.
        labels: Indexable, sized container of labels.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        """Number of samples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

class DatasetCreator:
    """Wrap train/test encodings and labels into ``DataLoader`` objects.

    Args:
        batch_size: Batch size used for both loaders (default 8).
    """

    def __init__(self, batch_size=8):
        self.batch_size = batch_size

    def __call__(self, train_encodings, train_labels, test_encodings, test_labels):
        """Build the loaders.

        Returns:
            dict: ``{'train_loader': ..., 'test_loader': ...}``; the training
            loader shuffles, the test loader keeps the sample order.
        """
        loaders = {}
        loaders['train_loader'] = DataLoader(
            TextDataset(train_encodings, train_labels),
            batch_size=self.batch_size,
            shuffle=True,
        )
        loaders['test_loader'] = DataLoader(
            TextDataset(test_encodings, test_labels),
            batch_size=self.batch_size,
            shuffle=False,
        )
        return loaders
    
def train_model(model, train_loader, optimizer, loss_fn, device, epochs=3):
    """Train ``model`` in place for ``epochs`` passes over ``train_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        train_loader: Iterable of batches with keys ``input_ids``,
            ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning the loss.
        device: Device every batch tensor is moved to.
        epochs: Number of passes over the loader (default 3).

    Returns:
        None.
    """
    batch_fields = ('input_ids', 'attention_mask', 'labels')
    for epoch_number in range(1, epochs + 1):
        model.train()
        progress = tqdm(train_loader, desc=f'Epoch {epoch_number}/{epochs}')
        for batch in progress:
            optimizer.zero_grad()
            ids, mask, targets = (batch[field].to(device) for field in batch_fields)
            logits = model(ids, attention_mask=mask).logits
            batch_loss = loss_fn(logits, targets)
            batch_loss.backward()
            optimizer.step()
            # Show the latest batch loss on the progress bar.
            progress.set_postfix(loss=batch_loss.item())
        # End-of-epoch housekeeping: release cached GPU memory, then run the GC.
        torch.cuda.empty_cache()
        gc.collect()

def evaluate_model(model, test_loader, device):
    """Evaluate a two-class classifier on ``test_loader``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``; it is switched to eval mode.
        test_loader: Iterable of batches with tensor values under the keys
            ``input_ids``, ``attention_mask`` and ``labels``.
        device: Device the model inputs are moved to.

    Returns:
        dict: ``accuracy`` (fraction of correct predictions), ``report``
        (result of ``classification_report``) and ``conf_matrix`` (result of
        ``confusion_matrix``).

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    # Class ids are pinned so the report and the matrix always cover both
    # genres, even when one of them is absent from labels and predictions;
    # otherwise the two target names would not match the classes found.
    class_ids = [0, 1]
    class_names = ["Género 1", "Género 2"]

    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(predictions.cpu().tolist())
            # Labels are only compared on the host, so they never need to
            # visit the device.
            all_labels.extend(batch['labels'].tolist())

    if not all_labels:
        raise ValueError("test_loader produced no samples; nothing to evaluate")

    correct = sum(pred == label for pred, label in zip(all_preds, all_labels))
    accuracy = correct / len(all_labels)
    report = classification_report(
        all_labels, all_preds, labels=class_ids, target_names=class_names
    )
    conf_matrix = confusion_matrix(all_labels, all_preds, labels=class_ids)
    return {"accuracy": accuracy, "report": report, "conf_matrix": conf_matrix}

def display_results(results):
    """Print the evaluation summary and show the confusion-matrix heatmap.

    Args:
        results: Mapping with the keys ``accuracy`` (formatted as a
            percentage), ``report`` (printed as is) and ``conf_matrix``
            (drawn with ``sns.heatmap``).

    Returns:
        None.
    """
    genre_names = ["Género 1", "Género 2"]

    # Text summary.
    print(f'Precisión en el conjunto de prueba: {results["accuracy"]:.2%}')
    print("\nReporte de clasificación:\n", results["report"])

    # Annotated heatmap with the genre names on both axes.
    plt.figure(figsize=(6, 4))
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=genre_names,
        yticklabels=list(genre_names),
    )
    sns.heatmap(results["conf_matrix"], **heatmap_options)
    for set_text, text in (
        (plt.title, 'Matriz de Confusión'),
        (plt.xlabel, 'Predicción'),
        (plt.ylabel, 'Real'),
    ):
        set_text(text)
    plt.show()

  from .autonotebook import tqdm as notebook_tqdm
2025-03-23 20:53:40.735030: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
# Experiment configuration: device, tokenizer and hyper-parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
max_length = 128
# Token-length statistics noted by the author (presumably measured on this
# corpus): mean 27.77, max 192, min 2.
batch_size = 8
epochs = 3
num_repeats = 6
seeds = list(range(num_repeats))

accuracies = []

for seed in seeds:
    print(f"Repetición con semilla {seed}")

    # Seed the global RNGs as well as the split: the classifier-head
    # initialisation, the shuffling of the training loader and dropout all
    # draw from torch's RNG, so without this a "seed" would only pin the
    # train/test split. (GPU kernels may still be non-deterministic.)
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Drop the previous repetition's model and optimizer before building new
    # ones; otherwise the old optimizer keeps the old parameters alive while
    # the next model is being moved to the device.
    model = optimizer = None
    gc.collect()
    torch.cuda.empty_cache()

    # Data pipeline: load -> split into sentences -> train/test + tokenize -> loaders.
    # NOTE(review): the split is done per sentence, so sentences coming from
    # the same source line can end up on both sides of it -- confirm that
    # this is intended.
    ppl = PipelineCommon([
        (DataLoader_raw('../dataset/Первый_жанр_исходная.txt', '../dataset/Второй_жанр_исходная.txt'),
         [],
         {'datos_raw': 'datos_raw'}),
        (SentenceSplitterAndCleaner(tokenizer),
         ['datos_raw'],
         {'datos_procesados': 'datos_procesados'}),
        (DataProcessor(tokenizer, max_length, random_state=seed),
         ['datos_procesados'],
         {'train_encodings': 'train_encodings', 'train_labels': 'train_labels',
          'test_encodings': 'test_encodings', 'test_labels': 'test_labels'}),
        # Use the configured batch size instead of a second hard-coded 8.
        (DatasetCreator(batch_size=batch_size),
         ['train_encodings', 'train_labels', 'test_encodings', 'test_labels'],
         {'train_loader': 'train_loader', 'test_loader': 'test_loader'})
    ])

    # Run the pipeline.
    result = ppl()
    train_loader = result['train_loader']
    test_loader = result['test_loader']

    # Fresh model for every repetition.
    model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels=2)
    model.to(device)
    # NOTE(review): this AdamW is the transformers implementation imported at
    # the top of the file; upstream marks it as deprecated in favour of
    # torch.optim.AdamW (whose defaults differ) -- check before upgrading.
    optimizer = AdamW(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    # Train, then evaluate on the held-out split.
    train_model(model, train_loader, optimizer, loss_fn, device, epochs=epochs)
    results = evaluate_model(model, test_loader, device)
    accuracy = results['accuracy']
    accuracies.append(accuracy)

    print(f"Accuracy in {seed} seed: {accuracy:.4f}")

# Mean and (population, ddof=0) standard deviation over the repetitions.
average_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print(f"\nAccuracy promedio después de {num_repeats} repeticiones: {average_accuracy:.4f} ± {std_accuracy:.4f}")



Repetición con semilla 0


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Accuracy in 0 seed: 0.9910
Repetición con semilla 1


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Accuracy in 1 seed: 0.9865
Repetición con semilla 2


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Accuracy in 2 seed: 0.9955
Repetición con semilla 3


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Accuracy in 3 seed: 0.9955
Repetición con semilla 4


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Accuracy in 4 seed: 0.9865
Repetición con semilla 5


Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

Accuracy in 5 seed: 0.9955

Accuracy promedio después de 6 repeticiones: 0.9918 ± 0.0040
