# **Text Segmentation**

### **Modelo de Procesamiento de Lenguaje Natural para clasificación**

Queremos encontrar en qué partes del texto se ubican las obras para que quienes posteriormente realicen análisis sobre la base de datos puedan medir la importancia de estas. No es lo mismo una obra que aparece en un párrafo que una que tiene una imagen o hasta su propio título.

Este modelo toma los libros que se encuentran separados por línea y clasifica por "Title" (Título), "Paragraph" (Párrafo) y "Caption" (Pie de foto + Notas).

El modelo utilizado fue [distilbert-base-uncased](https://huggingface.co/distilbert/distilbert-base-uncased) al ser más maleable y trabajable para una tarea no tan pesada como esta.

# Preparación del entorno


1) Conexión con Google Drive

In [None]:
# Mount Google Drive so the project files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder so that the relative
# file names used later in the notebook resolve against it.
%cd "/content/drive/MyDrive/ARCHITECTURE_NER/Text_Segmentation"

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1HoJSyvMQ6jiLlA6VSCROqkUj-Dxup7jL/ARCHITECTURE_NER/Text_Segmentation


2) Definimos las variables del entorno

In [None]:
TRAIN_MODEL = False # Set to True to run the training cells; when False they only print a notice

# Entrenamiento

2) Instalación e importación de las librerías necesarias

In [None]:
# The extra packages are installed only when training is enabled;
# otherwise the cell just prints a notice.
if TRAIN_MODEL:
  !pip install datasets
  !pip install evaluate
else:
  print("El entrenamiento no está habilitado. Cambiá la variable TRAIN_MODEL a True en la sección Preparación del entorno si deseas entrenar el modelo.")

El entrenamiento no está habilitado. Cambiá la variable TRAIN_MODEL a True en la sección Preparación del entorno si deseas entrenar el modelo.


In [None]:
# Imports used by the training cell; skipped entirely when training is disabled.
if TRAIN_MODEL:
  import pandas as pd
  import torch
  from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
  from datasets import Dataset
  from evaluate import load
  from sklearn.model_selection import train_test_split
  from sklearn.metrics import accuracy_score, classification_report
else:
  print("El entrenamiento no está habilitado. Cambiá la variable TRAIN_MODEL a True en la sección Preparación del entorno si deseas entrenar el modelo.")

El entrenamiento no está habilitado. Cambiá la variable TRAIN_MODEL a True en la sección Preparación del entorno si deseas entrenar el modelo.


3) Entrenamiento del modelo

In [None]:
if TRAIN_MODEL:
  # Loader for the training dataset.
  def load_data_from_csv(csv_file):
      """Read the training CSV and return its first two columns as lists.

      Args:
          csv_file: Anything ``pd.read_csv`` accepts (path or file-like object).

      Returns:
          A ``(texts, labels)`` tuple: ``texts`` holds the values of the first
          column (the line of text) and ``labels`` the values of the second
          column (its label), both as plain Python lists.
      """
      frame = pd.read_csv(csv_file)
      text_column, label_column = frame.iloc[:, 0], frame.iloc[:, 1]
      return text_column.tolist(), label_column.tolist()

  def prepare_data_from_csv(csv_file):
      """Load the labelled CSV and split it into training and validation sets.

      Args:
          csv_file: CSV file forwarded to ``load_data_from_csv``.

      Returns:
          A ``(train_data, val_data)`` tuple of ``Dataset`` objects, each with
          a ``"text"`` and a ``"label"`` column.
      """
      all_texts, all_labels = load_data_from_csv(csv_file)

      # Hold out 20% of the rows for validation.
      split = train_test_split(all_texts, all_labels, test_size=0.2)
      train_texts, val_texts, train_labels, val_labels = split

      def as_dataset(texts, labels):
          return Dataset.from_dict({"text": texts, "label": labels})

      return as_dataset(train_texts, train_labels), as_dataset(val_texts, val_labels)

  # Load the training dataset (file name is relative to the working directory).
  csv_file = 'train_segmentation.csv'
  train_data, val_data = prepare_data_from_csv(csv_file)

  # Tokenizer of the base checkpoint that is going to be fine-tuned.
  model_name = "distilbert-base-uncased"
  tokenizer = DistilBertTokenizer.from_pretrained(model_name)

  # Build the label list in a deterministic order. Iterating a set of strings
  # gives an order that can change between Python sessions, which would
  # silently change the class id each label is trained on. The inference cells
  # of this notebook decode ids with ['Paragraph', 'Title', 'Caption'], so that
  # order is used for the labels that are present; any other label found in
  # the data is appended in sorted order.
  inference_label_order = ['Paragraph', 'Title', 'Caption']
  present_labels = set(train_data['label'])
  unique_labels = [label for label in inference_label_order if label in present_labels]
  unique_labels += sorted(present_labels.difference(inference_label_order), key=str)
  print(unique_labels)
  num_labels = len(unique_labels)

  # Configure the classification model. The id <-> label mapping is stored in
  # the model config so that it travels with any checkpoint saved from it.
  id2label = dict(enumerate(unique_labels))
  label2id = {label: idx for idx, label in id2label.items()}
  model = DistilBertForSequenceClassification.from_pretrained(
      model_name,
      num_labels=num_labels,
      id2label=id2label,
      label2id=label2id,
  )

  # Batch tokenizer: every example is padded to the maximum length and
  # truncated when it is longer.
  def tokenize_function(examples):
      """Tokenize the ``"text"`` column of a batch of examples."""
      batch_texts = examples["text"]
      return tokenizer(batch_texts, padding="max_length", truncation=True)

  # Replace the string label of one example with its position in unique_labels.
  def encode_labels(example):
      """Return the example with its ``'label'`` converted to an integer id."""
      label_id = unique_labels.index(example['label'])
      example['label'] = label_id
      return example

  # Tokenize first (in batches), then encode the labels (one example at a
  # time), for both the training and the validation split.
  train_data = train_data.map(tokenize_function, batched=True).map(encode_labels)
  val_data = val_data.map(tokenize_function, batched=True).map(encode_labels)

  # Training hyperparameters handed to the Trainer below.
  training_args = TrainingArguments(
      output_dir="./results",
      eval_strategy="epoch",  # run evaluation at the end of every epoch
      per_device_train_batch_size=16,
      per_device_eval_batch_size=16,
      num_train_epochs=5,
      weight_decay=0.01,
  )

  # Metric function handed to the Trainer.
  def compute_metrics(eval_pred):
      """Compute the overall accuracy and a per-label score for an evaluation pass.

      Args:
          eval_pred: Pair ``(logits, labels)``; the predicted class of each
              example is the argmax of its logits over the last axis.

      Returns:
          A dict with ``'overall_accuracy'`` plus one
          ``'accuracy_per_label_<label>'`` entry for every label in
          ``unique_labels``. NOTE: the per-label entries hold the *precision*
          reported by ``classification_report``; the key names are kept as
          they were so existing logs stay comparable.
      """
      logits, labels = eval_pred
      predictions = torch.argmax(torch.tensor(logits), dim=-1).numpy()

      # Overall accuracy across all classes.
      overall_accuracy = accuracy_score(labels, predictions)

      # Pass the full list of class ids explicitly: without it the report is
      # built from the classes that happen to appear in this pass and raises
      # when that count differs from len(target_names).
      report = classification_report(
          labels,
          predictions,
          labels=list(range(len(unique_labels))),
          target_names=unique_labels,
          output_dict=True,
          zero_division=0,
      )

      metrics = {'overall_accuracy': overall_accuracy}
      for label in unique_labels:
          metrics[f'accuracy_per_label_{label}'] = report[label]['precision']

      return metrics

  # Build the Trainer from the model, the hyperparameters, both dataset
  # splits, the tokenizer and the metric function defined above.
  trainer = Trainer(
      model=model,
      args=training_args,
      train_dataset=train_data,
      eval_dataset=val_data,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics,
  )

  # Fine-tune the model.
  trainer.train()

  # Final evaluation on the validation split. The call sits inside the `if`
  # block, so the notebook does not echo its return value; print it
  # explicitly so the final metrics are visible.
  eval_metrics = trainer.evaluate()
  print(eval_metrics)

  # Single-text classification helper.
  def classify_text(text):
      """Predict the segment label of one piece of text.

      Args:
          text: The text to classify, passed as-is to the tokenizer.

      Returns:
          The entry of ``unique_labels`` at the index of the highest logit.
      """
      # Send the inputs to whatever device the model is on instead of
      # assuming that a GPU is available.
      inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
      # Inference only: no gradients are needed, so do not track them.
      with torch.no_grad():
          outputs = model(**inputs)
      prediction = torch.argmax(outputs.logits, dim=-1).item()
      predicted_label = unique_labels[prediction]
      return predicted_label

else:
  print("El entrenamiento no está habilitado. Cambiá la variable TRAIN_MODEL a True en la sección Preparación del entorno si deseas entrenar el modelo.")

El entrenamiento no está habilitado. Cambiá la variable TRAIN_MODEL a True en la sección Preparación del entorno si deseas entrenar el modelo.


# Test

1) Prueba con un pequeño rejunte de libros
- Imprime cada línea con la label predicha
- Imprime la accuracy del modelo al terminar

In [1]:
# Make sure PyTorch is installed in the runtime before it is imported below.
!pip install torch



In [2]:
# Importamos las librerías necesarias
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import os

# Mount Google Drive and move into the project folder, where the CSV read by
# the next cell is looked up by its relative name.
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/ARCHITECTURE_NER/Text_Segmentation"

# Load the best model obtained and its tokenizer.
model_path = 'lucasdefino/text-segmentation'
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
# Use the GPU when the runtime has one; otherwise stay on the CPU instead of
# failing on a hard-coded 'cuda'.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Classification helper. The position of each label in this list is the class
# id that the model output is decoded with.
unique_labels = ['Paragraph', 'Title', 'Caption']
def classify_text(text):
    """Predict the segment label of one piece of text.

    Args:
        text: The text to classify, passed as-is to the tokenizer.

    Returns:
        The entry of ``unique_labels`` at the index of the highest logit.
    """
    # Send the inputs to whatever device the model is on instead of assuming
    # that a GPU is available.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    # Inference only: no gradients are needed, so do not track them.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    predicted_label = unique_labels[prediction]
    return predicted_label

Mounted at /content/drive
/content/drive/MyDrive/ARCHITECTURE_NER/Text_Segmentation


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/769 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Read the CSV that holds a sample of lines taken from several books
# (no header row, ';' as separator).
df = pd.read_csv('SMALL_Everything.csv', header=None, sep=";")

# Keep the first two columns: the line of text and its expected label.
df = df[[0, 1]].rename(columns={0: 'Text', 1: 'Labels'})


# Classify every line and count the predictions that match the expected label.
score = 0

for text, true_label in zip(df['Text'], df['Labels']):
    predicted_label = classify_text(text)
    print(f"{text} -> {predicted_label}")

    if predicted_label == true_label:
        score += 1

# An empty CSV would otherwise end the cell with a ZeroDivisionError.
if len(df) > 0:
    print(f"Accuracy: {score / len(df)}")
else:
    print("Accuracy: n/a (the CSV has no rows)")

Chapter 4 -> Title
Brick -> Title
Another tradition of modern architecture -> Title
Few cowsheds figure in the histories of architecture. One that does, along -> Paragraph
with an adjacent barn, was completed in 1925 on Gut Gurkau Farm near -> Paragraph
Liibeck, Germany, to designs by Hugo Haring (Figure 20). Haring was part -> Paragraph
Monumentality and order: Louis I. Kahn -> Title
24. Church of St Peter, Klippan, Sweden, 1966. Most aspects of the -> Caption
churchs construction were rethought afresh, although there was little -> Paragraph
12 Joseph Maria Olbrich -> Caption
12. In this they showed an affinity with both Otto Wagner's classi- -> Paragraph
(1865-1945), Charles Annesley Voysey (1857-1941), and Charles Robert -> Paragraph
Art Nouveau | -> Title
1890-1910 -> Title
(United workshops for art in craft}, one of the several reformist arts and crafts -> Paragraph
Fig. 14b. Brunc Taut, Appropriation of the SS Australia (see fig. 14a) for a caricature of the -> Caption
modern hou

# Clasificación

En esta última sección generamos los CSVs de los libros con una label por línea

1) Importamos las librerías necesarias

In [None]:
import pandas as pd
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import os

2) Importamos el mejor modelo

In [None]:
# Load the best model and its tokenizer from the project folder on Google Drive.
model_path = '/content/drive/MyDrive/ARCHITECTURE_NER/Text_Segmentation/best_model'
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
model = DistilBertForSequenceClassification.from_pretrained(model_path)
# Use the GPU when the runtime has one; otherwise stay on the CPU instead of
# failing on a hard-coded 'cuda'. The call is kept as the last expression of
# the cell so the notebook still displays the model architecture.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


3) Clasificación de obras y guardado de resultados en respectivos CSVs

In [None]:
# The position of each label in this list is the class id that the model
# output is decoded with.
unique_labels = ['Paragraph', 'Title', 'Caption']
def classify_text(text):
    """Predict the segment label of one piece of text.

    Args:
        text: The text to classify, passed as-is to the tokenizer.

    Returns:
        The entry of ``unique_labels`` at the index of the highest logit.
    """
    # Send the inputs to whatever device the model is on instead of assuming
    # that a GPU is available.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
    # Inference only: no gradients are needed, so do not track them.
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    predicted_label = unique_labels[prediction]
    return predicted_label

# Google Drive folders: the .txt files to classify are read from input_folder
# and one CSV per file is written to output_folder.
input_folder = '/content/drive/MyDrive/ARCHITECTURE_NER/Text_Segmentation/txts_text_segmentation'
output_folder = '/content/drive/MyDrive/ARCHITECTURE_NER/Text_Segmentation/csvs_text_segmentation'
os.makedirs(output_folder, exist_ok=True)

# Process every book. os.listdir returns entries in arbitrary order, so sort
# them to make the processing order (and the log below) deterministic.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith(".txt"):
        continue

    input_file_path = os.path.join(input_folder, filename)
    output_filename = f"TS {filename.replace('BOOK', '').replace('.txt', '').strip()}.csv"
    output_file_path = os.path.join(output_folder, output_filename)

    # Stream the file line by line, stripping surrounding whitespace.
    with open(input_file_path, 'r', encoding='utf-8') as file:
        stripped_lines = [line.strip() for line in file]

    # Classify every non-blank line.
    classified_results = [
        {'text': stripped_line, 'label': classify_text(stripped_line)}
        for stripped_line in stripped_lines
        if stripped_line
    ]

    # Save one CSV per book. The columns are given explicitly so that a book
    # without any non-blank line still produces a CSV with the header row.
    df = pd.DataFrame(classified_results, columns=['text', 'label'])
    df.to_csv(output_file_path, index=False)

    print(f"Processed {filename} and saved as {output_filename}")

print("All files processed and saved.")

Processed lines_BOOK RStephen Sennott II.txt and saved as TS lines_ RStephen Sennott II.csv
Processed lines_BOOK RStephen Sennott III.txt and saved as TS lines_ RStephen Sennott III.csv
Processed lines_BOOK Sigfried Giedion Building in France Building in Iron Building in Ferroconcrete.txt and saved as TS lines_ Sigfried Giedion Building in France Building in Iron Building in Ferroconcrete.csv
Processed lines_BOOK Sigfried Giedion Space Time and Architecture.txt and saved as TS lines_ Sigfried Giedion Space Time and Architecture.csv
Processed lines_BOOK Vittorio M Lampugnani.txt and saved as TS lines_ Vittorio M Lampugnani.csv
Processed lines_BOOK Walter Curt Behrendt Modern Building Its Nature Problems And Forms.txt and saved as TS lines_ Walter Curt Behrendt Modern Building Its Nature Problems And Forms.csv
All files processed and saved.
