In [1]:
!pip install spacy sklearn-crfsuite
!pip install python-crfsuite
import os
import re
from sklearn_crfsuite import CRF
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import spacy
from spacy.tokens import Doc
import random
random.seed(42)



In [3]:
# Загрузка модели spaCy для английского языка
nlp = spacy.load("en_core_web_sm")


In [13]:
def parse_ann(ann_text):
    entities = []
    for line in ann_text.split('\n'):
        if line.startswith('T'):
            parts = line.split('\t')
            if len(parts) < 3:
                continue  # Пропустить некорректные строки
            entity_info = parts[1].split()
            label = entity_info[0]
            # Обработка позиций с разделителями (например, '1633;1652')
            start_end = entity_info[1].replace(',', ';').split(';')  # Унификация разделителей
            start = int(start_end[0])
            end = int(start_end[-1])  # Берём последнее значение как конец
            text = parts[2].strip()
            entities.append((start, end, label, text))
    return entities

In [17]:
# Функция для преобразования текста и аннотаций в BIO-формат
def text_to_bio(txt_path, ann_path):
    with open(txt_path, 'r') as f:
        text = f.read()
    with open(ann_path, 'r') as f:
        ann = f.read()
    entities = parse_ann(ann)

    doc = nlp(text)
    tokens = [token.text for token in doc]
    bio_tags = ['O'] * len(tokens)

    for start, end, label, entity_text in entities:
        span = doc.char_span(start, end, alignment_mode='contract')
        if span is None:
            # Попытка альтернативного выравнивания
            span = doc.char_span(start, end, alignment_mode='expand')
        if span is not None:
            start_token = span.start
            end_token = span.end
            # Проверка перекрытия существующих меток
            if any(tag != 'O' for tag in bio_tags[start_token:end_token]):
                continue  # Пропустить конфликтующие аннотации
            bio_tags[start_token] = f'B-{label}'
            for i in range(start_token + 1, end_token):
                bio_tags[i] = f'I-{label}'
        else:
            print(f"Сущность '{entity_text}' не сопоставлена с токенами.")

    return list(zip(tokens, bio_tags))



In [6]:
# Извлечение признаков для CRF
def token_features(token):
    return {
        'word': token.text,
        'pos': token.pos_,
        'lemma': token.lemma_,
        'is_upper': token.text.isupper(),
        'prefix': token.text[:3],
        'suffix': token.text[-3:],
        'is_digit': token.text.isdigit(),
    }


In [30]:
def prepare_features_labels(data):
    X, y = [], []
    for sentence in data:
        tokens = [t[0] for t in sentence]
        tags = [t[1] for t in sentence]

        # Используем уже существующие токены вместо повторной токенизации
        # Создаем Doc объект вручную, чтобы избежать изменений в токенизации
        doc = Doc(nlp.vocab, words=tokens)
        doc = nlp(doc)

        features = []
        for i, token in enumerate(doc):
            feat = token_features(token)
            # Добавление контекстных признаков
            if i > 0:
                prev_token = doc[i-1]
                feat['prev_pos'] = prev_token.pos_
            else:
                feat['prev_pos'] = 'START'
            if i < len(doc)-1:
                next_token = doc[i+1]
                feat['next_pos'] = next_token.pos_
            else:
                feat['next_pos'] = 'END'
            features.append(feat)

        # Проверка длины
        if len(features) != len(tags):
            print(f"Пропущен пример: {len(features)} признаков vs {len(tags)} меток")
            continue

        X.append(features)
        y.append(tags)

    return X, y

In [4]:
import zipfile

# data_r = zipfile.ZipFile('data.zip', 'r')
# data_r.printdir()

# data_r.extractall()

os.getcwd()
print(os.listdir("data"))

['27821134.ann', '27846860.ann', '27904130.txt', '22515939.ann', '25293719.ann', '25410883.ann', '28103924.txt', '26264228.txt', '28120581.ann', '26656340.ann', '26309459.ann', '22218279.ann', '21067996.txt', '28154281.ann', '27842595.txt', '28292056.txt', '19307547.ann', '28079821.ann', '27773410.txt', '28239141.ann', '25884600.txt', '22791498.ann', '27980272.txt', '28353561.txt', '24781756.txt', '26530965.ann', '28151916.txt', '22218279.txt', '25139918.ann', '26350418.txt', '27928148.txt', '25139918.txt', '23468586.ann', '25721834.ann', '28202865.txt', '28120581.txt', '22781096.ann', '24294397.txt', '18561524.txt', '28403092.ann', '25853982.ann', '28033278.txt', '28115731.txt', '24526194.ann', '28207542.ann', '27749582.ann', '23033875.ann', '28296749.ann', '26523273.ann', '26361431.ann', '28559815.ann', '18416479.ann', '28538413.ann', '24957905.txt', '22719160.txt', '28250406.txt', '28090049.ann', '27846860.txt', '21067996.ann', '28202865.ann', '21923918.ann', '25246819.ann', '182581

In [66]:
txt_files = [f for f in os.listdir('data') if f.endswith('.txt')]
all_data = []
for txt_file in txt_files:
    ann_file = txt_file.replace('.txt', '.ann')
    bio_data = text_to_bio(f'data/{txt_file}', f'data/{ann_file}')
    all_data.append(bio_data)


Сущность 'In 2013' не сопоставлена с токенами.
Сущность 'A 49-year-old' не сопоставлена с токенами.


In [68]:
# Разделение данных на обучающую и тестовую выборки
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

# Подготовка признаков и меток
X_train, y_train = prepare_features_labels(train_data)
X_test, y_test = prepare_features_labels(test_data)


In [75]:
# Обучение CRF-модели
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)


In [76]:
# Предсказание на тестовых данных
y_pred = crf.predict(X_test)

# Вывод метрик
labels = list(set(tag for sent in y_test for tag in sent))
print(classification_report(
    [tag for sent in y_test for tag in sent],
    [tag for sent in y_pred for tag in sent],
    labels=labels,
    zero_division=0
))

                          precision    recall  f1-score   support

             B-Lab_value       0.63      0.61      0.62       659
                  B-Time       0.00      0.00      0.00         5
                 B-Shape       1.00      0.22      0.36         9
                  B-Area       0.25      0.20      0.22         5
      B-Disease_disorder       0.41      0.25      0.31       279
               B-History       0.24      0.07      0.11        74
        B-Administration       0.79      0.48      0.59        23
   B-Personal_background       1.00      0.43      0.60        14
B-Nonbiological_location       0.71      0.49      0.58        81
          B-Sign_symptom       0.60      0.59      0.59       657
               B-Texture       0.50      0.33      0.40         9
  B-Biological_structure       0.59      0.55      0.57       544
                   B-Sex       0.92      0.90      0.91        39
               B-Outcome       0.62      0.50      0.56        10
   B-Qual

In [77]:
def generate_permutations(text, max_permutations=3):
    doc = nlp(text)
    sents = [sent.text for sent in doc.sents]
    permutations = []

    # Создаем до max_permutations вариантов
    for _ in range(max_permutations):
        shuffled = sents.copy()
        random.shuffle(shuffled)
        permutations.append(" ".join(shuffled))

    return permutations

In [78]:
def parse_ann(ann_text):
    entities = []
    for line in ann_text.split('\n'):
        if line.startswith('T'):
            parts = line.split('\t')
            if len(parts) < 3:
                continue
            entity_info = parts[1].split()
            label = entity_info[0]
            text = parts[2].strip()
            entities.append((label, text))
    return entities

In [79]:
def text_to_bio_augmented(orig_text, entities):
    doc = nlp(orig_text)
    tokens = [token.text for token in doc]
    bio_tags = ['O'] * len(tokens)

    # Поиск сущностей по тексту (без позиций)
    for label, entity_text in entities:
        start = orig_text.find(entity_text)
        if start == -1:
            continue
        end = start + len(entity_text)
        span = doc.char_span(start, end, alignment_mode='contract')
        if span is not None:
            start_token = span.start
            end_token = span.end
            bio_tags[start_token] = f'B-{label}'
            for i in range(start_token + 1, end_token):
                bio_tags[i] = f'I-{label}'

    return list(zip(tokens, bio_tags))

In [80]:
def load_augmented_data(txt_path, ann_path, max_permutations=3):
    with open(txt_path, 'r') as f:
        orig_text = f.read()
    with open(ann_path, 'r') as f:
        ann = f.read()
    entities = parse_ann(ann)

    # Генерируем аугментированные тексты
    augmented_texts = generate_permutations(orig_text, max_permutations)
    all_bio = []
    for text in augmented_texts:
        bio_data = text_to_bio_augmented(text, entities)
        all_bio.append(bio_data)

    return all_bio

In [85]:
txt_files = [f for f in os.listdir('data') if f.endswith('.txt')]
all_data = []
for txt_file in txt_files:
    ann_file = txt_file.replace('.txt', '.ann')
    augmented_bio = load_augmented_data(f'data/{txt_file}', f'data/{ann_file}', max_permutations=3)
    all_data.extend(augmented_bio)

In [86]:
# Разделение данных на обучающую и тестовую выборки
train_data, test_data = train_test_split(all_data, test_size=0.2, random_state=42)

# Подготовка признаков и меток
X_train, y_train = prepare_features_labels(train_data)
X_test, y_test = prepare_features_labels(test_data)


In [87]:
# Обучение CRF-модели
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)


In [88]:
# Предсказание на тестовых данных
y_pred = crf.predict(X_test)

# Вывод метрик
labels = list(set(tag for sent in y_test for tag in sent))
print(classification_report(
    [tag for sent in y_test for tag in sent],
    [tag for sent in y_pred for tag in sent],
    labels=labels,
    zero_division=0
))

                          precision    recall  f1-score   support

  I-Biological_structure       0.75      0.81      0.78      1448
                  B-Time       0.85      0.76      0.80        29
 I-Therapeutic_procedure       0.76      0.81      0.78       385
   I-Personal_background       0.92      1.00      0.96        11
          I-Sign_symptom       0.72      0.79      0.75       782
B-Nonbiological_location       0.82      0.87      0.84       191
           I-Coreference       0.74      0.55      0.63        47
              B-Distance       0.79      0.89      0.84        87
            I-Occupation       1.00      0.49      0.65        35
              B-Severity       0.66      0.63      0.64       175
                B-Dosage       0.89      0.82      0.85       181
              I-Distance       0.86      0.81      0.84       191
  B-Biological_attribute       1.00      0.57      0.73         7
                B-Weight       1.00      1.00      1.00         1
         

In [1]:
!pip install scispacy
!python -m pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz

Collecting spacy<3.8.0,>=3.7.0 (from scispacy)
  Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy<3.8.0,>=3.7.0->scispacy)
  Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Using cached thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (920 kB)
Installing collected packages: thinc, spacy
  Attempting uninstall: thinc
    Found existing installation: thinc 8.1.12
    Uninstalling thinc-8.1.12:
      Successfully uninstalled thinc-8.1.12
  Attempting uninstall: spacy
    Found existing installation: spacy 3.4.4
    Uninstalling spacy-3.4.4:
      Successfully uninstalled spacy-3.4.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [2]:
import os
import json
from sklearn.model_selection import train_test_split
import spacy
from collections import defaultdict

nlp = spacy.load("en_core_sci_sm")

def load_data(data_dir):
    texts, annotations = [], []
    for txt_file in os.listdir(data_dir):
        if not txt_file.endswith(".txt"):
            continue
        ann_file = txt_file.replace(".txt", ".ann")
        with open(os.path.join(data_dir, txt_file), "r") as f:
            text = f.read()
        with open(os.path.join(data_dir, ann_file), "r") as f:
            ann = [line.strip() for line in f if line.startswith("T")]
        texts.append(text)
        annotations.append(ann)
    return texts, annotations

# Пример структуры аннотации: ["T1	Disorder 0 5	cancer", ...]
texts, annotations = load_data("data/")
train_texts, test_texts, train_ann, test_ann = train_test_split(texts, annotations, test_size=0.2, random_state=42)

In [3]:
def build_wcl1(texts, annotations):
    word_classes = defaultdict(set)
    for text, ann in zip(texts, annotations):
        doc = nlp(text)
        for sent in doc.sents:
            for token in sent:
                # Извлечение признаков: POS, зависимость, медицинские сущности
                features = (token.pos_, token.dep_, token.ent_type_)
                word_classes[token.lemma_].add(features)
    return word_classes

wcl1_model = build_wcl1(train_texts, train_ann)

In [4]:
def parse_ann(ann_text):
    entities = []
    for line in ann_text.split('\n'):
        if not line.startswith('T'):
            continue  # Пропускаем строки, не начинающиеся с 'T'
        parts = line.strip().split('\t')
        if len(parts) < 3:
            continue  # Пропускаем некорректные строки
        entity_info = parts[1].split()
        if len(entity_info) < 3:
            continue  # Пропускаем строки с недостаточными данными
        label = entity_info[0]
        # Обработка позиций с разделителями (например, '1633;1652')
        start_end = entity_info[1].replace(',', ';').split(';')
        if len(start_end) < 2:
            continue  # Пропускаем некорректные позиции
        try:
            start = int(start_end[0])
            end = int(start_end[-1])
        except ValueError:
            continue  # Пропускаем строки с нечисловыми позициями
        text = parts[2].strip()
        entities.append((start, end, label, text))
    return entities

def build_wcl3(texts, annotations):
    field_lattices = {
        "DEFINIENDUM": defaultdict(set),
        "DEFINITOR": defaultdict(set),
        "DEFINIENS": defaultdict(set)
    }
    for text, ann in zip(texts, annotations):
        doc = nlp(text)
        for ent in ann:
            # Проверяем, что аннотация содержит достаточно данных
            parts = ent.split('\t')
            if len(parts) < 3:
                continue
            entity_info = parts[1].split()
            if len(entity_info) < 3:
                continue
            label = entity_info[0]
            start_end = entity_info[1].replace(',', ';').split(';')
            if len(start_end) < 2:
                continue
            try:
                start = int(start_end[0])
                end = int(start_end[-1])
            except ValueError:
                continue
            term = parts[2].strip()
            # Далее обработка как в оригинальном коде
            span = doc.char_span(start, end, alignment_mode='contract')
            if span and label in field_lattices:
                for token in span:
                    features = (token.pos_, token.dep_)
                    field_lattices[label][token.lemma_].add(features)
    return field_lattices

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

def extract_features(text, wcl_model):
    doc = nlp(text)
    features = []
    for token in doc:
        lemma = token.lemma_
        class_info = wcl_model.get(lemma, set())
        features.append({
            "pos": token.pos_,
            "is_medical_ent": token.ent_type_ == "DISORDER",
            "class_size": len(class_info),
            **{f"feature_{i}": 1 if f in class_info else 0 for i, f in enumerate(["NOUN", "VERB", "ADJ"])}
        })
    return features

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


X_train = [extract_features(text, wcl1_model) for text in train_texts]
y_train = [[1 if any(token.text in a for a in ann) else 0 for token in nlp(text)] for text, ann in zip(train_texts, train_ann)]


X_test = [extract_features(text, wcl1_model) for text in test_texts]
y_test = [[1 if any(token.text in a for a in ann) else 0 for token in nlp(text)] for text, ann in zip(test_texts, test_ann)]


# Преобразование признаков в числовые векторы
vectorizer = DictVectorizer(sparse=False)
X_train_flat = [feature for text_features in X_train for feature in text_features]
X_train_vec = vectorizer.fit_transform(X_train_flat)

# Преобразование целевой переменной в плоский список
y_train_flat = [label for text_labels in y_train for label in text_labels]

# Обучение модели
clf = RandomForestClassifier()
clf.fit(X_train_vec, y_train_flat)

# Обработка тестовых данных
X_test_flat = [feature for text_features in X_test for feature in text_features]
X_test_vec = vectorizer.transform(X_test_flat)
y_test_flat = [label for text_labels in y_test for label in text_labels]

# Предсказание и оценка
y_pred = clf.predict(X_test_vec)
print(classification_report(y_test_flat, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.54      0.63      6397
           1       0.80      0.92      0.86     13111

    accuracy                           0.80     19508
   macro avg       0.79      0.73      0.74     19508
weighted avg       0.79      0.80      0.78     19508



In [16]:
def parse_ann_with_fields(ann):
    """Парсинг аннотаций с разделением на три поля."""
    fields = {
        "DEFINIENDUM": [],
        "DEFINITOR": [],
        "DEFINIENS": []
    }
    for line in ann:
        if not line.startswith("T"):
            continue
        parts = line.split('\t')
        if len(parts) < 3:
            continue
        entity_info = parts[1].split()
        if len(entity_info) < 3:
            continue
        label = entity_info[0]
        start_end = entity_info[1].replace(',', ';').split(';')
        if len(start_end) < 2:
            continue
        try:
            start = int(start_end[0])
            end = int(start_end[-1])
            text = parts[2].strip()
            # Эвристика: распределение по полям на основе типа сущности
            if label == "Disease":
                fields["DEFINIENDUM"].append((start, end, text))
            elif label == "Treatment":
                fields["DEFINITOR"].append((start, end, text))
            else:
                fields["DEFINIENS"].append((start, end, text))
        except:
            continue
    return fields

In [6]:
def build_wcl3(texts, annotations):
    field_lattices = {
        "DEFINIENDUM": defaultdict(set),
        "DEFINITOR": defaultdict(set),
        "DEFINIENS": defaultdict(set)
    }

    for text, ann in zip(texts, annotations):
        doc = nlp(text)
        fields = parse_ann_with_fields(ann)

        for field_name, entities in fields.items():
            for start, end, entity_text in entities:
                span = doc.char_span(start, end, alignment_mode='contract')
                if span:
                    for token in span:
                        lemma = token.lemma_.lower()
                        features = (token.pos_, token.dep_)
                        field_lattices[field_name][lemma].add(features)

    return field_lattices

In [7]:
def extract_features_wcl3(text, wcl3_model):
    doc = nlp(text)
    features = []
    for token in doc:
        lemma = token.lemma_.lower()
        field_features = {}
        for field in ["DEFINIENDUM", "DEFINITOR", "DEFINIENS"]:
            class_info = wcl3_model[field].get(lemma, set())
            field_features.update({
                f"{field}_pos": token.pos_,
                f"{field}_class_size": len(class_info),
                **{f"{field}_feat_{i}": 1 if f in class_info else 0
                   for i, f in enumerate(["NOUN", "VERB", "ADJ"])}
            })
        features.append(field_features)
    return features

In [9]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Обучение и оценка WCL-3
# Построение модели
wcl3_model = build_wcl3(train_texts, train_ann)

# Подготовка данных
X_train_wcl3 = [extract_features_wcl3(text, wcl3_model) for text in train_texts]
X_test_wcl3 = [extract_features_wcl3(text, wcl3_model) for text in test_texts]

# Векторизация
vectorizer_wcl3 = DictVectorizer(sparse=False)
X_train_wcl3_flat = [feature for text_features in X_train_wcl3 for feature in text_features]
X_train_wcl3_vec = vectorizer_wcl3.fit_transform(X_train_wcl3_flat)


In [10]:
def assign_field(entity_text, label):
    if "diagnosis" in entity_text.lower() or label == "Disease":
        return "DEFINIENDUM"
    elif "treatment" in entity_text.lower() or label == "Drug":
        return "DEFINITOR"
    else:
        return "DEFINIENS"

In [11]:
def extract_features_wcl3(text, wcl3_model):
    doc = nlp(text)
    features = []
    for i, token in enumerate(doc):
        lemma = token.lemma_.lower()
        field_features = {}
        prev_token = doc[i-1] if i > 0 else None
        next_token = doc[i+1] if i < len(doc)-1 else None

        for field in ["DEFINIENDUM", "DEFINITOR", "DEFINIENS"]:
            class_info = wcl3_model[field].get(lemma, set())
            field_features.update({
                f"{field}_prev_pos": prev_token.pos_ if prev_token else "START",
                f"{field}_next_pos": next_token.pos_ if next_token else "END",
                f"{field}_is_capitalized": token.text.istitle(),
                f"{field}_prefix": token.text[:3],
                f"{field}_suffix": token.text[-3:],
            })
        features.append(field_features)
    return features

In [17]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_wcl3_balanced, y_train_balanced = smote.fit_resample(X_train_wcl3_vec, y_train_flat)

clf_wcl3 = RandomForestClassifier(
    class_weight={0: 1.5, 1: 1},  # Увеличение веса класса 0
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    random_state=42
)
clf_wcl3.fit(X_train_wcl3_balanced, y_train_balanced)

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5]
}

grid_search = GridSearchCV(
    RandomForestClassifier(class_weight='balanced'),
    param_grid,
    cv=3,
    scoring='f1_weighted'
)
grid_search.fit(X_train_wcl3_balanced, y_train_balanced)
clf_wcl3 = grid_search.best_estimator_

In [21]:
# Предсказание
X_test_wcl3_flat = [feature for text_features in X_test_wcl3 for feature in text_features]
X_test_wcl3_vec = vectorizer_wcl3.transform(X_test_wcl3_flat)

y_pred_wcl3 = clf_wcl3.predict(X_test_wcl3_vec)

In [None]:
print("\nWCL-3 Результаты:")
print(classification_report(y_test_flat, y_pred_wcl3))

WCL-3 Результаты:
                  
                  precision    recall  f1-score   support

           0           0.66      0.71      0.69      6822
           1           0.85      0.80      0.84     14358

    accuracy                               0.82     21180
    macro avg          0.79      0.76      0.78     21180
    weighted avg       0.87      0.87      0.87     21180

In [8]:
import random
from sklearn.utils import resample

# Функция генерации перестановок
def generate_permutations(text, k=8):
    doc = nlp(text)
    sents = [sent.text for sent in doc.sents]
    permutations = []

    for _ in range(k):
        if len(sents) > 1:  # Перемешиваем только тексты с >1 предложением
            shuffled = random.sample(sents, len(sents))
        else:
            shuffled = sents  # Для коротких текстов оставляем как есть
        permutations.append(" ".join(shuffled))
    return permutations

In [9]:
def adapt_annotations(orig_text, perm_text, ann):
    orig_doc = nlp(orig_text)
    perm_doc = nlp(perm_text)

    new_ann = []
    for ent_line in ann:
        parts = ent_line.split('\t')
        if len(parts) < 3:
            continue  # Пропускаем строки с некорректным форматом

        # Обработка компонентов аннотации
        entity_id = parts[0]
        entity_info = parts[1].split()
        entity_text = parts[2].strip()

        if len(entity_info) < 3:
            continue  # Пропускаем строки без позиций

        label = entity_info[0]
        start_end = entity_info[1].replace(',', ';').split(';')
        if len(start_end) < 2:
            continue  # Пропускаем некорректные позиции

        try:
            start = int(start_end[0])
            end = int(start_end[-1])
        except ValueError:
            continue  # Пропускаем нечисловые значения

        # Поиск термина в переставленном тексте
        term_text = entity_text
        start_idx = perm_text.find(term_text)
        if start_idx != -1:
            new_ann.append(f"{entity_id}\t{label} {start_idx} {start_idx+len(term_text)}\t{term_text}")

    return new_ann

In [10]:
# Расширение датасета с аугментацией
augmented_train_texts = []
augmented_train_ann = []

for text, ann in zip(train_texts, train_ann):
    perms = generate_permutations(text, k=8)
    for perm in perms:
        adapted_ann = adapt_annotations(text, perm, ann)
        augmented_train_texts.append(perm)
        augmented_train_ann.append(adapted_ann)

# Объединение с исходными данными
combined_train_texts = train_texts + augmented_train_texts
combined_train_ann = train_ann + augmented_train_ann

In [11]:
# Перестройка WCL-3 с аугментированными данными
wcl3_model_aug = build_wcl3(combined_train_texts, combined_train_ann)

In [12]:
# Обучение на расширенном датасете
X_train_wcl3_aug = [extract_features_wcl3(text, wcl3_model_aug) for text in combined_train_texts]
y_train_aug = [[1 if any(token.text in a for a in ann) else 0 for token in nlp(text)]
              for text, ann in zip(combined_train_texts, combined_train_ann)]

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Векторизация и балансировка
vectorizer_aug = DictVectorizer(sparse=False)
X_train_aug_flat = [feature for text_features in X_train_wcl3_aug for feature in text_features]
X_train_aug_vec = vectorizer_aug.fit_transform(X_train_aug_flat)
y_train_aug_flat = [label for text_labels in y_train_aug for label in text_labels]

In [None]:
# Обучение с оптимизированными параметрами
clf_aug = RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    class_weight={0: 1.8, 1: 1},
    min_samples_split=3,
    random_state=42
)
clf_aug.fit(X_train_aug_vec, y_train_aug_flat)

In [None]:
# Оценка
X_test_aug = [extract_features_wcl3(text, wcl3_model_aug) for text in test_texts]
X_test_aug_flat = [feature for text_features in X_test_aug for feature in text_features]
X_test_aug_vec = vectorizer_aug.transform(X_test_aug_flat)

y_pred_aug = clf_aug.predict(X_test_aug_vec)
print("WCL-3 + Permutation-randomk Результаты:")
print(classification_report(y_test_flat, y_pred_aug))

WCL-3 + Permutation-randomk Результаты::
                  
                  precision    recall  f1-score   support

           0           0.73      0.77      0.75      6822
           1           0.90      0.84      0.87     14358

    accuracy                               0.85     21180
    macro avg          0.74      0.74      0.74     21180
    weighted avg       0.85      0.85      0.85     21180