In [5]:
!pip install gliner

Collecting gliner
  Downloading gliner-0.2.13-py3-none-any.whl.metadata (7.3 kB)
Collecting onnxruntime (from gliner)
  Downloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting coloredlogs (from onnxruntime->gliner)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime->gliner)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading gliner-0.2.13-py3-none-any.whl (47 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.20.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (13.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m94.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import json

def extract_relation_labels_from_file(file_path):
    """Collect the distinct values stored under ``names`` in a JSON-lines file.

    Every line of ``file_path`` is parsed as its own JSON document. Lines that
    are blank, are not valid JSON, are not JSON objects, or have no usable
    ``names`` value are skipped instead of aborting the whole scan.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A list of the unique ``names`` items in first-seen order, so repeated
        runs over the same file yield the same ordering.
    """
    # A dict is used as an ordered set: keys keep insertion order, whereas a
    # plain set of strings is ordered differently from one interpreter run to
    # the next.
    seen = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                continue
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A valid JSON line may still be a list/number/string; only
            # objects can carry a "names" key.
            if not isinstance(entry, dict):
                continue
            # `or []` also covers an explicit `"names": null`.
            for name in entry.get('names') or []:
                seen.setdefault(name, None)

    return list(seen)

# File that is parsed line by line as JSON to gather the relation label names.
file_path = 'val_wiki-2.json'
# Kept as a module-level name: MultiTaskProcessor._process_relation_extraction
# reads the global `relation_labels`, so this cell must run before that task.
relation_labels = extract_relation_labels_from_file(file_path)
print(relation_labels)


['subject has object as biological, foster, and/or adoptive child', 'military rank achieved by a person (should usually have a "start time" qualifier), or military rank associated with a position', 'sport', 'official classification by a regulating body under which the subject (events, teams, participants, or equipment) qualifies for inclusion', "person's voice type. expected values: soprano, mezzo-soprano, contralto, countertenor, tenor, baritone, bass (and derivatives)", 'located in or next to body of water', 'position played on team / speciality', 'original language of film or TV show', 'primary topic of a work (see also P180: depicts)', 'sea, lake or river', 'the area of the celestial sphere of which the subject is a part (from a scientific standpoint, not an astrological one)', 'part of', 'position or specialism of a player on a team, e.g. Small Forward', 'main subject', 'organization or club to which the subject belongs. Do not use for membership in ethnic or social groups, nor fo

In [None]:
from gliner import GLiNER
from datasets import load_dataset
import torch
import json
from tqdm import tqdm
from datetime import datetime


class MultiTaskProcessor:
    """Run several prompt-style tasks through one GLiNER model and save the results.

    Each task handler passes text to ``self.model.predict_entities`` together
    with a task-specific label list and collects the returned spans.
    ``process`` is the public entry point: it dispatches on the task name and
    writes the handler's output to a timestamped JSON file.
    """

    def __init__(self, model_name="knowledgator/gliner-multitask-large-v0.5", device=None, relation_labels=None):
        """Load the model and register the task handlers.

        Args:
            model_name: Identifier passed to ``GLiNER.from_pretrained``.
            device: Device to move the model to. When omitted, CUDA is used if
                ``torch.cuda.is_available()`` and the CPU otherwise.
            relation_labels: Optional label list for the relation-extraction
                task. When omitted, that task falls back to the module-level
                ``relation_labels`` variable.
        """
        self.device = device or torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = GLiNER.from_pretrained(model_name).to(self.device)
        self.relation_labels = relation_labels
        self.tasks = {
            "ner": self._process_ner,
            "relation_extraction": self._process_relation_extraction,
            "summarize": self._process_summarize,
            "open_extraction": self._process_open_extraction,
            "question_answer": self._process_question_answer,
            "sentiment_analysis": self._sentiment_analysis,
        }

    def _save_results(self, results, task_name):
        """Dump ``results`` as indented JSON to ``<task_name>_results_<timestamp>.json``.

        The file name is relative, so the file is created in the current
        working directory. The timestamp has one-second resolution.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        file_name = f"{task_name}_results_{timestamp}.json"

        with open(file_name, "w", encoding="utf-8") as file:
            json.dump(results, file, indent=2)

    def _process_ner(self, dataset, threshold=0.5):
        """Predict entities for every entry's ``tokenized_text``.

        Args:
            dataset: Sized iterable of mappings with a ``tokenized_text`` token list.
            threshold: Value forwarded to ``predict_entities``.

        Returns:
            Dict mapping the entry index to the list returned by the model.
        """
        labels = ["Person", "Country", "Location", "City", "Event", "Organization", 'State']
        results = {}

        with tqdm(dataset, desc="Processing NER") as pbar:
            for idx, entry in enumerate(pbar):
                text = " ".join(entry["tokenized_text"])
                results[idx] = self.model.predict_entities(text, labels, flat_ner=False, threshold=threshold)

        return results

    def _process_relation_extraction(self, dataset, threshold=0.5):
        """Predict relation labels for the head/tail pair on each line of a JSON-lines file.

        Args:
            dataset: Path of a UTF-8 file with one JSON object per line; each
                object is read for ``tokens``, ``head`` and ``tail`` (the
                latter two for their ``text`` field).
            threshold: Value forwarded to ``predict_entities``.

        Returns:
            Dict mapping ``"<head> <> <label> <> <tail>"`` to a list of
            ``{'head', 'tail', 'text'}`` records, one per prediction.
        """
        # Prefer labels supplied at construction time; otherwise use the
        # notebook-level `relation_labels` global, as the original code did.
        labels = self.relation_labels if self.relation_labels is not None else relation_labels
        results = {}

        with open(dataset, 'r', encoding='utf-8') as f:
            # Wrapping the file in the bar advances the counter for every line,
            # including the ones that fail below.
            with tqdm(f, desc="Processing Relation Extraction") as pbar:
                for line in pbar:
                    try:
                        entry = json.loads(line.strip())
                        tokens = entry.get("tokens", [])
                        head_name = entry.get("head", {}).get("text", None)
                        tail_name = entry.get("tail", {}).get("text", None)

                        # Lines missing any of the three pieces are skipped.
                        if not (head_name and tail_name and tokens):
                            continue

                        input_text = f"Identify the relation between '{head_name}' and '{tail_name}' in the context of the sentence: " + " ".join(tokens)
                        predictions = self.model.predict_entities(input_text, flat_ner=False, labels=labels, threshold=threshold)
                        for relation in predictions:
                            relation_label = relation.get("label")
                            if not relation_label:
                                continue
                            relation_label_str = f"{head_name} <> {relation_label} <> {tail_name}"
                            results.setdefault(relation_label_str, []).append(
                                {'head': head_name, 'tail': tail_name, 'text': input_text}
                            )
                    # Best effort by design: a bad line is reported and the
                    # scan continues with the next one.
                    except json.JSONDecodeError as e:
                        print(f"Ошибка при обработке строки JSON: {e}")
                    except Exception as e:
                        print(f"Общая ошибка: {e}")

        return results

    def _process_summarize(self, dataset, threshold=0.5):
        """Extract ``summary`` spans from every entry's ``article``.

        Args:
            dataset: Sized iterable of mappings with an ``article`` string.
            threshold: Value forwarded to ``predict_entities``.

        Returns:
            Dict mapping the entry index to the list of predicted span texts.
        """
        labels = ['summary']
        prompt = "Summarize the given text, highlighting the most important information:\n"
        results = {}

        with tqdm(dataset, desc="Processing Summarization") as pbar:
            for idx, entry in enumerate(pbar):
                input_text = prompt + entry["article"]
                summaries = self.model.predict_entities(input_text, labels=labels, threshold=threshold)
                results[idx] = [summary["text"] for summary in summaries]

        return results

    @staticmethod
    def _best_answer_text(answers):
        """Return the text of the highest-scoring span, or ``""`` when there is none.

        Spans without a ``score`` count as 0.0; on ties the earliest span in
        the list wins.
        """
        if not answers:
            return ""
        return max(answers, key=lambda answer: answer.get("score", 0.0))["text"]

    def _process_question_answer(self, dataset, threshold=0.5):
        """Predict one answer string per question.

        Args:
            dataset: Sized iterable of mappings with ``id``, ``question`` and
                ``context``.
            threshold: Value forwarded to ``predict_entities``.

        Returns:
            List of ``{"id", "prediction_text"}`` dicts; ``prediction_text`` is
            empty when the model returns no span.
        """
        predictions = []

        with tqdm(dataset, desc="Processing Question Answering") as pbar:
            for entry in pbar:
                input_text = entry["question"] + " " + entry["context"]
                answers = self.model.predict_entities(input_text, labels=["answer"], threshold=threshold)

                predictions.append({
                    "id": entry["id"],
                    # Take the best answer (highest score) or an empty string.
                    "prediction_text": self._best_answer_text(answers),
                })

        return predictions

    def _sentiment_analysis(self, dataset, threshold=0.5):
        """Predict ``positive`` / ``negative`` spans for every entry's ``text``.

        Args:
            dataset: Sized iterable of mappings with a ``text`` string.
            threshold: Value forwarded to ``predict_entities``.

        Returns:
            Dict mapping the entry index to ``{"text", "sentiment"}`` where
            ``sentiment`` is a list of ``{"label", "score"}`` dicts. Blank
            texts and entries whose prediction raised get an empty list.
        """
        labels = ["positive", "negative"]
        results = {}

        with tqdm(dataset, desc="Processing Sentiment Analysis") as pbar:
            for idx, entry in enumerate(pbar):
                text = entry["text"]
                # Whitespace-only text is never sent to the model.
                if not text.strip():
                    results[idx] = {"text": text, "sentiment": []}
                    continue
                try:
                    sentiment_result = self.model.predict_entities(
                        text, labels=labels, flat_ner=False, threshold=threshold
                    )
                    sentiment = [
                        {"label": entity["label"], "score": entity["score"]}
                        for entity in sentiment_result
                    ]
                except Exception as error:
                    # Still best effort, but report the failure instead of
                    # hiding it behind an empty result.
                    print(f"Sentiment analysis failed for entry {idx}: {error}")
                    sentiment = []
                results[idx] = {"text": text, "sentiment": sentiment}

        return results

    def _process_open_extraction(self, dataset, threshold=0.5):
        """Extract ``match`` spans answering a fixed "positive aspects" prompt.

        Args:
            dataset: Sized iterable of mappings with a ``text`` string.
            threshold: Value forwarded to ``predict_entities``.

        Returns:
            Dict mapping the entry index to the list of predicted span texts.
        """
        labels = ["match"]
        prompt = "Find all positive aspects about the product:\n"
        results = {}

        with tqdm(dataset, desc="Processing Positive Aspects") as pbar:
            for idx, entry in enumerate(pbar):
                input_text = prompt + entry["text"]
                matches = self.model.predict_entities(input_text, labels=labels, threshold=threshold)
                results[idx] = [match["text"] for match in matches]

        return results

    def process(self, task_name, dataset, threshold=0.5):
        """Run one registered task, save its output to JSON and return it.

        Args:
            task_name: Key of ``self.tasks`` selecting the handler.
            dataset: Input passed through to the handler unchanged.
            threshold: Value passed through to the handler.

        Returns:
            Whatever the selected handler returns.

        Raises:
            ValueError: If ``task_name`` is not a registered task.
        """
        if task_name not in self.tasks:
            raise ValueError(f"Invalid task name: {task_name}")

        results = self.tasks[task_name](dataset, threshold)
        self._save_results(results, task_name)

        return results


if __name__ == "__main__":

    # Evaluation splits used as inputs for the individual tasks.
    dataset_sum_keywords = load_dataset('cnn_dailymail', '3.0.0')['validation']
    dataset_qa = load_dataset("rajpurkar/squad_v2")['validation']
    dataset_sment_open = load_dataset("stanfordnlp/imdb")['test']

    processor = MultiTaskProcessor()

    # `process` already writes each result set to a timestamped JSON file, so
    # calling `_save_results` again afterwards would only store every result
    # a second time.
    results_re = processor.process("relation_extraction", "val_wiki-2.json")

    results_qa = processor.process('question_answer', dataset_qa)

    results_sum = processor.process("summarize", dataset_sum_keywords)

    # The IMDB test split feeds both the sentiment and the open-extraction task.
    results_sment = processor.process('sentiment_analysis', dataset_sment_open)

    results_open = processor.process('open_extraction', dataset_sment_open)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/14.5k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

gliner_config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

gliner_multitask_performance.png:   0%|          | 0.00/76.8k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.76G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

