In [7]:
! pip install pullenti_client



In [2]:
from pullenti_client import Client
from pullenti_client.referent import Referent, Slot
from pullenti_client.result import Match, Result

import pandas as pd
from typing import Any, Optional, List, Union

class PullentiAnalyzer:
    """Run a text through a ``pullenti_client.Client`` and tabulate the matches.

    The client is called with ``text`` and its result's matches (plus their
    direct children) can be returned raw (``matches``), as slot lists
    (``slots``) or as a cleaned ``pandas.DataFrame`` (``data``).
    """

    # Referent labels accepted by the ``analyzers`` constructor argument.
    PULLENTI_KEYS = {
        "GEO",
        "ORGANIZATION",
        "PERSON"
    }

    # Per-label clean-up rules used by ``data``: "NA" lists the columns that
    # must be present and non-null; the same columns are the de-duplication key.
    GEO_RULES = {"NA": ["name", "type"]}
    ORGANIZATION_RULES = {"NA": ["name"]}
    PERSON_RULES = {"NA": ["firstname"]}

    def __init__(
            self,
            text: str,
            analyzers: Optional[List[str]] = None,
            pullenti_kwargs: Optional[dict] = None
    ):
        """
        Args:
            text: Text that is sent to the Pullenti client.
            analyzers: Labels to work with; every entry must be one of
                ``PULLENTI_KEYS``. ``None`` or an empty list selects all of them.
            pullenti_kwargs: Keyword arguments forwarded to ``Client``.
                ``None`` is treated as "no arguments".
        Raises:
            KeyError: If ``analyzers`` contains a label outside ``PULLENTI_KEYS``.
            RuntimeError: If the client cannot be constructed (the original
                error is chained as ``__cause__``).
        """

        self.text = text
        # Copy the class-level set so instance-side mutation cannot leak into it.
        self.analyzers = analyzers if analyzers else set(self.PULLENTI_KEYS)
        self.pullenti_client = None

        # Check analyzers: each requested label has to be a supported one.
        if analyzers:
            unknown = [key for key in analyzers if key not in self.PULLENTI_KEYS]
            if unknown:
                raise KeyError(
                    f"Unsupported analyzers {unknown}; "
                    f"expected a subset of {sorted(self.PULLENTI_KEYS)}"
                )

        # Init pullenti
        try:
            self.pullenti_client = Client(**(pullenti_kwargs or {}))
        except Exception as exc:
            raise RuntimeError("Failed to initialise the Pullenti client") from exc

    @staticmethod
    def _append_matches(matches: list, match: "Match", analyzer: Optional[str]):
        """Append ``match`` to ``matches`` if no label filter is set or its label equals ``analyzer``.

        Returns the same (mutated) ``matches`` list.
        """
        if not analyzer or match.referent.label == analyzer:
            matches.append(match)
        return matches

    @staticmethod
    def _data_helper(dataframe: pd.DataFrame, rules):
        """Apply clean-up ``rules`` to ``dataframe`` and return a new frame.

        Rows with a null in any ``rules["NA"]`` column are dropped, then columns
        that are entirely null, then rows duplicated on ``rules["NA"]``.
        If a required column is absent, or no row survives the first step, an
        empty frame is returned instead of raising ``KeyError``.
        The input frame is not modified.
        """
        required = rules["NA"]

        # A missing required column means no row can satisfy the rule.
        if any(column not in dataframe.columns for column in required):
            return dataframe.iloc[0:0].copy()

        complete_rows = dataframe.dropna(subset=required, how="any")
        if complete_rows.empty:
            # Dropping all-null columns of an empty frame would also remove the
            # key columns and make drop_duplicates fail, so stop here.
            return complete_rows

        return (
            complete_rows
            .dropna(how="all", axis=1)
            .drop_duplicates(subset=required)
        )

    @staticmethod
    def _row_from_slots(slots, analyzer: Optional[str]) -> dict:
        """Flatten one referent's slots into a row dict keyed by lower-cased slot key."""
        row = {"tag": analyzer}
        for slot in slots:
            key = slot.key.lower()

            if key == "attribute":
                # Attribute values are nested referents: keep their plain slot
                # values only, accumulating over repeated attribute slots.
                collected = row.setdefault("attribute_vars", [])
                nested_slots = getattr(slot.value, "slots", None)
                if nested_slots is None:
                    collected.append(slot.value)
                else:
                    collected.extend(
                        nested.value for nested in nested_slots
                        if not isinstance(nested.value, Referent)
                    )
                continue

            if key not in row:
                # First value seen for a key is the primary one ...
                row[key] = slot.value
            else:
                # ... any further values are collected under "<key>_vars".
                row.setdefault(key + "_vars", []).append(slot.value)
        return row

    def result(self) -> "Result":
        """Call the Pullenti client on ``self.text`` and return its result."""
        return self.pullenti_client(self.text)

    def matches(self, analyzer: Optional[str] = None) -> Optional[List["Match"]]:
        """Return top-level matches and their direct children, optionally filtered by label.

        Only one level of ``children`` is visited.
        """
        matches = []
        for match in self.result().matches:
            self._append_matches(matches, match, analyzer)
            for child_match in match.children:
                self._append_matches(matches, child_match, analyzer)
        return matches

    def slots(self, analyzer: Optional[str] = None) -> Optional[List["Slot"]]:
        """Return the referent slot list of every match selected by ``analyzer``."""
        return [match.referent.slots for match in self.matches(analyzer)]

    def data(self, analyzer: Optional[str] = None) -> Optional[pd.DataFrame]:
        """Tabulate the matches for ``analyzer`` as one row per referent.

        Returns:
            ``None`` when there are no matches; otherwise a DataFrame with a
            ``tag`` column plus one column per lower-cased slot key (repeated
            keys go to ``<key>_vars`` lists). Rows that carry a ``higher`` slot
            are skipped. For GEO / ORGANIZATION / PERSON the matching
            ``*_RULES`` clean-up is applied.
        """
        # Query once: every call to slots() triggers a new client request.
        all_slots = self.slots(analyzer)
        if not all_slots:
            return None

        rows = []
        for slots in all_slots:
            row = self._row_from_slots(slots, analyzer)
            if "higher" not in row:
                rows.append(row)

        dataframe = pd.DataFrame(rows)

        rules_by_label = {
            "GEO": self.GEO_RULES,
            "ORGANIZATION": self.ORGANIZATION_RULES,
            "PERSON": self.PERSON_RULES,
        }
        rules = rules_by_label.get(analyzer)
        if rules is not None:
            dataframe = self._data_helper(dataframe, rules)

        return dataframe

In [6]:
# Keyword arguments that PullentiAnalyzer forwards to pullenti_client.Client.
# NOTE(review): assumes a Pullenti server is reachable at this host/port — confirm.
PULLENTI_CONFIG = {"host": "localhost", "port": 8081}

In [7]:
# Sample Russian-language input text that is passed to PullentiAnalyzer below.
some_text = "Усманов на суде с Навальным предъявил справку об уплате налогов в России. Опровергая свою же пресс-службу"

In [8]:
# The empty analyzers list is falsy, so the instance falls back to all PULLENTI_KEYS.
analyzer = PullentiAnalyzer(some_text, [], PULLENTI_CONFIG)

In [13]:
# Show every match (top-level and direct children) found in `some_text`, unfiltered.
analyzer.matches()
# analyzer.data("ORGANIZATION")
# analyzer.data("PERSON")

[Match(
     referent=Referent(
         label='GEO',
         slots=[Slot(
              key='ALPHA2',
              value='RU'
          ),
          Slot(
              key='NAME',
              value='РФ'
          ),
          Slot(
              key='NAME',
              value='РОССИЙСКАЯ ФЕДЕРАЦИЯ'
          ),
          Slot(
              key='NAME',
              value='РОССИЯ'
          ),
          Slot(
              key='TYPE',
              value='государство'
          )]
     ),
     span=Span(
         start=66,
         stop=72
     ),
     children=[]
 )]

In [9]:
# Tabulate the GEO entities of `some_text`, using the `analyzer` instance created above.
data = analyzer.data("GEO")

In [16]:
import uuid

# Generate a random (version 4) UUID.
# NOTE(review): presumably a candidate format for the *_id keys sketched in the schema notes below — confirm.
myuuid = uuid.uuid4()

# Last expression of the cell: display the UUID in its string form.
str(myuuid)

'9b1a1b47-a758-42f4-b446-da808e34d966'

In [17]:
# article: article_id|rima_article_id|title|plain_text|published_dt 
# entity: entity_id|tag|name / [first name + last name] |name_vars|type|type_vars

# entity_attribute entity_id|key[lastname|firstname|middlename|sex|attribute_vars]|value

# article_x_entity_person: article_id|entity_id