In [21]:
!pip install simpletransformers



In [107]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from simpletransformers.ner import NERModel,NERArgs
import re

In [23]:
df = pd.read_csv('/kaggle/input/fixed-tags-for-ner/fixed_csv_for_ner.csv').drop('Unnamed: 0', axis=1)

In [77]:

def transform_dataframe(df):
    # Преобразуем столбцы 'Text' и 'Tags' к строке, обрабатывая NaN
    df['Text'] = df['Text'].fillna('').astype(str)
    df['Tags'] = df['Tags'].fillna('').astype(str)

    # Создаем пустой DataFrame для результата
    result_df = pd.DataFrame(columns=['sentence_id', 'words', 'labels'])

    # Номер предложения
    sentence_id = 0

    # Итерация по строкам DataFrame
    for index, row in tqdm(df.iterrows()):
        text = row['Text'].split()  # разбиваем текст на слова
        tags = row['Tags'].split()  # разбиваем теги на список тегов

        # Убедимся, что количество слов соответствует количеству тегов
        if len(text) != len(tags):
            print(f"Warning: Mismatch at row {index} with text '{row['Text']}' and tags '{row['Tags']}'")
            continue  # Пропускаем текущую запись

        temp_df = pd.DataFrame({
            'sentence_id': [sentence_id] * len(text),
            'words': text,
            'labels': tags
        })

        # Добавляем временный DataFrame в итоговый
        result_df = pd.concat([result_df, temp_df], ignore_index=True)

        # Увеличиваем номер предложения
        sentence_id += 1

    return result_df


transformed_df = transform_dataframe(df)
transformed_df

13001it [06:58, 18.19it/s]



16572it [10:58, 25.17it/s]


Unnamed: 0,sentence_id,words,labels
0,0,фокус,"['O',"
1,0,недели,"'O',"
2,0,фн,"'O',"
3,0,сегодня,"'O',"
4,0,ммк,"'ORG',"
...,...,...,...
1367155,16570,марта,"'O',"
1367156,16570,м,"'O',"
1367157,16570,видео,"'O',"
1367158,16570,мсфо,"'O',"


In [25]:
label = transformed_df["labels"].unique().tolist()

args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 32
args.eval_batch_size = 32

In [26]:
label

["['O',", "'O',", "'ORG',", "'O']", "['ORG',", "'ORG']", "['O']"]

In [27]:
model = NERModel('bert', 'ai-forever/ruBert-base',labels=label, args=args)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at ai-forever/ruBert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
X = transformed_df[['sentence_id', 'words']]
y = transformed_df['labels']

train_data, test_data = train_test_split(transformed_df, test_size=0.2)

In [29]:
model.train_model(train_data, eval_data = test_data, acc=accuracy_score)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/518 [00:00<?, ?it/s]

(518, 0.12672990950320678)

In [30]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/5 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/500 [00:00<?, ?it/s]



In [31]:
result

{'eval_loss': 0.11346666055172681,
 'precision': 0.6566566058792198,
 'recall': 0.6735171696149844,
 'f1_score': 0.6649800300512413}

In [32]:
prediction, model_output = model.predict(["vk совершает сделку с yandex, после чего вк решили, что стоит и покупать дзен, а после этого руснефть приняла решение создать новые облегации"])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [33]:
prediction

[[{'vk': "'ORG',"},
  {'совершает': "'O',"},
  {'сделку': "'O',"},
  {'с': "'O',"},
  {'yandex,': "'ORG',"},
  {'после': "'O',"},
  {'чего': "'O',"},
  {'вк': "'ORG',"},
  {'решили,': "'O',"},
  {'что': "'O',"},
  {'стоит': "'O',"},
  {'и': "'O',"},
  {'покупать': "'O',"},
  {'дзен,': "'O',"},
  {'а': "'O',"},
  {'после': "'O',"},
  {'этого': "'O',"},
  {'руснефть': "'ORG',"},
  {'приняла': "'O',"},
  {'решение': "'O',"},
  {'создать': "'O',"},
  {'новые': "'O',"},
  {'облегации': "'O',"}]]

In [34]:
model.save_model(output_dir='/kaggle/working/', model=model.model)

In [40]:
df_id = pd.read_excel('/kaggle/input/12345678/names and synonyms.xlsx')
df_id = df_id[['issuerid', 'Unnamed: 11']]

In [41]:
df_id

Unnamed: 0,issuerid,Unnamed: 11
0,1,Держава
1,2,Московский кредитный банк
2,3,Российский акционерный коммерческий дорожный банк
3,4,алроса
4,5,Авангард
...,...,...
250,270,Хендерсон
251,271,Sovcombank
252,272,АЗС Трасса
253,273,Делимобил


In [58]:
org_words = []
for sentence in prediction:
    for word_dict in sentence:
        for word, tag in word_dict.items():
            if "'ORG'," in tag:  # Проверяем, содержится ли тег 'ORG'
                org_words.append(word)  # Добавляем слово в список
org_words

['vk', 'yandex,', 'вк', 'руснефть']

In [109]:
def clean_text(text):
    return re.sub(r'[^\w\s]', '', text).lower()
org_words = [clean_text(word) for word in org_words]
org_words

['vk', 'yandex', 'вк', 'руснефть']

In [100]:
df_id = pd.read_excel('/kaggle/input/12345678/names and synonyms.xlsx')

In [103]:
fixed_ner_first = df_id['Unnamed: 11']
df_id['issuerid'] = df_id['issuerid'].apply(lambda x: str(x))
all_new = df_id[['Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'issuerid']]
df_id = pd.concat([fixed_ner_first, all_new], axis=1)
df_id

Unnamed: 0,Unnamed: 11,Unnamed: 11.1,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11.2,Unnamed: 11.3,issuerid
0,Держава,Держава,Держава,DERZHAVA,DERZ,,,,Держава,Держава,1
1,Московский кредитный банк,Московский кредитный банк,Московский кредитный банк,мкб,Credit Bank of Moscow,Credit Bank,,,Московский кредитный банк,Московский кредитный банк,2
2,Российский акционерный коммерческий дорожный банк,Российский акционерный коммерческий дорожный банк,Российский акционерный коммерческий дорожный банк,РДБанк,Дорожный банк,Russian public joint-stock commercial roads Bank,RosDorBank,РосДорБанк,Российский акционерный коммерческий дорожный банк,Российский акционерный коммерческий дорожный банк,3
3,алроса,алроса,алроса,alrosa,,,,,алроса,алроса,4
4,Авангард,Авангард,Авангард,AVANGARD,,,,,Авангард,Авангард,5
...,...,...,...,...,...,...,...,...,...,...,...
250,Хендерсон,Хендерсон,Хендерсон,,,,,,Хендерсон,Хендерсон,270
251,Sovcombank,Sovcombank,Sovcombank,,,,,,Sovcombank,Sovcombank,271
252,АЗС Трасса,АЗС Трасса,АЗС Трасса,"АЗС ""Трасса""",АЗС «Трасса»,,,,АЗС Трасса,АЗС Трасса,272
253,Делимобил,Делимобил,Делимобил,Каршеринг Рус,delimobil,"ПАО ""Каршеринг Руссия""","""Каршеринг Руссия"", ПАО",,Делимобил,Делимобил,273


In [None]:
def remove_nans(values_list):
    return [value.lower() for value in values_list if not pd.isna(value)]

data = {}
for i in range(len(df_id.values)):
    data[df_id.values[i][-1]] = list(set(df_id.values[i]))

for key in data.keys():
    data[key] = remove_nans(data[key])

In [None]:
data

In [110]:
org_keys = {word: set() for word in org_words}

for word in org_words:
    for key, names in data.items():
        if word in names:
            org_keys[word].add(key)

print(org_keys)

{'vk': {'223'}, 'yandex': {'236'}, 'вк': {'223'}, 'руснефть': set()}


In [72]:
df_id[df_id['issuerid'] == '236']

Unnamed: 0,Unnamed: 11,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11.1,issuerid
235,Яндекс,Яндекс,Yandex,,,,,Яндекс,236


In [2]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.0-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m612.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.33.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.8.1b0-py2.py3-none-any.whl.metadata (3.9 kB)
Collecting watchdog>=2.1.5 (from streamlit->simpletransformers)
  Downloading watchdog-4.0.0-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Downloading simpletransformers-0.70.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
from simpletransformers.ner import NERModel


model = NERModel(
    model_type='bert',
    model_name='/kaggle/input/fin-mid/model'  
)




2024-04-14 02:52:53.653025: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-14 02:52:53.653132: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-14 02:52:53.810832: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
prediction, model_output = model.predict(["vk совершает сделку с yandex, после чего вк решили, что стоит и покупать дзен, а после этого руснефть приняла решение создать новые облегации"])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
import re
prediction
org_words = []
for sentence in prediction:
    for word_dict in sentence:
        for word, tag in word_dict.items():
            if "'ORG'," in tag:  
                org_words.append(word) 
org_words
def clean_text(text):
    return re.sub(r'[^\w\s]', '', text).lower()
org_words = [clean_text(word) for word in org_words]
org_words

['vk', 'yandex', 'вк', 'руснефть']

In [12]:
import pandas as pd
df_id = pd.read_excel('/kaggle/input/hahahaha/names and synonyms.xlsx')
fixed_ner_first = df_id['Unnamed: 11']
df_id['issuerid'] = df_id['issuerid'].apply(lambda x: str(x))
all_new = df_id[['Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'issuerid']]
df_id = pd.concat([fixed_ner_first, all_new], axis=1)
df_id

Unnamed: 0,Unnamed: 11,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11.1,issuerid
0,,Держава,DERZHAVA,DERZ,,,,,1
1,,Московский кредитный банк,мкб,Credit Bank of Moscow,Credit Bank,,,,2
2,roads Bank,Российский акционерный коммерческий дорожный банк,РДБанк,Дорожный банк,Russian public joint-stock commercial roads Bank,RosDorBank,РосДорБанк,roads Bank,3
3,,алроса,alrosa,,,,,,4
4,,Авангард,AVANGARD,,,,,,5
...,...,...,...,...,...,...,...,...,...
250,,Хендерсон,,,,,,,270
251,,Sovcombank,,,,,,,271
252,,АЗС Трасса,"АЗС ""Трасса""",АЗС «Трасса»,,,,,272
253,,Делимобил,Каршеринг Рус,delimobil,"ПАО ""Каршеринг Руссия""","""Каршеринг Руссия"", ПАО",,,273


In [13]:
def remove_nans(values_list):
    return [value.lower() for value in values_list if not pd.isna(value)]

data = {}
for i in range(len(df_id.values)):
    data[df_id.values[i][-1]] = list(set(df_id.values[i]))

for key in data.keys():
    data[key] = remove_nans(data[key])

In [19]:
import re
def get_org_from_text(text):
    prediction, model_output = model.predict([text])
    org_words = []
    for sentence in prediction:
        for word_dict in sentence:
            for word, tag in word_dict.items():
                if "'ORG'," in tag:  
                    org_words.append(word) 
    def clean_text(text):
        return re.sub(r'[^\w\s]', '', text).lower()
    org_words = [clean_text(word) for word in org_words]
    return org_words

In [20]:
get_org_from_text('Роснефть (ROSN) справедливая цена 425.17 рубля, потенциал роста на 23%. #сборник  1. Роснефть увеличила добычу на 3% в 2022 году, планирует увеличить ещё на 5% в 2023 https://t.me/AK47pfl/14309 2. Саудовская Аравия повышает цены реализации по нефти для всех регионов https://t.me/AK47pfl/14311 3. Нефть и газ зарабатывают много денег: BP решили увеличить инвестиции в добычу нефти https://t.me/AK47pfl/14313 4. Российская нефть проходит ограничения лучше прогнозов https://t.me/AK47pfl/14303 5. Цены реализации российской нефти, похоже, выше, чем принято считать https://t.me/AK47pfl/14304  @AK47pfl')

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

['роснефть', 'роснефть']

In [18]:
data
import pickle
with open('my_dict.pkl', 'wb') as f:
    pickle.dump(data, f)


In [56]:
def get_id_from_text(text):
    org_words = get_org_from_text(text)
    org_keys = {word: set() for word in org_words}

    for word in org_words:
        for key, names in data.items():
            if word in names:
                org_keys[word].add(key)
    answer =[list(v) for v in org_keys.values()]

    answers = [[int(id_str) for id_str in id_list] for id_list in answer]

    return(answers)
mas = get_id_from_text("вк яндекс самосвал мкб")

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [57]:
sas =[]
for m in mas:
    sas.append((m[0],get_sent_for_text('вк яндекс самосвал мкб')))
sas

[(223, 3), (236, 3), (2, 3)]

In [67]:
def get_sent_for_text(text):
    a = text[0]
    return float(3)

In [34]:
get_sent_for_text("get_org_from_text('Роснефть (ROSN) справедливая цена 425.17 рубля, потенциал роста на 23%. #сборник  1. Роснефть увеличила добычу на 3% в 2022 году, планирует увеличить ещё на 5% в 2023 https://t.me/AK47pfl/14309 2. Саудовская Аравия повышает цены реализации по нефти для всех регионов https://t.me/AK47pfl/14311 3. Нефть и газ зарабатывают много денег: BP решили увеличить инвестиции в добычу нефти https://t.me/AK47pfl/14313 4. Российская нефть проходит ограничения лучше прогнозов https://t.me/AK47pfl/14303 5. Цены реализации российской нефти, похоже, выше, чем принято считать https://t.me/AK47pfl/14304  @AK47pfl')")

3

In [68]:
def get_all_from_text(text):
    def get_org_from_text(text):
        prediction, model_output = model.predict([text])
        org_words = []
        for sentence in prediction:
            for word_dict in sentence:
                for word, tag in word_dict.items():
                    if "'ORG'," in tag:  
                        org_words.append(word) 
        def clean_text(text):
            return re.sub(r'[^\w\s]', '', text).lower()
        org_words = [clean_text(word) for word in org_words]
        return org_words
    def get_id_from_text(text):
        org_words = get_org_from_text(text)
        org_keys = {word: set() for word in org_words}

        for word in org_words:
            for key, names in data.items():
                if word in names:
                    org_keys[word].add(key)
        answer =[list(v) for v in org_keys.values()]

        answers = [[int(id_str) for id_str in id_list] for id_list in answer]

        return(answers)
    mas = get_id_from_text(text)
    def get_sent_for_text(text):
        a = text[0]
        return float(3)
    sas =[]
    for m in mas:
        sas.append((m[0],get_sent_for_text(text)))
    return(sas)
    
    

In [69]:
get_all_from_text("а я в мкб кредит взял а не в сбер")

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[(2, 3.0), (150, 3.0)]

In [70]:
texts = ["сосня","тиньков","мкб","а я в мкб кредит взял а не в сбер"]

In [72]:
scores =[]
for text in texts:
    scores.append(get_all_from_text(text))
scores

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[], [], [(2, 3.0)], [(2, 3.0), (150, 3.0)]]

In [None]:
def score_text(messages):
    scores = []
    for message in messages:
        scores.append(get_all_from_text(message))
    return scores