## Made by Shcherbakov Pavel

In [1011]:
import pandas as pd
pd.set_option('display.max_rows', 500)
import natasha as nt
from ipymarkup import show_line_markup
from yargy.tokenizer import MorphTokenizer
from yargy.predicates import gram, eq, normalized
from yargy import Parser, rule
from yargy.pipelines import morph_pipeline

# yargy MorphTokenizer instance.
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
tokenizer = MorphTokenizer()

In [1012]:
# Load the dialog transcripts; later cells read the 'dlg_id', 'line_n', 'role' and 'text' columns.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run elsewhere without editing it.
raw_input = pd.read_csv(r'F:/DataScience/Educational/Собеседования/Bewise/test_data.csv')


In [1013]:
# Unique dialog ids from the 'dlg_id' column.
# Despite its name, num_dialog holds the array of ids (not a count); later cells iterate over it.
num_dialog = raw_input['dlg_id'].unique()
num_dialog

array([0, 1, 2, 3, 4, 5], dtype=int64)

In [1014]:
# Module-level cache: dlg_id -> list of (line index, text) pairs, filled by set_full_dialog
dialog_dict_full = {}
def set_full_dialog(df):
    """Group every utterance of ``df`` by dialog.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'dlg_id' and 'text'.

    Returns
    -------
    dict
        The module-level ``dialog_dict_full``: dlg_id -> list of
        ``(position, text)`` tuples, where ``position`` is the 0-based
        position of the row inside its dialog.
    """
    # Work on the frame that was passed in (the argument used to be ignored in
    # favour of the globals raw_input / num_dialog) and drop entries left over
    # from a previous run so that re-executing the cell is idempotent.
    dialog_dict_full.clear()
    for dlg_id in df['dlg_id'].unique():
        texts = df.loc[df['dlg_id'] == dlg_id, 'text'].values
        # NOTE(review): set_result later compares this position with the
        # 'line_n' column - assumes line_n is 0-based and contiguous per dialog.
        dialog_dict_full[dlg_id] = list(enumerate(texts))
    return dialog_dict_full


In [1015]:
# Module-level cache: dlg_id -> whole dialog as one string, filled by set_full_dialog_str
dialog_dict_full_str = {}
def set_full_dialog_str(df):
    """Concatenate every dialog of ``df`` into a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'dlg_id' and 'text'.

    Returns
    -------
    dict
        The module-level ``dialog_dict_full_str``: dlg_id -> all texts of the
        dialog, in row order, joined with '. '.
    """
    # Use the frame that was passed in (the argument used to be ignored in
    # favour of the globals raw_input / num_dialog) and discard stale entries
    # from earlier runs.
    dialog_dict_full_str.clear()
    for dlg_id in df['dlg_id'].unique():
        texts = df.loc[df['dlg_id'] == dlg_id, 'text'].values
        dialog_dict_full_str[dlg_id] = '. '.join(texts)
    return dialog_dict_full_str


In [1016]:
# We are interested in manager phrases only
manager_greet, manager_bye = {}, {}
def set_manager_help(df, window=3):
    """Collect the opening and closing manager phrases of every dialog.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'dlg_id', 'role' and 'text'.
    window : int, default 3
        How many manager phrases to take from the start (greeting,
        introduction, company and manager name) and from the end (goodbye).

    Returns
    -------
    tuple of dict
        The module-level ``(manager_greet, manager_bye)``: dlg_id -> selected
        phrases joined with '. '. A dialog without manager phrases maps to ''.
    """
    # Use the frame that was passed in (the argument used to be ignored in
    # favour of the globals raw_input / num_dialog) and discard stale entries.
    manager_greet.clear()
    manager_bye.clear()
    is_manager = df['role'] == 'manager'
    for dlg_id in df['dlg_id'].unique():
        phrases = list(df.loc[is_manager & (df['dlg_id'] == dlg_id), 'text'])
        # First `window` sentences: greeting / introduction candidates
        manager_greet[dlg_id] = '. '.join(phrases[:window])
        # Last `window` sentences: goodbye candidates. The two windows may
        # overlap in short dialogs; the previous `elif` silently removed such
        # phrases from the goodbye window, so a short dialog could never
        # yield a goodbye.
        tail_start = max(len(phrases) - window, 0)
        manager_bye[dlg_id] = '. '.join(phrases[tail_start:])
    return manager_greet, manager_bye


In [1017]:
# Build the lookup dictionaries from the loaded frame:
#   dialog_dict_full      - dlg_id -> [(line index, text), ...]
#   dialog_dict_full_str  - dlg_id -> whole dialog joined with '. '
#   manager_greet / _bye  - dlg_id -> opening / closing manager phrases joined with '. '
dialog_dict_full = set_full_dialog(raw_input)
dialog_dict_full_str = set_full_dialog_str(raw_input)
manager_greet, manager_bye = set_manager_help(raw_input)

In [1018]:
# Rules for entity extraction from text (yargy grammar).
# NOTE(review): per the yargy docs, morph_pipeline matches the listed phrases in
# their inflected forms as well — confirm against the installed version.

# Greeting rule: phrases that count as a greeting
GREET = morph_pipeline([
    'Здравствуй',
    'Здравствуйте',
    'Доброго дня',
    'Добрый вечер',
    'Доброе утро',
    'Привет',
    'Приветствую',
    'Доброго времени суток'
])

# Introduction rule: phrases that may precede the manager's name.
# NOTE(review): 'Меня', 'Это' and 'Я' are very generic; they only count as an
# introduction when directly followed by a name token (see INTRO below).
INT_PHR = morph_pipeline([
    'Меня зовут',
    'Моё имя',
    'Меня',
    'Это',
    'Вас беспокоит',
    'Я'
])
# Goodbye rule: phrases that count as a farewell
BYE = morph_pipeline([
    'До свидания',
    'Пока',
    'До скорого',
    'Всего доброго',
    'Всего хорошего',
    'Хорошего дня',
    'Доброй ночи',
    'До встречи',
    'Удачи',
])
# Part-of-speech predicates (grammeme tags passed to yargy's gram)
CONJ = gram('CONJ')
PRCL = gram('PRCL')
INTJ = gram('INTJ')
ADJ =  gram('ADJF')
NOUN = gram('NOUN')
VERB = gram('VERB')
# NOTE(review): ADVB is not used by any rule defined in this cell
ADVB = gram('ADVB')
# Predicate for tokens tagged as a first name ('Name' grammeme)
NAME = gram('Name')
# Words that introduce a company name
COMP = morph_pipeline([
    'Компания',
    'Кампания', # Common misspelling of 'Компания'
    'Организация',
    'Фирма',
    'Из',
    'Предприятие'
    ])

# Introduction phrase immediately followed by a name
INTRO = rule(INT_PHR, NAME)
# Goodbye phrase on its own
BYEPARSE = rule(BYE)
# A bare name; applied later to the INTRO matches to pull out just the name
NAMEPARSE = rule(NAME)
# Company marker followed by optional modifiers; the company name must end with a NOUN
COMPANY = rule(COMP, VERB.optional(),ADJ.repeatable().optional(), \
               CONJ.optional(), PRCL.optional(), INTJ.optional(), NOUN)
# Same pattern without the leading company marker; applied later to the
# COMPANY matches to pull out just the company name
COMPANYPARSE = rule(VERB.optional(),ADJ.repeatable().optional(), \
               CONJ.optional(), PRCL.optional(), INTJ.optional(), NOUN)


In [1019]:
span_greet, span_intro, span_company, span_bye = {}, {}, {}, {}
res_greet, res_intro, res_name, res_company, res_bye = {}, {}, {}, {}, {}
# Entity parsing
def parse(cond, manager_list):
    """Find every match of a yargy rule in the manager text of each dialog.

    Parameters
    ----------
    cond
        yargy rule handed to ``Parser``.
    manager_list : dict
        dlg_id -> text to search.

    Returns
    -------
    dict
        dlg_id -> list of match spans (character offsets into
        ``manager_list[dlg_id]``); an empty list when nothing matched.

    Prints a notice for every dialog without a match.
    """
    parser = Parser(cond)
    span_list = {}
    # Iterate over the dialogs that were actually passed in instead of the
    # global num_dialog, so the function cannot get out of sync with its input.
    for dlg_id, text in manager_list.items():
        spans = [match.span for match in parser.findall(text)]
        if not spans:
            print(f"No entity is found in {dlg_id} dialog")
        span_list[dlg_id] = spans
        # To inspect the matches visually: show_line_markup(text, spans)
    return span_list
    

In [1020]:
# Run every rule over the relevant manager window; parse() prints a notice for
# each dialog without a match, and the dashed lines separate the four runs.
span_greet = parse(GREET, manager_greet)
print('-------------------------------------')
span_intro = parse(INTRO, manager_greet)
print('-------------------------------------')
span_bye = parse(BYEPARSE, manager_bye)
print('-------------------------------------')
span_company = parse(COMPANY, manager_greet)

No entity is found in 4 dialog
No entity is found in 5 dialog
-------------------------------------
No entity is found in 4 dialog
-------------------------------------
-------------------------------------
No entity is found in 4 dialog
No entity is found in 5 dialog


In [1021]:
def get_final_dict(span_list, manager_list, full_dialog=None):
    """Translate matched spans into line indexes of the full dialog.

    Parameters
    ----------
    span_list : dict
        dlg_id -> list of ``(start, stop)`` character offsets into
        ``manager_list[dlg_id]``.
    manager_list : dict
        dlg_id -> manager phrases joined with '. ' (the text the spans were
        found in).
    full_dialog : dict, optional
        dlg_id -> list of ``(line index, text)`` pairs. Defaults to the
        module-level ``dialog_dict_full``.

    Returns
    -------
    dict
        dlg_id -> ordered list of unique line indexes whose text equals a
        manager phrase containing at least one match. Every key of
        ``span_list`` is present; dialogs without matches map to ``[]``.
    """
    if full_dialog is None:
        full_dialog = dialog_dict_full
    final_dict = {}
    for key, spans in span_list.items():
        starts = [start_idx for start_idx, _ in spans]
        idxlist = []
        offset = 0
        # Walk the phrases while tracking their character offsets, so a match
        # is attributed to the phrase it was found in. The previous substring
        # test also flagged other phrases that merely contained the matched
        # text (e.g. a match on "пока" flagged a phrase with "показать").
        for sent in manager_list[key].split('. '):
            sent_end = offset + len(sent)
            if any(offset <= start_idx < sent_end for start_idx in starts):
                # Phrases are mapped back to dialog lines by exact text
                # equality; identical texts on several lines are all reported.
                for num, phrase in full_dialog[key]:
                    if phrase == sent and num not in idxlist:
                        idxlist.append(num)
            offset = sent_end + 2  # skip the '. ' separator
        final_dict[key] = idxlist
    # final_dict -- dict with dlg_id as key and list of indexes of entity as value
    return final_dict

In [1022]:
# Helper for single-valued entities (manager name, company name)
def get_single_info(span_list, manager_list):
    """Return one extracted string per dialog.

    ``manager_list[dlg_id]`` is a list of fragments; they are joined with
    '. ' and sliced with the spans of ``span_list[dlg_id]``. When a dialog
    has several spans the last one wins; a dialog without spans maps to
    'Unknown'.
    """
    extracted = {}
    for dlg_id, spans in span_list.items():
        if not spans:
            extracted[dlg_id] = 'Unknown'
            continue
        joined = '. '.join(manager_list[dlg_id])
        # Every span overwrites the previous one, so the last match is kept
        for begin, finish in spans:
            extracted[dlg_id] = joined[begin:finish]
    return extracted


In [1023]:
# Helper for multi-valued entities (greeting, introduction and goodbye phrases)
def get_mul_info(span_list, manager_list):
    """Return, per dialog, the list of substrings selected by its spans.

    Each span ``(start, stop)`` in ``span_list[dlg_id]`` slices the string
    ``manager_list[dlg_id]``; dialogs without spans map to an empty list.
    """
    return {
        dlg_id: [manager_list[dlg_id][begin:finish] for begin, finish in spans]
        for dlg_id, spans in span_list.items()
    }


In [1024]:
# Line indexes (per dialog) of the phrases containing a greeting / introduction / goodbye
res_greet = get_final_dict(span_greet, manager_greet)
res_intro = get_final_dict(span_intro, manager_greet)
res_bye = get_final_dict(span_bye, manager_bye)
# Raw matched substrings, parsed again in later cells to isolate the manager and company names
res_name  = get_mul_info(span_intro, manager_greet)
res_company  = get_mul_info(span_company, manager_greet)


In [1025]:
# Helper for name and company extraction
def parse_name_company(parse, res_list):
    """Run a yargy rule over the already extracted fragments of each dialog.

    ``res_list`` maps dlg_id to a list of strings; the strings are joined
    with '. ' and searched with ``Parser(parse)``. Returns dlg_id -> list of
    match spans (offsets into the joined string).
    """
    extractor = Parser(parse)
    return {
        dlg_id: [match.span for match in extractor.findall('. '.join(fragments))]
        for dlg_id, fragments in res_list.items()
    }
    


In [1026]:
# Manager name per dialog: re-parse the introduction matches with the bare
# NAME rule; get_single_info yields 'Unknown' when no name was found.
idx_dict = parse_name_company(NAMEPARSE, res_name)   
manager_names = get_single_info(idx_dict, res_name)
manager_names


{0: 'ангелина',
 1: 'ангелина',
 2: 'ангелина',
 3: 'максим',
 4: 'Unknown',
 5: 'анастасия'}

In [1027]:
# Company name per dialog: re-parse the company matches without the leading
# marker word; get_single_info yields 'Unknown' when nothing was found.
# Note: idx_dict is reused here and overwrites the value from the names cell.
idx_dict = parse_name_company(COMPANYPARSE, res_company)   
company_names = get_single_info(idx_dict, res_company)
company_names


{0: 'диджитал бизнес',
 1: 'диджитал бизнес',
 2: 'диджитал бизнес',
 3: 'китобизнес',
 4: 'Unknown',
 5: 'Unknown'}

In [1028]:
def set_result(df, column, res_dict):
    """Add a boolean flag column to ``df`` in place and return ``df``.

    ``column`` is (re)created as all False, then set to True on manager rows
    whose ``(dlg_id, line_n)`` appears in ``res_dict`` (dlg_id -> list of
    line numbers).
    """
    df[column] = False
    manager_rows = df['role'] == 'manager'
    for dlg_id, line_numbers in res_dict.items():
        # One mask per dialog: manager rows of this dialog on the listed lines
        hits = manager_rows & (df['dlg_id'] == dlg_id) & df['line_n'].isin(line_numbers)
        df.loc[hits, column] = True
    return df

In [1029]:
# set_result adds the column to the frame it receives and returns that same
# frame, so df is raw_input itself and ends up carrying all three flag columns.
df = set_result(raw_input, 'Greet', res_greet)
df = set_result(raw_input, 'Introduce', res_intro)
df = set_result(raw_input, 'Bye', res_bye)

In [1030]:
# Dialog-level flag: set on EVERY row of a dialog (client rows included) when
# the dialog has at least one 'Greet' row and at least one 'Bye' row.
df['Greet and bye'] = False
for elem in num_dialog:
    # cond1 / cond2 are single booleans for the whole dialog
    cond1 = df[df['dlg_id'] == elem]['Greet'].any()
    cond2 = df[df['dlg_id'] == elem]['Bye'].any()
    # cond3 is the row mask of the current dialog
    cond3 = df['dlg_id'] == elem
    # Combining the two scalars with the mask keeps the mask only when both hold
    df.loc[cond1 & cond2 & cond3, ['Greet and bye']] = True

In [1031]:
# Show the annotated frame (original columns plus the four flag columns)
df

Unnamed: 0,dlg_id,line_n,role,text,Greet,Introduce,Bye,Greet and bye
0,0,0,client,Алло,False,False,False,True
1,0,1,manager,Алло здравствуйте,True,False,False,True
2,0,2,client,Добрый день,False,False,False,True
3,0,3,manager,Меня зовут ангелина компания диджитал бизнес з...,False,True,False,True
4,0,4,client,Ага,False,False,False,True
5,0,5,manager,Угу ну возможно вы рассмотрите и другие вариан...,False,False,False,True
6,0,6,client,Да мы работаем с компанией которая нам подлива...,False,False,False,True
7,0,7,client,Как как бы уже до этого момента работаем все у...,False,False,False,True
8,0,8,manager,Угу а на что вы обращаете внимание при выборе,False,False,False,True
9,0,9,client,Как бы нет,False,False,False,True
