In [280]:
import pandas as pd
from pandas import DataFrame as df
import xml.etree.ElementTree as ET
import numpy as np
import re


In [281]:
DATE_TEMPLATE = re.compile(r'\d{2}\.\d{2}\.\d{4}')

In [282]:
# Parse the export once. `root` is the default argument of
# make_dict_of_leads() and compare_facts_to_leads() defined below.
tree = ET.parse('output.xml')
root = tree.getroot()
# Lead elements that are direct children of the root; not referenced again
# in the visible cells.
r = root.findall("Lead")

def make_dict_of_leads(root=root):
    """Map every lead id to the cleaned-up text of its sentence.

    For each ``Lead`` element found anywhere under ``root`` the ``text``
    attribute is parsed as XML and its ``b/s`` element is taken as the
    sentence. Nodes selected by ``remove_explicit_from_sentence`` are blanked,
    the remaining text is concatenated and stripped, and a single trailing
    period is dropped.

    Returns:
        dict: ``id`` attribute of the lead -> cleaned sentence text.
    """
    def cleaned_text(lead):
        markup = ET.fromstring(lead.attrib["text"])
        sentence = markup.find("b").find("s")

        # Blanks matching nodes in place before the text is collected.
        remove_explicit_from_sentence(sentence)

        joined = ''.join(sentence.itertext())
        return re.sub(r'\.$', '', joined.strip())

    return {lead.attrib["id"]: cleaned_text(lead) for lead in root.iter('Lead')}

def remove_explicit_from_sentence(xml_sentence):
    """Blank child nodes whose ``lemma`` looks like a phone number or a date.

    Only the direct children of ``xml_sentence`` are inspected. A child's
    ``text`` is set to ``''`` when its ``lemma`` attribute contains 11
    consecutive digits or a match of ``DATE_TEMPLATE``. Children without a
    ``lemma`` attribute are left unchanged. The sentence's own text and the
    tails of its children are never touched.

    Args:
        xml_sentence: element whose children are checked; modified in place.

    Returns:
        None.
    """

    def remove_target_node(xml_sentence, target_template):
        for node in xml_sentence:
            # A node is not guaranteed to carry a lemma; skip it instead of
            # failing with KeyError and aborting the whole run.
            lemma = node.attrib.get("lemma")
            if lemma is not None and re.search(target_template, lemma):
                node.text = ''

    # Patterns of the values to remove.
    phoneno = re.compile(r'\d{11}')
    date = DATE_TEMPLATE

    for template in (phoneno, date):
        remove_target_node(xml_sentence, template)
   

# Cleaned-up text of the lead whose id is '0', kept for a quick visual check.
one = make_dict_of_leads()['0']


# Bare string literal used as a note: a sample of sentence markup.
# NOTE(review): presumably an excerpt of a Lead's "text" attribute - confirm.
"""
   <b>
      <s>
         21.06.2016
         <P n0="" lemma="89140580517">89140580517</P>
         хочет
         <W n1="" lemma="buy">купить</W>
         <W n2="" lemma="ноутбук">ноутбук</W>
         за 17000 руб ездить
         <P n3="" lemma="Якутия">Якутия</P>
         .
      </s>
   </b>
"""


# Show the cleaned-up text as the cell result.
one

'самсунг с4 купить'

In [283]:
# TODO: keep the lead text in mind. Facts are highlighted directly in its markup - useful when showing information in the web interface.

def compare_facts_to_leads(root=root):
    """Group the fact elements of the document by the lead they belong to.

    Iterates over the children of ``document/facts`` under ``root`` and
    collects them by their ``LeadID`` attribute, keeping document order
    inside every group.

    Returns:
        dict: ``LeadID`` value -> list of fact elements.
    """
    grouped = {}
    for fact in root.find("document").find('facts'):
        grouped.setdefault(fact.attrib['LeadID'], []).append(fact)
    return grouped

def make_common_table():
    """Build one table row per lead from its facts and its cleaned-up text.

    Every row holds ``lead_id``, ``conversation`` (the text returned by
    ``make_dict_of_leads`` for that lead) and one ``<fact tag>_<field tag>``
    column per fact field, filled with the field's ``val`` attribute. Columns
    a lead has no value for are left missing in its row.

    A lead whose facts yield the same column name more than once cannot be
    represented as a single row; it is reported on stdout and skipped.

    Returns:
        pandas.DataFrame: one row per kept lead (empty if there are no facts).

    Raises:
        KeyError: if a lead with facts has no text, or a fact field has no
            ``val`` attribute.
    """
    facts = compare_facts_to_leads()
    leads = make_dict_of_leads()

    # Rows are collected first and the frame is built once: DataFrame.append
    # was removed in pandas 2.0, and growing a frame row by row is quadratic.
    rows = []
    for lead, elems in facts.items():
        one_sentence = leads[lead]
        row = {"lead_id": lead, "conversation": one_sentence}
        repeated = []
        for fact in elems:
            for fact_field in fact:
                col = fact.tag + "_" + fact_field.tag
                if col in row:
                    repeated.append(col)
                row[col] = fact_field.attrib["val"]

        if repeated:
            # Best effort: report the ambiguous lead and continue with the rest.
            print("duplicate fact columns: " + ", ".join(repeated))
            print(lead, one_sentence)
            continue

        rows.append(row)

    return pd.DataFrame(rows)


In [284]:
# Per-lead table with integer lead ids, ordered by id and indexed by it.
calls = (
    make_common_table()
    .assign(lead_id=lambda table: table["lead_id"].map(int))
    .sort_values(by="lead_id")
    .set_index("lead_id")
)

# Calls from Buryatia
calls_from_buryatia = calls[calls.CustomerPlace_Buryatia.notnull()]

calls.head(3)

Unnamed: 0_level_0,CallDate_Date,CustomerBuys_Word,CustomerPhone_Phone,CustomerPlace_Buryatia,CustomerPlace_Yakutia,CustomerSells_Word,Matter_Notebook,Matter_Other,Matter_Phone,Matter_TV,Matter_Tablet,Pawn_Word,Repare_Word,conversation
lead_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,24.06.2016,КУПЛЯ,89627305831,,,,,,САМСУНГ С4,,,,,самсунг с4 купить
1,24.06.2016,КУПЛЯ,89243637772,,,,,,АЙФОН 5С,,,,,айфон 5с купить
2,24.06.2016,,89241742373,,,,,,АЙФОН 5С,,,,,айфон 5с


In [285]:

def prepare_for_excel_ykt(source_df, col_name, output_cols):
    """Select the rows for one report sheet and rename their columns.

    Keeps the rows of ``source_df`` where ``CustomerPlace_Buryatia`` is null
    and ``col_name`` is not null, then lays them out with the columns listed
    in ``output_cols[0]`` (columns missing from ``source_df`` come out empty)
    and renames those columns to ``output_cols[1]``.

    A fact column exists in ``source_df`` only when at least one lead produced
    that fact, so an absent ``CustomerPlace_Buryatia`` or ``col_name`` column
    is treated as null in every row instead of raising.

    Args:
        source_df: table with one row per lead.
        col_name: column that must be filled for a row to be kept.
        output_cols: pair ``[source column names, sheet column titles]`` of
            equal length.

    Returns:
        pandas.DataFrame: the selected rows with the sheet titles as columns.
    """
    source_names, sheet_titles = output_cols[0], output_cols[1]

    def is_null(name):
        if name in source_df.columns:
            return source_df[name].isnull()
        # Column never produced by any lead: null everywhere.
        return pd.Series(True, index=source_df.index, dtype=bool)

    keep = is_null("CustomerPlace_Buryatia") & ~is_null(col_name)
    sheet = source_df[keep].reindex(columns=source_names)
    sheet.columns = sheet_titles
    return sheet


# TODO: the default action is "buy". Collect such entries into a "Купить" ("Buy") table - the customer buys from us.
# TODO: repairs apply to phones and notebooks only.

# Each pair is (column of the calls table, title of that column in the report).
# Names that are not columns of the calls table give empty report columns.
_REPORT_COLUMN_PAIRS = [
    ("CallDate_Date", "Дата"),
    ("CustomerPhone_Phone", "Номер телефона"),
    ("Matter_Notebook", "Ноутбук/нетбук"),
    ("Matter_Tablet", "Планшет"),
    ("Matter_TV", "Телевизор"),
    ("Matter_Phone", "Телефон"),
    ("Matter_Other", "Другое"),
    ("conversation", "Разговор"),
    ("Комплектация", "Комплектация"),
    ("Цена клиента", "Цена клиента"),
    ("Наша цена", "Наша цена"),
    ("Утилизация", "Утилизация"),
    ("Решение клиента", "Решение клиента"),
]

# [source column names, report column titles], in matching order.
cols_for_buy_and_sell = [
    [source for source, _ in _REPORT_COLUMN_PAIRS],
    [title for _, title in _REPORT_COLUMN_PAIRS],
]


In [286]:

# One report sheet per customer action, each filtered by the column that
# marks the action.
vikup = prepare_for_excel_ykt(calls, "CustomerSells_Word", cols_for_buy_and_sell)
prodazha = prepare_for_excel_ykt(calls, "CustomerBuys_Word", cols_for_buy_and_sell)
remont = prepare_for_excel_ykt(calls, "Repare_Word", cols_for_buy_and_sell)
lombard = prepare_for_excel_ykt(calls, "Pawn_Word", cols_for_buy_and_sell)

report_sheets = [
    ("Выкуп", vikup),
    ("Продажа", prodazha),
    ("Ремонт", remont),
    ("Ломбард", lombard),
]

# The context manager writes and closes the workbook on exit, even if a sheet
# fails; ExcelWriter.save() no longer exists in pandas 2.0.
with pd.ExcelWriter('report.xlsx') as writer:
    for sheet_name, sheet in report_sheets:
        sheet.to_excel(writer, sheet_name=sheet_name, index=False)