In [166]:
import pandas as pd
from pandas import DataFrame as df
import xml.etree.ElementTree as ET
import numpy as np
import re


In [167]:
# Pattern for a date written as two digits, a dot, two digits, a dot and four
# digits (e.g. 21.06.2016); remove_explicit_from_sentence searches lemmas with it.
DATE_TEMPLATE = re.compile(r'\d{2}\.\d{2}\.\d{4}')

In [168]:
# Parse the XML file and keep its root element; `root` is the default argument
# of make_dict_of_leads and compare_facts_to_leads below.
tree = ET.parse('output.xml')
root = tree.getroot()
# "Lead" elements that are direct children of the root.
# NOTE(review): `r` is not referenced anywhere else in the visible code.
r = root.findall("Lead")

def make_dict_of_leads(root=root):
    """Map every lead id to the plain text of its cleaned sentence.

    For each ``Lead`` element found under ``root`` the ``text`` attribute is
    parsed as XML, the ``s`` element inside its ``b`` child is located, passed
    through ``remove_explicit_from_sentence`` and flattened to a string.

    Args:
        root: element searched for ``Lead`` nodes. Defaults to the
            module-level ``root`` as it was when this function was defined.

    Returns:
        dict: value of the lead's ``id`` attribute -> concatenated text of
        the cleaned sentence.
    """
    leads_by_id = {}
    for lead_node in root.iter('Lead'):
        lead_id = lead_node.attrib["id"]

        # The "text" attribute holds an XML fragment of its own.
        markup = ET.fromstring(lead_node.attrib["text"])
        sentence = markup.find("b").find("s")

        # Mutates `sentence` in place before the text is collected.
        remove_explicit_from_sentence(sentence)

        leads_by_id[lead_id] = ''.join(sentence.itertext())
    return leads_by_id

def remove_explicit_from_sentence(xml_sentence):
    """Strip phone-number and date tokens from a sentence element, in place.

    Every direct child of ``xml_sentence`` whose ``lemma`` attribute contains
    a run of 11 digits, or a match of ``DATE_TEMPLATE``, is removed from the
    element. Children without a ``lemma`` attribute are left untouched.

    NOTE(review): ElementTree keeps the text that follows an element in that
    element's ``tail``, so removing a child also drops the text right after
    it. This is unchanged from the previous behavior - confirm it is wanted.

    Args:
        xml_sentence: ``xml.etree.ElementTree.Element`` to clean; mutated.

    Returns:
        None.
    """
    def remove_target_node(xml_sentence, target_template):
        # Iterate over a snapshot of the children: removing from the live
        # element while iterating it skips the sibling that follows each
        # removed node, so adjacent matches were previously left behind.
        for node in list(xml_sentence):
            # A missing "lemma" means "nothing to match", not an error.
            if re.search(target_template, node.attrib.get("lemma", "")):
                xml_sentence.remove(node)

    # to remove
    phoneno = re.compile(r'\d{11}')
    date = DATE_TEMPLATE

    remove_target_node(xml_sentence, phoneno)
    remove_target_node(xml_sentence, date)
      
    

# Look up lead "46" in a freshly built id -> text mapping.
make_dict_of_leads()["46"]

# Build the mapping again and keep the cleaned text of lead "0".
one = make_dict_of_leads()['0']


# NOTE(review): the string below looks like a hand-written sample of the markup
# stored in a lead's "text" attribute - confirm against output.xml.
"""
   <b>
      <s>
         21.06.2016
         <P n0="" lemma="89140580517">89140580517</P>
         хочет
         <W n1="" lemma="buy">купить</W>
         <W n2="" lemma="ноутбук">ноутбук</W>
         за 17000 руб ездить
         <P n3="" lemma="Якутия">Якутия</P>
         .
      </s>
   </b>
"""


# Echo the cleaned text of lead "0".
one

'купить ноутбук за 17000 руб ездить Якутия.'

In [174]:
# TODO: keep the lead text in mind. Facts are highlighted directly in its markup, which is useful when showing the information in the web interface.

def compare_facts_to_leads(root=root):
    """Group fact elements by the lead they refer to.

    Walks the children of the ``facts`` element found under ``document`` in
    ``root`` and buckets each child under the value of its ``LeadID``
    attribute.

    Args:
        root: element containing ``document/facts``. Defaults to the
            module-level ``root`` as it was when this function was defined.

    Returns:
        dict: ``LeadID`` value -> list of fact elements, in document order.
    """
    grouped = dict()
    facts_node = root.find("document").find('facts')

    for fact in facts_node:
        # setdefault creates the bucket on first sight of a LeadID.
        grouped.setdefault(fact.attrib['LeadID'], []).append(fact)
    return grouped

def make_common_table():
    """Build one table with a row per lead that has extracted facts.

    Each row holds the cleaned lead sentence in ``conversation`` plus one
    column per fact field, named ``<fact tag>_<field tag>`` and filled with
    the field's ``val`` attribute. Columns that a lead does not have are left
    as missing values.

    A lead whose facts would produce the same column name twice cannot be
    represented as a single row: it is printed and skipped. The previous
    implementation got this effect only indirectly, by catching the
    ``ValueError`` that ``DataFrame.append`` raised for such rows.

    Returns:
        pandas.DataFrame: one row per kept lead, each with index label 0 (as
        produced by row-wise appending), columns sorted alphabetically when
        the rows do not all share the same columns. Empty if there are no
        facts.

    Raises:
        KeyError: if a fact refers to a lead id that has no text, or a fact
            field has no ``val`` attribute.
    """
    facts = compare_facts_to_leads()
    leads = make_dict_of_leads()

    rows = []
    for lead, elems in facts.items():
        one_sentence = leads[lead]
        cols = ["conversation"]
        values = [one_sentence]
        for fact_name in elems:
            for fact_field in fact_name:
                cols.append(fact_name.tag + "_" + fact_field.tag)
                values.append(fact_field.attrib["val"])

        if len(set(cols)) != len(cols):
            # Repeated column name (e.g. two facts of the same kind for one
            # lead): report the lead and leave it out of the table.
            print(lead, one_sentence)
            continue

        rows.append(pd.DataFrame([values], columns=cols))

    if not rows:
        # pd.concat raises on an empty list; an empty frame is what the
        # row-by-row version returned in this case.
        return pd.DataFrame()

    # Concatenate once instead of calling DataFrame.append per row: append
    # was removed in pandas 2.0 and copied the whole table on every call.
    # sort=True (needs pandas >= 0.23) keeps the alphabetical column order
    # seen in the recorded output.
    return pd.concat(rows, sort=True)


In [175]:
calls = make_common_table()


def to_str(x):
    """Return ``x`` prefixed with a space, or ``""`` when ``x`` is missing.

    Missing values are detected with ``pd.isna`` rather than ``is np.nan``:
    the identity test only recognises the ``np.nan`` singleton and would try
    to concatenate any other NaN object, or ``None``, with a string.
    """
    return "" if pd.isna(x) else " " + x


# Collapse the notebook fact fields into one text column.
calls = calls.assign(notebooks = lambda x: x.Notebook_Word.map(to_str) + " " + x.Notebook_Vendor.map(to_str))

# Collapse the phone fact fields into one text column.
calls = calls.assign(phones = lambda x: 
                     x.Phone_Word.map(to_str) + " " + 
                     x.Phone_Vendor.map(to_str) + " " + 
                     x.Phone_Model.map(to_str) + " " + 
                     x.Phone_UserDefinedName.map(to_str))


# The source columns are now redundant. Rebind instead of dropping in place
# so that re-running the cell never mutates a frame another name refers to.
calls = calls.drop(['Notebook_Word', 'Notebook_Vendor'], axis=1)
calls = calls.drop(['Phone_Word', 'Phone_Vendor', 'Phone_Model', 'Phone_UserDefinedName'], axis=1)

calls.to_excel("whole_table.xlsx")
calls

Unnamed: 0,CallDate_Date,CustomerBuys_NumberInPrice,CustomerBuys_Word,CustomerPhone_Phone,CustomerPlace_Place,CustomerSells_Word,Pawn_Word,Repare_Word,conversation,notebooks,phones
0,08.06.2016,,,89140580517,,,,,,,
0,04.04.2016,,BUY,89234212342,,,,,купить ноутбук.,НОУТБУК,
0,05.05.2016,,,89140580517,,,PAWN,,ломб сони хперия хперия з3.,,СОНИ ХПЕРИЯ ХПЕРИЯ З3
0,11.06.2016,,BUY,89140580517,,,,,купить.,,
0,04.04.2016,17,BUY,89140580517,,,,,b 17 зац.,,
0,04.04.2016,17,BUY,89140580517,,,,,покупка 17 зац.,,
0,04.04.2016,,BUY,89140580517,,,,,куп телефон 27.,,ТЕЛЕФОН
0,04.04.2016,17,BUY,89140580517,,,,,buy 17 зац.,,
0,10.06.2016,,,,,,,,,,
0,11.06.2016,,,89140580517,,,,,,,


In [173]:
def _rows_with(column):
    """Return the rows of ``calls`` that have a value in ``column``."""
    return calls[calls[column].notnull()]


# One Excel file per kind of fact; the names and the write order are unchanged.
pawn = _rows_with("Pawn_Word")
buy_out = _rows_with("CustomerBuys_Word")
buy_out.to_excel("buy_out.xlsx")
pawn.to_excel("pawn.xlsx")

_rows_with("CustomerSells_Word").to_excel("sell.xlsx")

repare = _rows_with("Repare_Word")
repare.to_excel("repare.xlsx")