In [168]:
import pandas as pd
from pandas import DataFrame as df
import xml.etree.ElementTree as ET
import numpy as np
import re


In [169]:
DATE_TEMPLATE = re.compile(r'\d{2}\.\d{2}\.\d{4}')

In [175]:
tree = ET.parse('output.xml')
root = tree.getroot()
r = root.findall("Lead")

def make_dict_of_leads(root=root):
    dict_of_leads = dict();
    for lead in root.iter('Lead'):
        id = lead.attrib["id"]

        pulpy = ET.fromstring(lead.attrib["text"])
        source_sentence = pulpy.find("b").find("s")

        remove_explicit_from_sentence(source_sentence)

        text = ''.join(source_sentence.itertext())
        dict_of_leads[id] = re.sub(r'\.$', '', text.strip());
    return dict_of_leads

def remove_explicit_from_sentence(xml_sentence):
    
    def remove_target_node(xml_sentence, target_template):
        for node in xml_sentence:
            if re.search(target_template, node.attrib["lemma"]):
                node.text = ''
                
    # to remove
    phoneno = re.compile(r'\d{11}')
    date = DATE_TEMPLATE

    remove_target_node(xml_sentence, phoneno)
    remove_target_node(xml_sentence, date)
   

one = make_dict_of_leads()['0']


"""
   <b>
      <s>
         21.06.2016
         <P n0="" lemma="89140580517">89140580517</P>
         хочет
         <W n1="" lemma="buy">купить</W>
         <W n2="" lemma="ноутбук">ноутбук</W>
         за 17000 руб ездить
         <P n3="" lemma="Якутия">Якутия</P>
         .
      </s>
   </b>
"""


one

'  самсунг с4 купить'

In [171]:
# todo: помнить о тексте лида. Там выделены факты прямо в разметке - полезно при выводе информации в веб-интерфейсе

def compare_facts_to_leads(root=root):
    facts_grouped_by_lead = dict()

    for i in root.find("document").find('facts'):
        lead_id = i.attrib['LeadID']
        if facts_grouped_by_lead.get(lead_id):
            facts_grouped_by_lead[lead_id].append(i)
        else:
            facts_grouped_by_lead[lead_id] = [i]
    return facts_grouped_by_lead

def make_common_table():
    calls = df()
    
    facts = compare_facts_to_leads()
    leads = make_dict_of_leads()

    for lead in facts:
        try:
            elems = facts[lead]
            one_sentence = leads[lead]
            cols = ["lead_id", "conversation"]
            values = [lead, one_sentence]
            for fact_name in elems:
                for fact_field in fact_name:
                    cols.append(fact_name.tag + "_" + fact_field.tag)
                    values.append(fact_field.attrib["val"])
            one_row = pd.DataFrame([values], columns=cols)
            calls = calls.append(one_row)

            values = []
            cols = []
        except ValueError as e:
            print(e)
            print(lead, leads[lead])
    return calls


In [172]:
calls = make_common_table()
calls["lead_id"] = calls["lead_id"].map(int)
calls = calls.sort_values(by="lead_id")
calls.to_excel("whole_table.xlsx")
calls

Unnamed: 0,CallDate_Date,CustomerBuys_Word,CustomerPhone_Phone,CustomerPlace_Place,CustomerSells_Word,Matter_Notebook,Matter_Other,Matter_Phone,Matter_TV,Matter_Tablet,Pawn_Word,Repare_Word,conversation,lead_id
0,24.06.2016,КУПЛЯ,89627305831,,,,,САМСУНГ С4,,,,,самсунг с4 купить.,0
0,24.06.2016,КУПЛЯ,89243637772,,,,,АЙФОН 5С,,,,,айфон 5с купить.,1
0,24.06.2016,,89241742373,,,,,АЙФОН 5С,,,,,айфон 5с.,2
0,24.06.2016,,89640762630,,,,,АЙФОН 4,,,,,айфон 4.,3
0,24.06.2016,,89248742895,,,,,АЙФОН,,,,,айфон.,4
0,24.06.2016,,89659963995,,,,,СОТОВЫЙ,,,,,сотовый до 4000руб..,5
0,24.06.2016,КУПЛЯ,89659963995,,,,,НОТ4,,,,,нот4 купить.,6
0,24.06.2016,КУПЛЯ,89644232158,,,,,,,НОТ САМСУНГ,,,нот самсунг купить.,7
0,24.06.2016,,89248719793,,ВЫКУП,,,САМСУНГ С5МИНИ,,,,,самсунг с5мини продать.,8
0,25.06.2016,КУПЛЯ,89141039012,,,,,ЛЕНОВО 2010 ТЕЛЕФОН,,,,,леново 2010 сотовый купить.,9


In [173]:
#pawn = calls[calls.Pawn_Word.notnull()].to_excel("pawn.xlsx")
buy_out = calls[calls.CustomerBuys_Word.notnull()].to_excel("buy_out.xlsx")
calls[calls.CustomerSells_Word.notnull()].to_excel("sell.xlsx")
repare = calls[calls.Repare_Word.notnull()].to_excel("repare.xlsx")
