In [1]:
import pandas as pd
import numpy as np

import sys
import os
from datetime import datetime

from rdkit import Chem


In [2]:
TOP = os.getcwd().replace('notebooks', '')
raw_dir = TOP + 'data/raw/'
interim_dir = TOP + 'data/interim/'
external_dir = TOP + 'data/external/'
figures_dir = TOP + 'reports/figures/'

In [3]:
with open(raw_dir + 'Copy of Oncologic Primary Classification.plugin.config','r') as f:
    xml=f.read()


In [4]:
xml=xml.replace('\n','')

In [6]:
#xml

In [7]:
import xml.etree.ElementTree as ET
e=ET.parse(raw_dir+'Copy of Oncologic Primary Classification.plugin.config').getroot()


In [11]:
import re
import ast

In [70]:
class Query:
    
    def __init__(self,xml,qid=None):
        self.xml=xml
        self.id=qid
        self.logic=None
        self.subqueries=[]
        self.category=None
        
    def write_query(self,qtype,tree):
        self.type=qtype
        if qtype=='b:StructureQuery':
            qstring=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch').text
            qstring=re.sub('false','False',qstring)
            qstring=re.sub('true','True',qstring)
            qdict=ast.literal_eval(qstring)
            smart=qdict['queries'][0]['smart']
            self.smart=smart
            print(self.smart)
            if '[Ch3,#1]' in self.smart:
                split=re.search(r'(.*)\[([^\(\)]*),([^\(\)].*)\]$',self.smart)
                split1=split.group(1)
                split2=split.group(1)+'['+split.group(2)+']'
                smart_match1=define_smart_match(split1)
                smart_match2=define_smart_match(split2)
                def smart_match(x):
                    return any([smart_match1(x),smart_match2(x)])
            else:
                smart_match=define_smart_match(smart)
            self.query=smart_match
        elif qtype=='b:ParameterQuery':
            self.operand=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Operand').text
            self.prop=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ParameterName').text
            self.value=float(self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}Value').text)
            compare=define_compare(self.prop,self.operand,self.value)
            self.query=compare
        elif qtype=='LogicalQuery':
            self.logic=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Logic').text
            elements=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Elements')
            node_ids=[elem.attrib['{http://schemas.microsoft.com/2003/10/Serialization/}Ref']\
                      for elem in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')\
                      if '{http://schemas.microsoft.com/2003/10/Serialization/}Ref' in elem.attrib]
            if self.logic=='Not':
                node_id=node_ids[0] #Should only be one
                sq=tree[node_id]
                self.subqueries=[sq]
                def func(x):
                    return not(sq.query(x))
                self.query=func
            elif self.logic=='And':
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                def func(x):
                    return all([sq.query(x) for sq in self.subqueries])
                self.query=func
            else:
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                for orquery in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query'):
                    if '{http://www.w3.org/2001/XMLSchema-instance}type' in orquery.attrib:
                        extra_sq=Query(orquery)
                        extra_sq.write_query('b:StructureQuery',tree)
                        sqs.append(extra_sq)      
                def func(x):
                    return any([sq.query(x) for sq in self.subqueries])
                self.query=func
    
    def print_tree(self,x,tabs=0):
        qinfo=(self.id,self.type)
        if self.type=='b:StructureQuery':
            qinfo=qinfo+(self.smart,)
        elif self.type=='b:ParameterQuery':
            qinfo=qinfo+(self.prop,self.value,self.operand)
        elif self.type=='LogicalQuery':
            qinfo=qinfo+(self.logic,)
        try:
            qinfo=qinfo+(self.query(x),)
        except:
            qinfo=qinfo+('does not process',)
        print('\t'*tabs+str(qinfo))
        for sq in self.subqueries:
            sq.print_tree(x,tabs+1)

In [71]:
def define_smart_match(smart):
    pattern=Chem.MolFromSmarts(smart)
    if not pattern:
        return None
    def smart_match(x):
        mol=x['mol']
        ret=True if mol.GetSubstructMatches(pattern) else False
        return ret
    return smart_match
def define_compare(prop,operand,value):
    def compare(x):
        ret = op_dict[operand](x[prop_dict[prop]],value)
        return ret
    return compare

In [72]:
import operator as op
op_dict={
    'GreaterThan': op.gt,
    'GreaterThanOrEqualTo': op.ge,
    'LessThan': op.lt,
    'LessThanOrEqualTo': op.le
}
prop_dict={
    'log Kow':'logp',
    'Molecular Weight':'mol_weight',
    'Molecular weight':'mol_weight',
    'Water Solubility': 'ws'
}


In [73]:
q = Query(xml)

In [74]:
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Caption').text
    queries=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Expression')\
        .find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Queries')\
        .findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')
    contents=[query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Content') for query in queries]
    query_tree={}
    for query in contents:
        attributes=query.attrib
        if '{http://schemas.microsoft.com/2003/10/Serialization/}Id' not in attributes:
            continue
        query_id=attributes['{http://schemas.microsoft.com/2003/10/Serialization/}Id']
        query_type=attributes['{http://www.w3.org/2001/XMLSchema-instance}type']
        q=Query(query,query_id)
        q.category = category
        q.write_query(query_type,query_tree)

[#8]([#6R0]=[#6R0])[#6](=[#8])[#6]=[#6]
[#8]([#6X4])[#6](=[#8])[#6]=[#6]
[#6](=[#8])([#6]=[#6])[O]c


KeyError: '23'

In [77]:
all_tests={}
print_tests = {}
bad_smarts=set()
bad_cats=set()
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Caption').text
    queries=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Expression')\
        .find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Queries')\
        .findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')
    contents=[query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Content') for query in queries]
    query_tree={}
    for query in contents:
        attributes=query.attrib
        if '{http://schemas.microsoft.com/2003/10/Serialization/}Id' not in attributes:
            continue
        query_id=attributes['{http://schemas.microsoft.com/2003/10/Serialization/}Id']
        query_type=attributes['{http://www.w3.org/2001/XMLSchema-instance}type']
        try:
            q=Query(query,query_id)
            q.category=category
            q.write_query(query_type,query_tree)
            q.print_tree(q.query)
            if not q.query or not all([sq.query for sq in q.subqueries]): #Smarts did not compile, sqs needed bc of hidden sqs in or queries
                bad_cats.add(category)
                if q.type=='b:StructureQuery':
                    bad_smarts.add(q.smart)
        except Exception as error:
            print(error)
        finally:
            query_tree[query_id]=q
        all_tests[category]=query_tree[query_id]
        print_tests[category] = [query_id, query_type]#

[#8]([#6R0]=[#6R0])[#6](=[#8])[#6]=[#6]
('23', 'b:StructureQuery', '[#8]([#6R0]=[#6R0])[#6](=[#8])[#6]=[#6]', 'does not process')
[#8]([#6X4])[#6](=[#8])[#6]=[#6]
('25', 'b:StructureQuery', '[#8]([#6X4])[#6](=[#8])[#6]=[#6]', 'does not process')
[#6](=[#8])([#6]=[#6])[O]c
('27', 'b:StructureQuery', '[#6](=[#8])([#6]=[#6])[O]c', 'does not process')
('29', 'LogicalQuery', 'Or', 'does not process')
	('23', 'b:StructureQuery', '[#8]([#6R0]=[#6R0])[#6](=[#8])[#6]=[#6]', 'does not process')
	('25', 'b:StructureQuery', '[#8]([#6X4])[#6](=[#8])[#6]=[#6]', 'does not process')
	('27', 'b:StructureQuery', '[#6](=[#8])([#6]=[#6])[O]c', 'does not process')
[#8]=[C]([#7v3]([[#1],c,CX4])[[#1],c,CX4])[C]=[C]
('39', 'b:StructureQuery', '[#8]=[C]([#7v3]([[#1],c,CX4])[[#1],c,CX4])[C]=[C]', 'does not process')
[#6](=[#6])[C]([Nv3]([CR0]=[CR0])[c,CX4,#1])=[O]
('41', 'b:StructureQuery', '[#6](=[#6])[C]([Nv3]([CR0]=[CR0])[c,CX4,#1])=[O]', 'does not process')
[#7v3]([#6](=[#8])[#6]=[#6])([#6R0]=[#6R0])[#6R0]=

[10:02:01] SMARTS Parse Error: syntax error while parsing: [#8]=[C]([#7v3]([[#1],c,CX4])[[#1],c,CX4])[C]=[C]
[10:02:01] SMARTS Parse Error: Failed parsing SMARTS '[#8]=[C]([#7v3]([[#1],c,CX4])[[#1],c,CX4])[C]=[C]' for input: '[#8]=[C]([#7v3]([[#1],c,CX4])[[#1],c,CX4])[C]=[C]'
[10:02:01] SMARTS Parse Error: syntax error while parsing: [#8]=[#6]{>-1}.$[F{<-1},Cl{<-1},Br{<-1},I{<-1}]{1}
[10:02:01] SMARTS Parse Error: Failed parsing SMARTS '[#8]=[#6]{>-1}.$[F{<-1},Cl{<-1},Br{<-1},I{<-1}]{1}' for input: '[#8]=[#6]{>-1}.$[F{<-1},Cl{<-1},Br{<-1},I{<-1}]{1}'
[10:02:01] SMARTS Parse Error: syntax error while parsing: [#7v3]($[$[$[[#6]{>-1}=[#6]{>-2}{>-3}.$[[#1]{<-3},[#8h]{<-3},[#6]{<-3}([#8h])=[#8],[#16]{<-3}([#8h])(=[#8])=[#8],F{<-3},Cl{<-3},Br{<-3},I{<-3},[#6]{<-3}#[#7v3],[#8]{<-3}[#6X4],[#8]{<-3}[#6](=[#8])[#6X4]]{1}.$[[#1]{<-2},[#8h]{<-2},[#6]{<-2}([#8h])=[#8],[#16]{<-2}([#8h])(=[#8])=[#8],F{<-2},Cl{<-2},Br{<-2},I{<-2},[#6]{<-2}#[#7v3],[#8]{<-2}[#6X4],[#8]{<-2}[#6](=[#8])[#6X4]]{1}.$[[#1]{<

In [78]:
bad_cats

{'Acrylamide Reactive Functional Groups',
 'Acyl and Benzoyl Type Compounds',
 'Aliphatic Azo and Azoxy Type Compounds',
 'Alkanesulfonoxy Ester Type Compounds',
 'Alkyl Sulfate Type Compounds',
 'Alpha- and beta-Haloether Reactive Functional Groups',
 'Alpha-Haloalkylamine Reactive Functional Groups',
 'Alpha-Halothioether Reactive Functional Groups',
 'Anhydride Type Compounds',
 'Aromatic Amine Type Compounds',
 'Aryldiazonium Salts',
 'Carbamyl Halide Type Compounds',
 'Coumarine and Furocoumarin Type Compounds',
 'Dicarbonyl Type Compounds',
 'Epoxide Reactive Functional Groups',
 'Ethyleneimine (Aziridine) Reactive Functional Groups',
 'Halogenated Aromatic Hydrocarbon Type Compounds',
 'Halogenated Cycloalkane Type Compounds',
 'Halogenated Linear Aliphatic Hydrocarbone Type Compounds',
 'Halogenated Nitroaromatic Type Compounds',
 'Hydrazo Type Compounds',
 'Lactone Type Reactive Functional Groups',
 'Nitrogen Mustards Reactive Functional Groups',
 'Nitrosamide Type Compounds',

In [84]:
print(all_tests['Acrylate Reactive Functional Groups'])

<__main__.Query object at 0x7a4cf961c3a0>


In [18]:
import operator as op
op_dict={
    'GreaterThan': op.gt,
    'GreaterThanOrEqualTo': op.ge,
    'LessThan': op.lt,
    'LessThanOrEqualTo': op.le
}

In [19]:
def define_smart_match(smart):
    pattern=Chem.MolFromSmarts(smart)
    if not pattern:
        return None
    def smart_match(x):
        mol=x['mol']
        ret=True if mol.GetSubstructMatches(pattern) else False
        return ret
    return smart_match

In [22]:
#print(ET.tostring(e, encoding='utf8').decode('utf8'))

In [36]:
#[(elem.tag, elem.attrib) for elem in e.iter()]

In [None]:
"http://schemas.microsoft.com/2003/10/Serialization/Arrays"/><ExternalLabels

In [27]:
xml.find("http://schemas.microsoft.com/2003/10/Serialization/Arrays")

10138

In [71]:
alert_dict = {}
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}string'):
    id_value = elem.attrib['{http://schemas.microsoft.com/2003/10/Serialization/}Id']
    alert_dict[id_value] = elem.text

In [73]:
len(alert_dict)

728

In [93]:
import json

In [109]:
 smarts = []
for elem in e.iter('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch'):
        x = json.loads(elem.text)
        try:
            print(x['queries'][0]['predefined'])
        except:
            smarts.append(x['queries'][0]['smart'])    


Organic
Inorganic


In [112]:
for k, v in alert_dict.items():
    if 'Phenyl' in v:
        print(k)

98
99
173
174


In [124]:
len(alert_dict)

728

In [121]:
def remove_duplicates(input_dict):
    unique_dict = {}
    seen_values = set()

    for key, value in input_dict.items():
        if value not in seen_values:
            unique_dict[key] = value
            seen_values.add(value)

    return unique_dict

In [125]:
alert_dict = remove_duplicates(alert_dict)

In [110]:
smarts

['[As,Cd,Cr,Hg,V,Al,Co,Ga,Li,Mg,Ni,Sn,Pb]',
 '$[[#1],[#6X4]]{..;x}[#8][#6]([#6])=[#8]',
 '$[[#6]{>-2},[#1]{>-2},[#7]{>-2}]{1}.[#6]{>-1}{<-2}=[#6h2].$[[#6]{<-1},[#1]{<-1},[#7]{<-1}]{1}',
 '$[[#1]{>-1},[#6X4]{>-1},[#7R0v3]{>-1}]{1}.[#6h]{<-1}=[#8]',
 '[#7][#6]([#7])=[#8]',
 '$[[#8]{>=1},[#16v2]{>=1}]{1}.[#6]{<=1}([#7])[#7]',
 '$[[#8]{>-1},[#7]{>-1}]{1}.[#6]{<-1}([#8])=[#8]',
 '$[[#8]{>=2},[#7]{>=2}]{1}.[#6]{<=2}([#7])[#7,#8][#7]{>=1}.$[[#8]{<=1},[#6]{<=1}]{1}',
 '[#6][#6]([#7v3])=[#8]',
 '$[[#8h]{>-1},[#6h3][#8]{>-1}]{1}.c{<-1}1ccc2[#6]3[#6][#6][#6]4[#6][#6][#6][#6]4[#6]3[#6][#6]c2c1',
 '[#6h3][#6]12[#6][#6][#6](=[#8])[#6]=[#6]1[#6][#6][#6]1[#6]2[#6][#6][#6]2[#6][#6][#6][#6]12',
 '[#6h3][#6]12[#6]=[#6][#6](=[#8])[#6]=[#6]1[#6][#6][#6]1[#6]2[#6][#6][#6]2[#6][#6][#6][#6]12',
 '[#6h3][#6]12[#6]=[#6][#6](=[#8])[#6]=[#6]1[#6]=[#6][#6]1[#6]2[#6][#6][#6]2[#6][#6][#6][#6]12',
 '[#6h3][#6]12[#6][#6][#6](=[#8])[#6]=[#6]1[#6]=[#6][#6]1[#6]2[#6][#6][#6]2[#6][#6][#6][#6]12',
 '[#8]=[#6]1[#6][#6][#6]2

In [162]:
#xml2=ET.parse(raw_dir+'Category 1_New_Inorganics and derivatives_organophosphorus,organosiloxane compounds_2.xml').getroot()

xml_tree = ET.parse(raw_dir+'Category 1_New_Inorganics and derivatives_organophosphorus,organosiloxane compounds_2.xml')
xml_root = xml_tree.getroot()

In [169]:
for x in xml_root.finda('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch', "b:StructureQuery"):
    print(x)
    

AttributeError: 'str' object has no attribute 'items'

In [181]:
category_1 = []
for x in xml_root.iter('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch'):
    a = json.loads(x.text)['queries'][0]['smart']
    category_1.append(a)
    

In [202]:
category_1[13]

'[Si]{<-1}1([Ch3])[#8]$[$[O[Siv4]([Ch3])[Ch3]]{..1}]{..;x}[Siv4]([#8]$[$[[Siv4]([Ch3])([Ch3])[#8]]{1..2}]{..;x}1)([Ch3])c1[c]{<-2}[c]{<-2}[c]{<-2}[c]{<-2}[c]{<-2}1.$[[ch]1c{>-1}[ch][ch][ch][ch]1,[Ch3]{>-1}]{1}.$[[Ch3]{>-2},[Ch2]{>-2}[Ch3],[Ch2]{>-2}[Ch2][Ch3],[Ch3][Ch]{>-2}[Ch3]]{1}'

In [240]:
for x in xml_root.iter('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch'):
    a = x.attrib
    print(a['{http://schemas.microsoft.com/2003/10/Serialization/}Id'])

17
19
21
23
25
29
192
194
207
209
254
256
309
311
313
315
317
319


In [246]:
alert_dicts = {}
for elem in xml_root.iter('http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries'):
    print(elem.tag)

In [211]:
alert_dicts

{'37': 'Male reproductive toxicity',
 '38': 'Spectrum of developmental toxicity endpoints observed include structural malformations',
 '39': 'n/a',
 '40': 'False',
 '45': 'No relevant studies identified',
 '46': 'For adverse effects on the fetus during pregnancy or those occurring during the perinatal period',
 '47': 'n/a',
 '48': 'False',
 '53': 'No relevant studies identified',
 '54': 'For adverse effects on the fetus during pregnancy or those occurring during the perinatal period',
 '55': 'n/a',
 '56': 'False',
 '61': 'No relevant studies identified',
 '62': 'For adverse effects on the fetus during pregnancy or those occurring during the perinatal period',
 '63': 'n/a',
 '64': 'False',
 '69': 'No relevant studies identified',
 '70': 'For adverse effects on the fetus during pregnancy or those occurring during the perinatal period',
 '71': 'n/a',
 '72': 'False',
 '77': 'No relevant studies identified',
 '78': 'Spectrum of developmental toxicity endpoints observed include structural ma

In [149]:
class Query:
    
    def __init__(self,xml,qid=None):
        self.xml=xml
        self.id=qid
        self.logic=None
        self.subqueries=[]
        self.category=None
        
    def write_query(self,qtype,tree):
        self.type=qtype
        if qtype=='b:StructureQuery':
            qstring=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch').text
            qstring=re.sub('false','False',qstring)
            qstring=re.sub('true','True',qstring)
            qdict=json.loadsl(qstring)
            smart=qdict['queries'][0]['smart']
            self.smart=smart
            if '[Ch3,#1]' in self.smart:
                split=re.search(r'(.*)\[([^\(\)]*),([^\(\)].*)\]$',self.smart)
                split1=split.group(1)
                split2=split.group(1)+'['+split.group(2)+']'
                smart_match1=define_smart_match(split1)
                smart_match2=define_smart_match(split2)
                def smart_match(x):
                    return any([smart_match1(x),smart_match2(x)])
            else:
                smart_match=define_smart_match(smart)
            self.query=smart_match
            
    def print_tree(self,x,tabs=0):
        qinfo=(self.id,self.type)
        if self.type=='b:StructureQuery':
            qinfo=qinfo+(self.smart)
        try:
            qinfo=qinfo+(self.query(x),)
        except:
            qinfo=qinfo+('does not process',)
        print('\t'*tabs+str(qinfo))
        for sq in self.subqueries:
            sq.print_tree(x,tabs+1)    

In [150]:
q = Query(xml = xml2)

In [151]:
q.write_query('b:StructureQuery', xml2)


AttributeError: 'NoneType' object has no attribute 'text'

In [127]:
chemical_elements

<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Scheme' at 0x7f14e5aed0e0>

In [128]:
for el in chemical_elements.iter():
    print(el)

<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Scheme' at 0x7f14e5aed0e0>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}CounterProfile' at 0x7f14e5aed450>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Credits' at 0x7f14e5aed6d0>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Adopted' at 0x7f14e5aed5e0>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Author' at 0x7f14e5aed720>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Changelog' at 0x7f14e5aed770>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Description' at 0x7f14e5aed810>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Disclaimer' at 0x7f14e5aed860>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Donator' at 0x7f14e5aeda90>
<Element '{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}LastModifiedDa

In [80]:
class Query:
    
    def __init__(self,xml,qid=None):
        self.xml=xml
        self.id=qid
        self.logic=None
        self.subqueries=[]
        self.category=None
        
    def write_query(self,qtype,tree):
        self.type=qtype
        if qtype=='a:StructureQuery':
            qstring=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch').text
            qstring=re.sub('false','False',qstring)
            qstring=re.sub('true','True',qstring)
            qdict=ast.literal_eval(qstring)
            smart=qdict['queries'][0]['smart']
            self.smart=smart
            return smart

In [37]:
class Query:
    
    def __init__(self,xml,qid=None):
        self.xml=xml
        self.id=qid
        self.logic=None
        self.subqueries=[]
        self.category=None
        
    def write_query(self,qtype,tree):
        self.type=qtype
        if qtype=='a:StructureQuery':
            qstring=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Queries}ComplexSearch').text
            qstring=re.sub('false','False',qstring)
            qstring=re.sub('true','True',qstring)
            qdict=ast.literal_eval(qstring)
            smart=qdict['queries'][0]['smart']
            self.smart=smart
            if '[Ch3,#1]' in self.smart:
                split=re.search(r'(.*)\[([^\(\)]*),([^\(\)].*)\]$',self.smart)
                split1=split.group(1)
                split2=split.group(1)+'['+split.group(2)+']'
                smart_match1=define_smart_match(split1)
                smart_match2=define_smart_match(split2)
                def smart_match(x):
                    return any([smart_match1(x),smart_match2(x)])
            else:
                smart_match=define_smart_match(smart)
            self.query=smart_match
        elif qtype=='LogicalQuery':
            self.logic=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Logic').text
            elements=self.xml.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Elements')
            node_ids=[elem.attrib['{http://schemas.microsoft.com/2003/10/Serialization/}Ref']\
                      for elem in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')\
                      if '{http://schemas.microsoft.com/2003/10/Serialization/}Ref' in elem.attrib]
            if self.logic=='Not':
                node_id=node_ids[0] #Should only be one
                sq=tree[node_id]
                self.subqueries=[sq]
                def func(x):
                    return not(sq.query(x))
                self.query=func
            elif self.logic=='And':
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                def func(x):
                    return all([sq.query(x) for sq in self.subqueries])
                self.query=func
            else:
                sqs=[tree[node_id] for node_id in node_ids]
                self.subqueries=sqs
                for orquery in elements.findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query'):
                    if '{http://www.w3.org/2001/XMLSchema-instance}type' in orquery.attrib:
                        extra_sq=Query(orquery)
                        extra_sq.write_query('b:StructureQuery',tree)
                        sqs.append(extra_sq)      
                def func(x):
                    return any([sq.query(x) for sq in self.subqueries])
                self.query=func
    
    def print_tree(self,x,tabs=0):
        qinfo=(self.id,self.type)
        if self.type=='b:StructureQuery':
            qinfo=qinfo+(self.smart,)
        elif self.type=='LogicalQuery':
            qinfo=qinfo+(self.logic,)
        try:
            qinfo=qinfo+(self.query(x),)
        except:
            qinfo=qinfo+('does not process',)
        print('\t'*tabs+str(qinfo))
        for sq in self.subqueries:
            sq.print_tree(x,tabs+1)

In [38]:
q = Query(xml)

In [42]:
for elem in e.iter('http://schemas.microsoft.com/2003/10/Serialization/Arrays'):
    print(elem)

In [39]:
all_tests={}
print_tests = {}
bad_smarts=set()
bad_cats=set()
import re
import ast
for elem in e.iter('{http://schemas.microsoft.com/2003/10/Serialization/Arrays}anyType'):
    category=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Caption').text
    queries=elem.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Expression')\
        .find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Queries')\
        .findall('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Query')
    contents=[query.find('{http://schemas.datacontract.org/2004/07/LMC.Profiling.Engine}Content') for query in queries]
    #print(query)
    query_tree={}
    for query in contents:
        attributes=query.attrib
        if '{http://schemas.microsoft.com/2003/10/Serialization/}Id' not in attributes:
            continue
        query_id=attributes['{http://schemas.microsoft.com/2003/10/Serialization/}Id']
        query_type=attributes['{http://www.w3.org/2001/XMLSchema-instance}type']
        q=Query(query,query_id)
        q.category=category
        q.write_query(query_type,query_tree)
        q.print_tree(q.query)
        if not q.query or not all([sq.query for sq in q.subqueries]): #Smarts did not compile, sqs needed bc of hidden sqs in or queries
            bad_cats.add(category)
            if q.type=='b:StructureQuery':
                bad_smarts.add(q.smart)
        query_tree[query_id]=q
    all_tests[category]=query_tree[query_id]
    print_tests[category] = [query_id, query_type]#Final one should always be the top level query hopefully