In [6]:
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from bs4 import *

In [7]:
# Path of the raw export to parse (relative to the notebook's working directory).
filename = 'sample_data.txt'
# Read the whole file inside a context manager so the handle is closed as soon
# as the text is in memory; `data` holds the markup text.
with open(filename, 'r') as source_file:
    data = source_file.read()
# Parse the markup with the lxml parser.
soup = BeautifulSoup(data, 'lxml')

In [55]:
# Show the <barcode> element of the first <objectdata> record. The lookup starts
# from `soup` so this cell does not rely on `obj`, which is only assigned further
# down and is therefore undefined when the notebook is run top to bottom.
soup.objectdata.find('barcode')

<barcode cc="1" vcc="1"><codevalid id="r"><st>0</st><wud>0</wud><cs>0</cs><bdn>2</bdn><cl>229</cl><bc></bc><condition>Ref,Camera,FXG2DBarcode,PDF</condition><position unit="mm" x="703" xmax="752" xmin="703" y="1063" z="103"></position><readlist>48000000</readlist><norca fee="1" fv="0" rq="97" vee="0"></norca><devices><device dn="5"><due>0</due><position unit="mm" x="703" y="1063" z="103"></position><norca fee="1" fv="0" rq="97" vee="0"></norca></device><device dn="2"><due>0</due><position unit="mm" x="752" y="1057" z="77"></position><norca fee="1" fv="0" rq="65" vee="0"></norca></device></devices></codevalid></barcode>

In [59]:
#Capture all field names from the children of the first <objectdata> record
obj = soup.objectdata
#Skip children that carry no tag name (e.g. text between tags) so that only
#real fields become columns
features = {child.name: [] for child in obj if child.name is not None}
#Grab the first matching tag of each field for every record (None when absent).
#A separate loop variable is used so `obj` keeps pointing at the first record.
for record in soup.find_all('objectdata'):
    for feature, tags in features.items():
        tags.append(record.find(feature))
#One row per record, one column per field; cells hold the raw tags
df = pd.DataFrame(features)
df.head(2)

Unnamed: 0,barcode,condition,deviceid,devicename,general,hostmessage,incr,ocsdata,scaledata,seqnb,sorterstate,sortstate,timestamp,tokenid,volumetric
0,"<barcode cc=""1"" vcc=""1""><codevalid id=""r""><st>...","<condition>NoRead,ValidDim,NotLFT</condition>",<deviceid>32</deviceid>,<devicename>0089PS06CT1</devicename>,"<general errornb=""0"" ie=""15589"" iostate=""8800""...",<hostmessage>PDF</hostmessage>,<incr>21126</incr>,<ocsdata><rxstring>07/07/2017;23:56:44;0111.6;...,"<scaledata ows=""19""><owe unit=""LB""><value>111....",<seqnb>5864</seqnb>,"<sorterstate state=""started""><speed unit=""ft/m...","<sortstate session=""SOS"" sortname=""2017-07-07 ...",<timestamp>2017-07-07T23:00:05.633</timestamp>,<tokenid>0089PS06CT12017-07-07T23:00:03417</to...,"<volumetric oms1=""0000"" oms2=""00000000"" oms3=""..."
1,"<barcode cc=""1"" vcc=""1""><codevalid id=""j""><st>...","<condition>ValidDim,ValidWeight,ValidRead,PDFN...",<deviceid>32</deviceid>,<devicename>0089PS06CT1</devicename>,"<general errornb=""0"" ie=""16743"" iostate=""8800""...",<hostmessage>]C0037962200190000357853670078708...,<incr>22266</incr>,<ocsdata><rxstring>07/07/2017;23:56:45;0007.2;...,"<scaledata ows=""0""><owe unit=""LB""><value>7.20<...",<seqnb>5865</seqnb>,"<sorterstate state=""started""><speed unit=""ft/m...","<sortstate session=""SOS"" sortname=""2017-07-07 ...",<timestamp>2017-07-07T23:00:06.061</timestamp>,<tokenid>0089PS06CT12017-07-07T23:00:04418</to...,"<volumetric oms1=""0000"" oms2=""00000000"" oms3=""..."


In [65]:
"""Pre-process label for each example"""
#Create dictionary to assign encoded integer to each class.
classes = ['LFT','TooBig','NoRead','ValidDim','MultiRead','Irreg','TooSmall','Gap']
class_dict = {name: index for index, name in enumerate(classes)}

def _condition_codes(record):
    """Return the class indices listed in a record's own <condition> field.

    Only the direct-child <condition> is read: <barcode> nests <condition>
    tags of its own, and a recursive lookup would return whichever of them
    comes first in the document. Names that are not in `classes` are ignored.
    A missing or empty field yields an empty list instead of raising.
    """
    condition_tag = record.find('condition', recursive=False)
    if condition_tag is None:
        return []
    tokens = (token.strip() for token in condition_tag.get_text().split(','))
    return [class_dict[token] for token in tokens if token in class_dict]

#Extract and encode contents of condition field only for conditions listed in the `classes` list
conditions = [_condition_codes(record) for record in soup.find_all('objectdata')]
##Perform one-hot-encoding
#Start from an all-zero matrix: one row per record, one column per class
conditions_encoded = np.zeros([len(conditions),len(classes)])
#Set the column of every class present in the record (no-op for an empty list)
for row, codes in enumerate(conditions):
    conditions_encoded[row, codes] = 1
#Store in dataframe
label_df = pd.DataFrame(conditions_encoded,columns=classes).astype(int)

In [86]:
"""Extract relevant information from each field for all objects"""
cols = ['date','time','height','width','length','volume','weight','angle','velocity','velocity_units','belt_velocity',\
        'belt_velocity_units']
#One list per output column, created up front so each record is a cheap append
data = {col: [] for col in cols}
for obj in soup.find_all('objectdata'):
    #Time stamp: the text is split on 'T' into a date part and a time part
    date, time_of_day = obj.timestamp.contents[0].split('T')
    #Item dimensions are attributes of <volumetric><size>; volume is their product
    volumetric = obj.volumetric
    size_tag = volumetric.size
    height, width, length = (float(size_tag.attrs[key]) for key in ('ohe','owi','ole'))
    vol = height*width*length
    #Item angle
    angle = int(volumetric.oa.value.contents[0])
    #Item Velocity
    velocity_tag = volumetric.otve
    vel = int(velocity_tag.value.contents[0])
    vel_units = velocity_tag.attrs['unit']
    #Item weight: first <value> under <scaledata> (unit "LB" in the sample rows)
    weight = float(obj.scaledata.value.contents[0])
    #Conveyor belt velocity
    speed_tag = obj.sorterstate.speed
    belt_vel = float(speed_tag.value.contents[0])
    belt_vel_units = speed_tag.attrs['unit']

    #Append this record's values, in the same order as `cols`
    row = [date,time_of_day,height,width,length,vol,weight,angle,vel,vel_units,belt_vel,belt_vel_units]
    for col, value in zip(cols, row):
        data[col].append(value)
#Construct dataframe with the columns in the order given by `cols`
df = pd.DataFrame(data, columns=cols)
#Labels are matched to rows by position, so both frames must cover the same records
assert len(df) == len(label_df), 'label_df does not match the parsed records; re-run the label cell'
#Add labels
df = df.join(label_df)
#Save as csv
df.to_csv('sampledata.csv',index=False)