Lectura de xml

In [35]:
import pandas as pd
import xml.etree.ElementTree as etree

tree = etree.parse("wbg-fal10.xml")
root = tree.getroot()

columns_problem = ["Name", "NrDays", "SlotsPerDay", "NrWeeks"]
columns_optimization = ["Time", "Room", "Distribution", "Student"]
columns_rooms = ["RoomId", "Capacity"]
columns_courses = ["CourseId", "ConfigId", "SubpartId", "ClassId", "ClassLimit", "ClassParent"]
columns_distributions = ["DistributionId", "DistributionType", "DistributionRequired", "DistributionPenalty", "ClassesList"]
columns_students = ["StudentId", "CoursesList"]
columns_relation_class_room = ["ClassId", "RoomId", "RoomPenalty"]
columns_relation_class_time = ["ClassId", "TimeDays", "TimeStart", "TimeLength", "TimeWeeks", "TimePenalty"]
columns_room_travel = ["RoomId", "PreviousRoomId", "TravelTimeValue"]
columns_room_unavailable = ["RoomId", "UnavailableDays", "UnavailableStart", "UnavailableLength", "UnavailableWeeks"]

df_problem = pd.DataFrame(columns = columns_problem)
df_problem = df_problem.append({"Name": root.attrib.get("name"), "NrDays": root.attrib.get("nrDays"), "SlotsPerDay": root.attrib.get("slotsPerDay"), "NrWeeks": root.attrib.get("nrWeeks")}, ignore_index = True)

df_optimization = pd.DataFrame(columns = columns_optimization)
df_rooms = pd.DataFrame(columns = columns_rooms)
df_courses = pd.DataFrame(columns = columns_courses)
df_distributions = pd.DataFrame(columns = columns_distributions)
df_students = pd.DataFrame(columns = columns_students)
df_relation_class_room = pd.DataFrame(columns = columns_relation_class_room)
df_relation_class_time = pd.DataFrame(columns = columns_relation_class_time)
df_room_travel = pd.DataFrame(columns = columns_room_travel)
df_room_unavailable = pd.DataFrame(columns = columns_room_unavailable)

for node in root:
    if node.tag == "optimization":
        df_optimization = df_optimization.append({"Time": node.attrib.get("time"), "Room": node.attrib.get("room"), "Distribution": node.attrib.get("distribution"), "Student": node.attrib.get("student")}, ignore_index = True)
    
    elif node.tag == "rooms":
        for room in node:
            roomid = room.attrib.get("id")
            cap = room.attrib.get("capacity")
            df_rooms = df_rooms.append(pd.Series([roomid, cap], index = columns_rooms), ignore_index = True)
            #print(len(list(gchild.tag for gchild in child.iter() if gchild is not child)))
            if len(list(child.tag for child in room.iter() if child is not room)) > 0:
                for child in room:
                    if child.tag == "travel":
                        prevroomid = child.attrib.get("room")
                        value = child.attrib.get("value")
                        df_room_travel = df_room_travel.append(pd.Series([roomid, prevroomid, value], index = columns_room_travel), ignore_index = True)
                    elif child.tag == "unavailable":
                        days = child.attrib.get("days")
                        start = child.attrib.get("start")
                        length = child.attrib.get("length")
                        weeks = child.attrib.get("weeks")
                        df_room_unavailable = df_room_unavailable.append(pd.Series([roomid, days, start, length, weeks], index = columns_room_unavailable), ignore_index = True)
            
    elif node.tag == "courses":
        for course in node:
            #print(course.tag)
            courseid = course.attrib.get("id")
            for config in course:
                #print(config.tag)
                configid = config.attrib.get("id")
                for subpart in config:
                    #print(subpart.tag)
                    subpartid = subpart.attrib.get("id")
                    for clase in subpart:
                        #print(clase.tag)
                        claseid = clase.attrib.get("id")
                        claselimit = clase.attrib.get("limit")
                        claseparent = clase.find("parent").text if clase.find("parent") is not None else None
                        df_courses = df_courses.append(pd.Series([courseid, configid, subpartid, claseid, claselimit, claseparent], index = columns_courses), ignore_index = True)
                        if len(list(child.tag for child in clase.iter() if child is not clase)) > 0:
                            for child in clase:
                                #print(child.tag)
                                if child.tag == "room":
                                    roomid = child.attrib.get("id")
                                    roompenalty = child.attrib.get("penalty")
                                    df_relation_class_room = df_relation_class_room.append(pd.Series([claseid, roomid, roompenalty], index = columns_relation_class_room), ignore_index = True)
                                elif child.tag == "time":
                                    timedays = child.attrib.get("days")
                                    timestart = child.attrib.get("start")
                                    timelength = child.attrib.get("length")
                                    timeweeks = child.attrib.get("weeks")
                                    timepenalty = child.attrib.get("penalty")
                                    df_relation_class_time = df_relation_class_time.append(pd.Series([claseid, timedays, timestart, timelength, timeweeks, timepenalty], index = columns_relation_class_time), ignore_index = True)
                                    
    elif node.tag == "distributions":
        contador = 1
        for distribution in node:
            #print(distribution.tag)
            distribid = contador
            distribtype = distribution.attrib.get("type")
            distribrequired = distribution.find("required").text if distribution.find("required") is not None else None
            distribpenalty = distribution.find("penalty").text if distribution.find("penalty") is not None else None
            listaclases = []
            for clase in distribution:
                #print(clase.tag)
                listaclases.append(clase.attrib.get("id"))
            df_distributions = df_distributions.append(pd.Series([distribid, distribtype, distribrequired, distribpenalty, listaclases], index = columns_distributions), ignore_index = True)
            contador += 1
             
    elif node.tag == "students":
        for student in node:
            studentid = student.attrib.get("id")
            listacourses = []
            for course in student:
                listacourses.append(course.attrib.get("id"))
            df_students = df_students.append(pd.Series([studentid, listacourses], index = columns_students), ignore_index = True)
            
            


print("################################################################################################################")
print("DataFrame Problem")
display(df_problem)
print("################################################################################################################")
print("DataFrame Optimization")
display(df_optimization)
print("################################################################################################################")
print("DataFrame Rooms")
display(df_rooms)
print("################################################################################################################")
print("DataFrame RoomTravel")
display(df_room_travel)
print("################################################################################################################")
print("DataFrame RoomUnavailable")
display(df_room_unavailable)
print("################################################################################################################")
print("DataFrame Courses")
display(df_courses)
print("################################################################################################################")
print("DataFrame Distributions")
display(df_distributions)
print("################################################################################################################")
print("DataFrame Students")
display(df_students)
print("################################################################################################################")
print("DataFrame RelationClassRoom")
display(df_relation_class_room)
print("################################################################################################################")
print("DataFrame RelationClassTime")
display(df_relation_class_time)
print("################################################################################################################")

################################################################################################################
DataFrame Problem


Unnamed: 0,Name,NrDays,SlotsPerDay,NrWeeks
0,wbg-fal10,7,288,16


################################################################################################################
DataFrame Optimization


Unnamed: 0,Time,Room,Distribution,Student
0,1,1,10,10


################################################################################################################
DataFrame Rooms


Unnamed: 0,RoomId,Capacity
0,1,1
1,2,1
2,3,2
3,4,1
4,5,1
5,6,4
6,7,2


################################################################################################################
DataFrame RoomTravel


Unnamed: 0,RoomId,PreviousRoomId,TravelTimeValue


################################################################################################################
DataFrame RoomUnavailable


Unnamed: 0,RoomId,UnavailableDays,UnavailableStart,UnavailableLength,UnavailableWeeks
0,3,1000000,240,24,1000000
1,3,10000,240,24,10000000000000
2,4,100000,240,24,1000000
3,6,100000,240,24,1000000
4,7,1000000,240,24,10000000000
5,7,1000000,240,24,1000000
6,7,1000000,240,24,10000
7,7,100000,222,12,10000000000
8,7,100000,222,12,1000000
9,7,10000,240,24,10000000000000


################################################################################################################
DataFrame Courses


Unnamed: 0,CourseId,ConfigId,SubpartId,ClassId,ClassLimit,ClassParent
0,1,1,1,1,2,
1,2,2,2,2,4,
2,3,3,3,3,4,
3,3,3,3,4,4,
4,3,3,3,5,4,
...,...,...,...,...,...,...
145,21,21,34,146,2,
146,21,21,35,147,1,
147,21,21,35,148,1,
148,21,21,35,149,1,


################################################################################################################
DataFrame Distributions


Unnamed: 0,DistributionId,DistributionType,DistributionRequired,DistributionPenalty,ClassesList
0,1,SameAttendees,,,"[1, 31, 32, 37, 38, 39]"
1,2,SameAttendees,,,"[3, 4, 5, 28, 29, 30]"
2,3,SameAttendees,,,"[40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51]"
3,4,SameAttendees,,,"[40, 37]"
4,5,SameAttendees,,,"[41, 37]"
...,...,...,...,...,...
77,78,NotOverlap,,,"[123, 124, 125]"
78,79,NotOverlap,,,"[132, 130, 133, 134, 131]"
79,80,NotOverlap,,,"[28, 29, 30]"
80,81,NotOverlap,,,"[79, 80, 81, 82, 83, 84, 85, 86, 87]"


################################################################################################################
DataFrame Students


Unnamed: 0,StudentId,CoursesList
0,1,"[1, 19, 20, 7, 8]"
1,2,"[1, 19, 7, 8, 11]"
2,3,"[19, 4, 5, 8, 9]"
3,4,"[3, 5, 6, 9, 15]"
4,5,"[3, 5, 6, 9, 15]"
5,6,"[3, 5, 6, 9, 15]"
6,7,"[17, 2, 5, 6, 7, 10]"
7,8,"[3, 5, 6, 7, 10]"
8,9,"[17, 5, 6, 9, 10]"
9,10,"[17, 4, 5, 6, 9]"


################################################################################################################
DataFrame RelationClassRoom


Unnamed: 0,ClassId,RoomId,RoomPenalty
0,1,6,4
1,1,7,0
2,1,3,0
3,2,6,0
4,3,6,0
...,...,...,...
426,146,3,0
427,147,2,0
428,148,2,0
429,149,2,0


################################################################################################################
DataFrame RelationClassTime


Unnamed: 0,ClassId,TimeDays,TimeStart,TimeLength,TimeWeeks,TimePenalty
0,1,1010100,102,10,1111111111111111,16
1,1,1010100,114,10,1111111111111111,0
2,1,1010100,126,10,1111111111111111,0
3,1,1010100,138,10,1111111111111111,0
4,1,1010100,150,10,1111111111111111,0
...,...,...,...,...,...,...
4612,150,1000000,198,10,1111111111111111,0
4613,150,0100000,198,10,1111111111111111,0
4614,150,0010000,198,10,1111111111111111,0
4615,150,0001000,198,10,1111111111111111,0


################################################################################################################
