In [5]:
"""
Input: XML from Reference Manager.
Output: CSV of a DataFrame whose columns are extracted from the XML tags, plus a `useful` flag.
"""

import os
import xml.etree.cElementTree as ET

import pandas as pd

def get_elem(record, tag):
    """
    Collect the text of every element below `record` that matches `tag`.

    record: element to search; only its ``iterfind`` method is used.
    tag: path expression passed unchanged to ``record.iterfind``.

    Returns a list holding the stripped, UTF-8 encoded text of each match.
    Matches that carry no text at all (e.g. ``<style/>``) are skipped.
    When nothing usable is found the result is ``[None]``, so callers can
    always index ``[0]`` safely.
    """
    # iterfind() always hands back an iterator, so it needs no truthiness
    # check. elem.text is None for empty elements; calling .strip() on it
    # would raise AttributeError, hence the filter.
    element = [elem.text.strip().encode('utf-8')
               for elem in record.iterfind(tag)
               if elem.text is not None]
    return element if element else [None]

def parse_xml(filename):
    """
    Parse an XML file of <record> elements into a DataFrame and save it as CSV.

    filename: path of the XML file to read.

    Side effects: prints the number of <record> elements found and writes the
    DataFrame (without the index column) to a CSV whose path is `filename`
    with its final extension replaced by '.csv'.

    Returns the DataFrame, one row per record, with the columns
    filename, author, fulltitle, subtitle, pages, volume, number, keywords,
    year and abstract. `author` and `keywords` hold lists; the other columns
    hold the first matching value or None.
    """
    columns = ['filename','author','fulltitle','subtitle','pages',
               'volume','number','keywords','year','abstract']

    tree = ET.ElementTree(file=filename)

    # Materialise the records once instead of walking the tree twice
    # (once to count, once to parse).
    records = list(tree.iter(tag='record'))
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(len(records))

    # Parsing fields into Dataframe; only tags that feed an output column are read.
    data = []
    for record in records:
        def first(tag, record=record):
            # get_elem always returns a non-empty list ([None] when missing)
            return get_elem(record, tag)[0]

        data.append([
            first('urls/pdf-urls/url'),
            get_elem(record, 'contributors/authors/author/style'),
            first('titles/title/style'),
            first('titles/secondary-title/style'),
            first('pages'),
            first('volume/style'),
            first('number/style'),
            get_elem(record, 'keywords/keyword/style'),
            first('dates/year'),
            first('abstract/style'),
        ])

    df = pd.DataFrame(data, columns=columns)

    # Saving Dataframe as csv.
    # splitext swaps only the final extension. A plain str.replace('.xml', '.csv')
    # would also rewrite '.xml' inside directory names and, for a name without
    # a lowercase '.xml', leave the path unchanged and overwrite the input file.
    csv_filename = os.path.splitext(filename)[0] + '.csv'
    df.to_csv(csv_filename, index=False)

    return df

In [6]:
"""
Here is a summary of the data DKG provided:

664 PDFs about kidney (Niere.zip)
An XML with the metadata about the 664 PDFs (niere-zugeordnete-referenzen-xml.xml)
An XML with the metadata about the excluded PDFs (niere-ausgeschlossene-xml.xml)
"""

# Paths of the two XML exports: references assigned to the kidney PDFs,
# and references that were excluded.
classifiedfile = '../metadata/niere-example/niere-zugeordnete-referenzen-xml.xml'
excludedfile = '../metadata/niere-example/niere-ausgeschlossene-xml.xml'

# parse_xml prints the record count, writes a CSV and returns the DataFrame.
# Single-argument print() gives the same output on Python 2 and Python 3.
classified = parse_xml(classifiedfile)
print(classified.shape)

excluded = parse_xml(excludedfile)
print(excluded.shape)

classified.head()

658
(658, 10)
3796
(3796, 10)


Unnamed: 0,filename,author,fulltitle,subtitle,pages,volume,number,keywords,year,abstract
0,file://R:\Literatur\literatur_dialog\informati...,[=International Agency for Reseach on Cancer (...,Consumption of alcoholic beverages,IARC Monographs on the Evaluation of Carcinoge...,377-504,100 E,,"[basis,1-alk, endometrium,1-alk, endometrium,b...",2012,
1,file://R:\Literatur\literatur_dialog\informati...,"[Aass,N., De Mulder,P.H.M., Mickisch,G.H.J., M...",Randomized phase II/III trial of interferon al...,Journal of Clinical Oncology,4172-4178,23,18.0,"[niere,4-med-pall, niere,bereitgestellt, Quell...",2005,Purpose: A randomized phase II/III trial was c...
2,file://R:\Literatur\literatur_dialog\informati...,"[Abdel-Rahman,O., Fouad,M.]",Efficacy and toxicity of sunitinib for non cle...,Critical Reviews in Oncology/Hematology,238-250,94,2.0,"[niere,4-med-pall, niere,bereitgestellt, Quell...",2015,The randomized phase III trial of sunitinib ve...
3,file://R:\Literatur\literatur_dialog\informati...,"[Abdellateef,M.]",Laparoscopic Partial Nephrectomy: Expanding Ro...,World Journal of Laparoscopic Surgery,169-173,4,3.0,"[niere,4-op-lap, niere,bereitgestellt, Quelle,...",2011,Context: The increasing incidence of localized...
4,file://R:\Literatur\literatur_dialog\informati...,"[Abel,E Jason, Wood,Christopher G.]",Cytoreductive nephrectomy for metastatic RCC i...,Nature Reviews.Urology,375-383,6,7.0,"[niere,4-op-pall, niere,bereitgestellt, Quelle...",2009,Metastatic renal cell carcinoma (RCC) has trad...


In [7]:
# Add Flag: 1 marks the classified records, 0 the excluded ones.
# Note that this adds the column to the existing frames in place.
classified['useful'] = 1
excluded['useful'] = 0

# concat keeps each frame's own row labels, so index values repeat in full_df.
full_df = pd.concat([classified, excluded])

# Add ID for records: a running number 1..N across both sources.
full_df['num'] = range(1, len(full_df) + 1)

# Single-argument print() gives the same output on Python 2 and Python 3.
print(full_df.shape)

full_df.head()

(4454, 12)


Unnamed: 0,filename,author,fulltitle,subtitle,pages,volume,number,keywords,year,abstract,useful,num
0,file://R:\Literatur\literatur_dialog\informati...,[=International Agency for Reseach on Cancer (...,Consumption of alcoholic beverages,IARC Monographs on the Evaluation of Carcinoge...,377-504,100 E,,"[basis,1-alk, endometrium,1-alk, endometrium,b...",2012,,1,1
1,file://R:\Literatur\literatur_dialog\informati...,"[Aass,N., De Mulder,P.H.M., Mickisch,G.H.J., M...",Randomized phase II/III trial of interferon al...,Journal of Clinical Oncology,4172-4178,23,18.0,"[niere,4-med-pall, niere,bereitgestellt, Quell...",2005,Purpose: A randomized phase II/III trial was c...,1,2
2,file://R:\Literatur\literatur_dialog\informati...,"[Abdel-Rahman,O., Fouad,M.]",Efficacy and toxicity of sunitinib for non cle...,Critical Reviews in Oncology/Hematology,238-250,94,2.0,"[niere,4-med-pall, niere,bereitgestellt, Quell...",2015,The randomized phase III trial of sunitinib ve...,1,3
3,file://R:\Literatur\literatur_dialog\informati...,"[Abdellateef,M.]",Laparoscopic Partial Nephrectomy: Expanding Ro...,World Journal of Laparoscopic Surgery,169-173,4,3.0,"[niere,4-op-lap, niere,bereitgestellt, Quelle,...",2011,Context: The increasing incidence of localized...,1,4
4,file://R:\Literatur\literatur_dialog\informati...,"[Abel,E Jason, Wood,Christopher G.]",Cytoreductive nephrectomy for metastatic RCC i...,Nature Reviews.Urology,375-383,6,7.0,"[niere,4-op-pall, niere,bereitgestellt, Quelle...",2009,Metastatic renal cell carcinoma (RCC) has trad...,1,5


In [8]:
# Persist the combined, flagged records for the classification task.
# The row index is left out of the file.
full_df.to_csv(
    '../metadata/niere-example/classification_df.csv',
    index=False,
)

In [9]:
# dataframe stats: split the records by the `useful` flag
positive = full_df[full_df.useful == 1]
negative = full_df[full_df.useful == 0]

# print() with a single argument behaves the same on Python 2 and Python 3.
# float() keeps the percentage division from truncating under Python 2.
print("""all records: {0}
    positive records: {1} ({2:.1f}% of all records)
    negative records: {3} ({4:.1f}% of all records)""".format(len(full_df),
                                                              len(positive), float(len(positive))/len(full_df)*100,
                                                              len(negative), float(len(negative))/len(full_df)*100))

all records: 4454
    positive records: 658 (14.8% of all records)
    negative records: 3796 (85.2% of all records)
