In [1]:
import pandas as pd
import numpy as np

In [2]:
import xml.etree.ElementTree as ET

In [3]:
# Input corpus. To process the abstracts instead of the full papers, point
# CORPUS_PATH at
# "C:/Users/Sri/Google Drive/Data Science/Python/Data_formats/bio_xml_parse/original_data_jonas/abstracts.xml"
CORPUS_PATH = "C:/Users/Sri/Google Drive/Data Science/Python/Data_formats/bio_xml_parse/original_data_jonas/full_papers.xml"

# Parse the file once; `tree` and `root` are used by the cells below.
tree = ET.parse(CORPUS_PATH)
root = tree.getroot()

### Simpler tagging

In [4]:
# Walk every <sentence> in the document and collect:
#   sentences    - the full sentence text,
#   negations    - the text of each top-level scope holding a negation cue,
#   speculations - the text of each top-level scope holding a speculation cue.
# Each list holds single-entry dicts {sentence id: text}; the DataFrame cells
# below unpack them in that shape.
sentences = []
negations = []
speculations = []


def _scopes_with_cue(sentence, cue_type):
    """Return the <xcope> children of ``sentence`` that directly contain a <cue> of ``cue_type``.

    The path uses the documented ElementTree XPath forms: the attribute
    predicate is attached to the ``cue`` tag, and ``/..`` steps back up from
    the matching cue to its enclosing <xcope>. Only <xcope> elements that are
    direct children of the sentence are considered; scopes nested inside
    another scope are not visited.
    """
    return sentence.findall("./xcope/cue[@type='{}']/..".format(cue_type))


for sent in root.iter('sentence'):
    sent_id = sent.attrib['id']
    # itertext() concatenates the text of the element and all its descendants.
    sentences.append({sent_id: "".join(sent.itertext())})
    for scope in _scopes_with_cue(sent, 'negation'):
        negations.append({sent_id: "".join(scope.itertext())})
    for scope in _scopes_with_cue(sent, 'speculation'):
        speculations.append({sent_id: "".join(scope.itertext())})

In [5]:
# Unpack the {sid: sentence text} records into two parallel lists (document
# order is preserved) and build the frame with a single constructor call.
sid = [key for record in sentences for key in record]
sents = [text for record in sentences for text in record.values()]

sent_df = pd.DataFrame({'sid': sid, 'sentences': sents})

sent_df.head()

Unnamed: 0,sid,sentences
0,S1.1,Mining prokaryotic genomes for unknown amino a...
1,S1.2,Abstract
2,S1.3,Background
3,S1.4,Selenocysteine and pyrrolysine are the 21st an...
4,S1.5,Since a number of microbial genomes have been ...


In [9]:
sent_df.shape

(2670, 2)

In [10]:
# Unpack the {sid: scope text} negation records into parallel lists; a sentence
# id appears once per negation scope found in that sentence.
neg_sid = [key for record in negations for key in record]
neg_scopes = [text for record in negations for text in record.values()]

negations_df = pd.DataFrame({'sid': neg_sid, 'neg_scopes': neg_scopes})
negations_df.head()

Unnamed: 0,sid,neg_scopes
0,S1.6,no tRNA gene for unknown amino acid was found ...
1,S1.13,no promising candidate for the 23rd amino acid...
2,S1.18,not terminate at a stop codon
3,S1.19,not one of the 20 amino acids
4,S1.34,no likely tRNA of the novel amino acid was det...


In [11]:
# Unpack the {sid: scope text} speculation records into parallel lists; a
# sentence id appears once per speculation scope found in that sentence.
spec_sid = [key for record in speculations for key in record]
spec_scopes = [text for record in speculations for text in record.values()]

speculations_df = pd.DataFrame({'sid': spec_sid, 'spec_scopes': spec_scopes})
speculations_df.head()

Unnamed: 0,sid,spec_scopes
0,S1.5,whether the 23rd amino acid is left undiscover...
1,S1.7,performance of the tRNA prediction program on ...
2,S1.10,Assuming that the 23rd amino acid is also enco...
3,S1.15,suggests that the unknown amino acid encoded b...
4,S1.27,suggests that its incorporation into the genet...


In [12]:
# Left-join both scope tables onto the sentences by sentence id ('sid' is the
# only column the frames share). Every sentence is kept; a sentence with
# several scopes gets one row per scope, and a sentence with both negation and
# speculation scopes gets one row per (negation, speculation) pair, so the
# result can have more rows than sent_df.
bio_df = sent_df.merge(negations_df, how='left', on='sid')
bio_df_scopes = bio_df.merge(speculations_df, how='left', on='sid')
bio_df_scopes.shape

(2742, 4)

In [13]:
bio_df_scopes.head(3)

Unnamed: 0,sid,sentences,neg_scopes,spec_scopes
0,S1.1,Mining prokaryotic genomes for unknown amino a...,,
1,S1.2,Abstract,,
2,S1.3,Background,,


In [14]:
# Store each row's position in an 'oidx' column so document order can be
# restored after the groupby in the next cell (groupby sorts by 'sid').
# Any existing 'oidx' column is dropped first and the frame is rebuilt rather
# than mutated in place, so re-running this cell gives the same result; the
# previous in-place reset/rename added a second 'oidx' column on a re-run.
bio_df_scopes = (
    bio_df_scopes
    .drop(columns='oidx', errors='ignore')
    .rename_axis('oidx')
    .reset_index()
)
bio_df_scopes.head(5)

Unnamed: 0,oidx,sid,sentences,neg_scopes,spec_scopes
0,0,S1.1,Mining prokaryotic genomes for unknown amino a...,,
1,1,S1.2,Abstract,,
2,2,S1.3,Background,,
3,3,S1.4,Selenocysteine and pyrrolysine are the 21st an...,,
4,4,S1.5,Since a number of microbial genomes have been ...,,whether the 23rd amino acid is left undiscover...


In [15]:
def _unique_scopes(values):
    """Return the distinct entries of one sentence's scope column as a list, in first-seen order.

    bio_df_scopes holds one row per (negation scope, speculation scope) pair of
    a sentence, so a plain ``list`` aggregation repeats every negation scope
    once per speculation scope and vice versa. Dropping duplicates undoes that
    repetition. Missing values collapse to a single NaN, so a sentence without
    scopes still yields ``[nan]``.

    NOTE(review): two genuinely separate scopes with identical text in the same
    sentence would also be merged into one entry.
    """
    return values.drop_duplicates().tolist()


# One row per sentence: keep the first position/text and gather the scope
# texts into lists, then restore document order via 'oidx'.
bio_df_final = (
    bio_df_scopes
    .groupby('sid')
    .agg({
        'oidx': 'first',
        'sentences': 'first',
        'neg_scopes': _unique_scopes,
        'spec_scopes': _unique_scopes,
    })
    .reset_index()
    .sort_values('oidx')
)

In [16]:
bio_df_final.head(5)

Unnamed: 0,sid,oidx,sentences,neg_scopes,spec_scopes
0,S1.1,0,Mining prokaryotic genomes for unknown amino a...,[nan],[nan]
111,S1.2,1,Abstract,[nan],[nan]
222,S1.3,2,Background,[nan],[nan]
261,S1.4,3,Selenocysteine and pyrrolysine are the 21st an...,[nan],[nan]
272,S1.5,4,Since a number of microbial genomes have been ...,[nan],[whether the 23rd amino acid is left undiscove...


In [17]:
def _drop_missing(scopes):
    """Return ``scopes`` without missing (NaN/None) entries.

    A list holding only missing placeholders becomes ``[]``. An already empty
    list is returned empty, so this cell can be re-run safely; the previous
    ``x[0]`` check raised IndexError on ``[]``.
    """
    return [scope for scope in scopes if not pd.isnull(scope)]


bio_df_final['neg_scopes'] = bio_df_final['neg_scopes'].apply(_drop_missing)
bio_df_final['spec_scopes'] = bio_df_final['spec_scopes'].apply(_drop_missing)

In [18]:
bio_df_final.iloc[100:150,:]

Unnamed: 0,sid,oidx,sentences,neg_scopes,spec_scopes
3,S1.101,101,The result of the above homology searches was ...,[],[]
4,S1.102,102,An iORF was discarded if there were any BLAST ...,[not cover the inframe stop codon],[]
5,S1.103,103,"A total of 26,003 iORF satisfied the above cri...",[],[]
6,S1.104,104,To examine intrafamily conservation of the inf...,[],[]
7,S1.105,105,"After removal of singletons, 679 clusters with...",[],[]
8,S1.106,106,A cluster was discarded unless all members of ...,[],[]
9,S1.107,107,The locations of the inframe stop codons were ...,[],[]
10,S1.108,108,These conditions reduced the number of cluster...,[],[]
11,S1.109,109,Manual inspection of these 273 clusters reveal...,[],[]
13,S1.110,110,"Hence, three-step filtering procedures were ap...",[],[]


In [19]:
bio_df_final.shape

(2670, 5)

In [20]:
bio_df_final.to_excel('bio_df__fullpapers_final.xlsx', index=False)

In [None]:
len(sentences)

In [None]:
# Print the text of every <cue> element found anywhere under the root.
for cue in root.iter('cue'):
    print(cue.text)

In [None]:
# Scratch exploration: for each <sentence>, print the leading text and the tag
# of the sentence element itself and of every element nested inside it.
# NOTE: this rebinds `sentences` and `scopes` to empty lists, discarding the
# `sentences` records collected by the extraction cell earlier in the notebook.
sentences = []
scopes = []

for sentence in root.iter('sentence'):
    for scope in sentence.iter():
        print (scope.text, scope.tag)
# Disabled draft that would have filled the two lists in parallel instead of
# printing (sentence text with an empty scope entry, or scope text):
#         if(scope.tag == 'sentence'):
#             sentences.append(''.join(scope.itertext()))
#             scopes.append('')
#         elif(scope.tag == 'xcope'):
#             scopes.append(''.join(scope.itertext()))

In [None]:
sentences

In [None]:
scopes

In [None]:
sentences

In [None]:
scopes

In [None]:
for movie in root.findall("./genre/decade/movie/format/[@multiple='Yes']..."):
    print(movie.attrib)

In [None]:
import xml.etree.ElementTree as ET
#x =  # your xml file
tree = ET.parse("C:/Users/Sri/Google Drive/Data Science/Python/Data_formats/jonas_github/original_data_jonas/abstracts.xml")
root = tree.getroot()
# Concatenate the stripped leading text of each direct child of the root.
# A child with no leading text has ``text`` set to None, so fall back to ""
# instead of failing on ``None.strip()``. str.join also avoids rebuilding the
# string on every iteration.
string = "".join((child.text or "").strip() for child in root)
print(string)

In [None]:
print(ET.tostring(root, encoding='utf8').decode('utf8'))

In [None]:
root.tag

In [None]:
root.attrib

In [None]:
for child in root:
    print (child.tag, child.attrib)

In [None]:
#To see all the elements in the entire tree

In [None]:
# A helpful way to see the whole document

In [None]:
print(ET.tostring(root, encoding='utf8').decode('utf8'))

In [None]:
def all_texts(root):
    """Yield the text fragments that sit directly inside ``root``.

    That is ``root.text`` (the text before the first child) followed by the
    ``tail`` of each direct child (the text after that child's closing tag).
    Text inside the children themselves is not included, and missing
    fragments (None) are skipped, matching the later definition of this
    function in the notebook.
    """
    if root.text is not None:
        yield root.text
    for child in root:
        if child.tail is not None:
            yield child.tail

In [None]:
# NOTE(review): `doc` is first assigned in a later cell (doc = et.fromstring(xml)),
# so on a fresh top-to-bottom run this cell raises NameError.
list(all_texts(doc))

In [None]:
import xml.etree.ElementTree as et

xml = """
<A>
    hello

    <annotation> NOT part of text </annotation>

    world
</A>"""

In [None]:
doc = et.fromstring(xml)

In [None]:
def all_texts(root):
    """Yield the non-missing text fragments owned directly by ``root``.

    The candidates are ``root.text`` followed by each direct child's ``tail``;
    any candidate that is None is skipped. Text inside the children themselves
    is not yielded.
    """
    candidates = [root.text]
    candidates.extend(child.tail for child in root)
    for fragment in candidates:
        if fragment is None:
            continue
        yield fragment

In [None]:
list(all_texts(doc))

In [None]:
# To look inside a particular element

In [None]:
len([sent.text for sent in root.iter('sentence')])

In [None]:
# Print the tail of each <sentence>, i.e. the text that follows its closing
# tag. The loop variable is used directly (the earlier version read the
# unrelated leftover name `sent`), and `tail` is already a string or None, so
# it has no `.text` attribute to dereference.
for child in root.iter('sentence'):
    print(child.tail)

In [None]:
for child in root.findall("./DocumentSet/Document/DocID/DocumentPart/sentence"):
    print (child.attrib, child.text)

In [None]:
# Leading text of every <xcope> element in the document. `.text` holds only
# the text before the element's first child (None when there is none), not
# the text of nested elements.
scopes = [xcope.text for xcope in root.iter('xcope')]
scopes

In [None]:
sents[3]

In [None]:
sents[4]

In [None]:
[movie.attrib for movie in root.iter('format')]

**XPath Expressions**
Many times elements will not have attributes; they will only have text content. Using the `.text` attribute, you can print out this content.

In [None]:
for description in root.iter('description'):
    print(description.text)

In [None]:
for format in root.iter('format'):
    print(format.text)

**Understanding XPath is critically important to scanning and populating XMLs. ElementTree has a .findall() function that will traverse the immediate children of the referenced element. You can use XPath expressions to specify more useful searches.**

In [None]:
for movie in root.findall("./genre/decade/movie/[year='1992']"):
    print(movie.attrib)

**The function .findall() always begins at the element specified. This type of function is extremely powerful for a "find and replace". You can even search on attributes!
Now, print out only the movies that are available in multiple formats (an attribute).**

In [None]:
for format in root.findall("./genre/decade/movie/format/[@multiple='Yes']"):
    print(format.attrib)

In [None]:
# With findall you can traverse the children and also the attributes of an element

In [None]:
for format in root.findall("./genre/decade/movie/[@multiple='Yes']"):
    print(format.attrib)
# We get nothing here because @multiple is an attribute of neither movie nor its children.

In [None]:
# To return the parent element of the current match, end the path with '..' (written as '...' in the cell below)

In [None]:
for movie in root.findall("./genre/decade/movie/format/[@multiple='Yes']..."):
    print(movie.attrib)

### Modifying an XML

In [None]:
for movie in root.iter('movie'):
    print (movie.attrib)

In [None]:
b2tf = root.find("./genre/decade/movie/[@title='Back 2 the Future']")
b2tf

Notice that using the .find() method returns an element of the tree. Much of the time, it is more useful to edit the content within an element.
Modify the title attribute of the Back 2 the Future element variable to read "Back to the Future". Then, print out the attributes of your variable to see your change. You can easily do this by accessing the attribute of an element and then assigning a new value to it:

In [None]:
b2tf.attrib["title"] = "Back to the Future"
print(b2tf.attrib)

Write out your changes back to the XML so they are permanently fixed in the document. Print out your movie attributes again to make sure your changes worked. Use the .write() method to do this:

In [None]:
# Serialise the in-memory tree to movies.xml, replacing any existing file of
# that name.
# NOTE(review): no earlier cell in this notebook parses movies.xml; the last
# assignment to `tree` above reads the BioScope abstracts file. On a
# top-to-bottom run this would therefore write that document to movies.xml.
# Confirm that `tree` holds the movies document before running.
tree.write("movies.xml") # this overwrites the xml file.
# Interesting: the edit was made through a variable (b2tf), but because that
# variable refers to an element inside `tree`, writing the tree saves the change.

# Re-read the file that was just written and rebind `tree` / `root` to it.
tree = ET.parse('movies.xml')
root = tree.getroot()

# Print every movie's attributes to check the saved result.
for movie in root.iter('movie'):
    print(movie.attrib)

Fixing Attributes
The multiple attribute is incorrect in some places. Use ElementTree to fix the designator based on how many formats the movie comes in. First, print the format attribute and text to see which parts need to be fixed.

In [None]:
for form in root.findall("./genre/decade/movie/format"):
    print(form.attrib, form.text)

In [None]:
import re

for form in root.findall("./genre/decade/movie/format"):
    # A comma in the format text means the movie is listed in more than one
    # format. A <format> element without text has ``text`` set to None; treat
    # that as "no comma" instead of raising a TypeError.
    has_multiple = ',' in (form.text or '')
    form.set('multiple', 'Yes' if has_multiple else 'No')

In [None]:
# Write out the tree to the file again
tree.write("movies.xml")

tree = ET.parse('movies.xml')
root = tree.getroot()

for form in root.findall("./genre/decade/movie/format"):
    print(form.attrib, form.text)

Moving Elements
Some of the data has been placed in the wrong decade. Use what you have learned about XML and ElementTree to find and fix the decade data errors.

It will be useful to print out both the decade tags and the year tags throughout the document.

In [None]:
for decade in root.findall("./genre/decade"):
    print(decade.attrib)
    for year in decade.findall("./movie/year"):
        print(year.text, '\n')

In [None]:
for decade in root.iter('decade'):
    print(decade.attrib)

The two movies that are in the wrong decade are the ones whose year is 2000. Figure out which movies those are, using an XPath expression.

In [None]:
for movie in root.findall("./genre/decade/movie/[year='2000']"):
    print (movie.attrib)

You have to add a new decade tag, the 2000s, to the Action genre in order to move the X-Men data. The `ET.SubElement()` function can be used to append this tag as the last child of the Action genre element.

In [None]:
# Make sure the Action genre has a <decade years="2000s"> child. An existing
# one is reused, so re-running this cell does not append a duplicate decade.
action = root.find("./genre[@category='Action']")
new_dec = action.find("./decade[@years='2000s']")
if new_dec is None:
    new_dec = ET.SubElement(action, 'decade', {'years': '2000s'})

In [None]:
print(ET.tostring(action, encoding='utf8').decode('utf8'))

Now append the X-Men movie to the 2000s and remove it from the 1990s, using .append() and .remove(), respectively.

In [None]:
# Move the X-Men <movie> from the Action genre's 1990s decade to its 2000s decade.
xmen = root.find("./genre/decade/movie[@title='X-Men']")
dec1990s = root.find("./genre[@category='Action']/decade[@years='1990s']")
dec2000s = root.find("./genre[@category='Action']/decade[@years='2000s']")

if xmen is None:
    # Fail with a clear message instead of the TypeError that append(None) raises.
    raise LookupError("no movie with title 'X-Men' found under ./genre/decade")

# Check membership before each step so the cell can be re-run after the move
# has already happened; Element.remove() raises ValueError for a non-child.
if xmen in list(dec1990s):
    dec1990s.remove(xmen)
if xmen not in list(dec2000s):
    dec2000s.append(xmen)

In [None]:
print(ET.tostring(action, encoding='utf8').decode('utf8'))

Build XML Documents
Nice, so you were able to essentially move an entire movie to a new decade. Save your changes back to the XML.

In [None]:
tree.write("movies.xml")

tree = ET.parse('movies.xml')
root = tree.getroot()

print(ET.tostring(root, encoding='utf8').decode('utf8'))