In [9]:
import lxml.etree as ET

def fast_iter(context):
    """
    Consume an ``iterparse`` event stream and return how many events it held.

    Every element is emptied as soon as it has been visited, and the siblings
    that precede it (or precede any of its ancestors) are detached from their
    parent, so the partially built tree does not keep growing while a large
    document is streamed.

    The clean-up pattern is Liza Daly's fast_iter:
    http://lxml.de/parsing.html#modifying-the-tree
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm

    ``context`` is an iterable of ``(event, element)`` pairs; the return value
    is the number of pairs consumed. A progress line is printed every
    1,000,000 pairs.
    """
    row_counter = 0
    for row_counter, (_event, node) in enumerate(context, start=1):
        # Nothing below this node is read afterwards, so its content can go.
        node.clear()
        # Walk root -> node and drop whatever still sits in front of each link
        # of that chain; those siblings have already been processed.
        for link in node.xpath('ancestor-or-self::*'):
            while link.getprevious() is not None:
                del link.getparent()[0]
        if not row_counter % 1000000:
            print(row_counter)
    del context
    return row_counter

In [10]:
%%time
# Stream data/Posts.xml, subscribing only to element 'end' events, and hand the
# event stream to fast_iter. row_counter receives fast_iter's return value.
# NOTE(review): the count is per 'end' event, so it presumably includes the
# document root as well as the individual rows - confirm if an exact row count
# is needed.
context = ET.iterparse('data/Posts.xml', events=('end', ))
row_counter = fast_iter(context)

1000000
2000000
3000000
4000000
5000000
6000000
7000000
8000000
9000000
10000000
11000000
12000000
13000000
14000000
15000000
16000000
17000000
18000000
19000000
20000000
21000000
22000000
23000000
24000000
25000000
26000000
27000000
28000000
29000000
30000000
31000000
32000000
33000000
34000000
35000000
36000000
37000000
38000000
39000000
40000000
CPU times: user 14min 57s, sys: 10.6 s, total: 15min 8s
Wall time: 15min 7s


In [12]:
# Ratio of the number of parsed events to 5615558.
# NOTE(review): the origin of 5615558 is not shown in this notebook; consider
# naming it as a constant so the meaning of this ratio is clear.
row_counter / 5615558

7.247836813367434

In [9]:
%%time
import lxml.etree as ET

def fast_iter(context, max_rows=1000000):
    """
    Scan an ``iterparse`` event stream, print the attributes of every element
    whose ``LinkTypeId`` attribute equals ``'3'``, and return the number of
    events consumed.

    Each visited element is cleared and the siblings preceding it (or
    preceding any of its ancestors) are removed from their parent, following
    Liza Daly's fast_iter pattern:
    http://lxml.de/parsing.html#modifying-the-tree
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm

    Parameters
    ----------
    context : iterable of (event, element) pairs
        Typically the object returned by ``ET.iterparse(...)``.
    max_rows : int or None, optional
        Stop after this many events. The default, 1000000, is the limit that
        used to be hard-coded; ``None`` reads the stream to the end.

    Returns
    -------
    int
        Number of events consumed.

    Notes
    -----
    One line is printed per matching element. On a large file this can exceed
    the notebook's IOPub data-rate limit (see the recorded output of this
    cell), so collect matches instead of printing when many are expected.
    """
    from itertools import islice

    row_counter = 0
    for row_counter, (_event, elem) in enumerate(islice(context, max_rows), start=1):
        # .get() instead of [...]: an element without a LinkTypeId attribute
        # (for instance the document root, which also produces an 'end'
        # event) must not abort the scan with a KeyError.
        if elem.attrib.get('LinkTypeId') == '3':
            print(elem.attrib)
        # Nothing below this element is read afterwards, so free its content.
        elem.clear()
        # Also drop the already-processed siblings in front of the element
        # and in front of each of its ancestors.
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        if row_counter % 1000000 == 0:
            print(row_counter)
    return row_counter


# Stream data/PostLinks.xml ('end' events only) through fast_iter;
# row_counter receives the number of events fast_iter reports.
context = ET.iterparse('data/PostLinks.xml', events=('end', ))
row_counter = fast_iter(context)

{'Id': '749670241', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '70714', 'RelatedPostId': '355934', 'LinkTypeId': '3'}
{'Id': '749670246', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '86947', 'RelatedPostId': '73713', 'LinkTypeId': '3'}
{'Id': '749670247', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '97679', 'RelatedPostId': '85122', 'LinkTypeId': '3'}
{'Id': '749670248', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '99408', 'RelatedPostId': '1711', 'LinkTypeId': '3'}
{'Id': '749670249', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '26925', 'RelatedPostId': '8127', 'LinkTypeId': '3'}
{'Id': '749670250', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '73185', 'RelatedPostId': '672', 'LinkTypeId': '3'}
{'Id': '749670251', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '81150', 'RelatedPostId': '48935', 'LinkTypeId': '3'}
{'Id': '749670252', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '110362', 'RelatedPostId': '1854', 'LinkT

{'Id': '749673771', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '887552', 'RelatedPostId': '37628', 'LinkTypeId': '3'}
{'Id': '749673772', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '890292', 'RelatedPostId': '789659', 'LinkTypeId': '3'}
{'Id': '749673773', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '1213265', 'RelatedPostId': '408670', 'LinkTypeId': '3'}
{'Id': '749673774', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '1213265', 'RelatedPostId': '1212797', 'LinkTypeId': '3'}
{'Id': '749673775', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '1217341', 'RelatedPostId': '780741', 'LinkTypeId': '3'}
{'Id': '749673776', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '1219531', 'RelatedPostId': '805715', 'LinkTypeId': '3'}
{'Id': '749673777', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '1220135', 'RelatedPostId': '159442', 'LinkTypeId': '3'}
{'Id': '749673778', 'CreationDate': '2013-02-18T03:03:16.917', 'PostId': '1220536', 'Related

{'Id': '749678508', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2109460', 'RelatedPostId': '1984128', 'LinkTypeId': '3'}
{'Id': '749678509', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2117946', 'RelatedPostId': '70159', 'LinkTypeId': '3'}
{'Id': '749678510', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2119711', 'RelatedPostId': '409087', 'LinkTypeId': '3'}
{'Id': '749678511', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2122319', 'RelatedPostId': '257288', 'LinkTypeId': '3'}
{'Id': '749678512', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2802788', 'RelatedPostId': '388242', 'LinkTypeId': '3'}
{'Id': '749678513', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2803437', 'RelatedPostId': '2802968', 'LinkTypeId': '3'}
{'Id': '749678514', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2803488', 'RelatedPostId': '24551', 'LinkTypeId': '3'}
{'Id': '749678515', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '2805209', 'Relat

{'Id': '749681658', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3605530', 'RelatedPostId': '244226', 'LinkTypeId': '3'}
{'Id': '749681659', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3609740', 'RelatedPostId': '3276578', 'LinkTypeId': '3'}
{'Id': '749681660', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3890062', 'RelatedPostId': '3894621', 'LinkTypeId': '3'}
{'Id': '749681661', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3895360', 'RelatedPostId': '3102918', 'LinkTypeId': '3'}
{'Id': '749681662', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3903180', 'RelatedPostId': '3220009', 'LinkTypeId': '3'}
{'Id': '749681663', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3904141', 'RelatedPostId': '194397', 'LinkTypeId': '3'}
{'Id': '749681664', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3906081', 'RelatedPostId': '363681', 'LinkTypeId': '3'}
{'Id': '749681665', 'CreationDate': '2013-02-18T03:08:16.490', 'PostId': '3611446', 'R

{'Id': '749686336', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '2839180', 'RelatedPostId': '762162', 'LinkTypeId': '3'}
{'Id': '749686337', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '2847702', 'RelatedPostId': '2847683', 'LinkTypeId': '3'}
{'Id': '749686338', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4148541', 'RelatedPostId': '1472633', 'LinkTypeId': '3'}
{'Id': '749686339', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4151732', 'RelatedPostId': '1694388', 'LinkTypeId': '3'}
{'Id': '749686340', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4151732', 'RelatedPostId': '1827704', 'LinkTypeId': '3'}
{'Id': '749686341', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4166894', 'RelatedPostId': '2354210', 'LinkTypeId': '3'}
{'Id': '749686342', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4168447', 'RelatedPostId': '6123507', 'LinkTypeId': '3'}
{'Id': '749686343', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '2861908', 

{'Id': '749689476', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4961070', 'RelatedPostId': '4961134', 'LinkTypeId': '3'}
{'Id': '749689477', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4961284', 'RelatedPostId': '1474249', 'LinkTypeId': '3'}
{'Id': '749689478', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4961284', 'RelatedPostId': '4607081', 'LinkTypeId': '3'}
{'Id': '749689479', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4965849', 'RelatedPostId': '812133', 'LinkTypeId': '3'}
{'Id': '749689480', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4976646', 'RelatedPostId': '2491419', 'LinkTypeId': '3'}
{'Id': '749689481', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4977646', 'RelatedPostId': '4977637', 'LinkTypeId': '3'}
{'Id': '749689482', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4983144', 'RelatedPostId': '1698750', 'LinkTypeId': '3'}
{'Id': '749689483', 'CreationDate': '2013-02-18T03:13:16.453', 'PostId': '4984080', 

{'Id': '749694151', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '6145225', 'RelatedPostId': '4624536', 'LinkTypeId': '3'}
{'Id': '749694152', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '5458116', 'RelatedPostId': '4306997', 'LinkTypeId': '3'}
{'Id': '749694153', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '5461324', 'RelatedPostId': '814501', 'LinkTypeId': '3'}
{'Id': '749694154', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '5464064', 'RelatedPostId': '167179', 'LinkTypeId': '3'}
{'Id': '749694155', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '5468025', 'RelatedPostId': '5467828', 'LinkTypeId': '3'}
{'Id': '749694156', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '5470765', 'RelatedPostId': '5266350', 'LinkTypeId': '3'}
{'Id': '749694157', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '5475831', 'RelatedPostId': '5474790', 'LinkTypeId': '3'}
{'Id': '749694158', 'CreationDate': '2013-02-18T03:18:16.653', 'PostId': '5846625', '

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'Id': '749740428', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '11911752', 'RelatedPostId': '11900730', 'LinkTypeId': '3'}
{'Id': '749740429', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12177186', 'RelatedPostId': '348792', 'LinkTypeId': '3'}
{'Id': '749740430', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12181330', 'RelatedPostId': '44965', 'LinkTypeId': '3'}
{'Id': '749740431', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12182717', 'RelatedPostId': '6599770', 'LinkTypeId': '3'}
{'Id': '749740432', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12183453', 'RelatedPostId': '3776781', 'LinkTypeId': '3'}
{'Id': '749740433', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '10684145', 'RelatedPostId': '5415452', 'LinkTypeId': '3'}
{'Id': '749740434', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '10687702', 'RelatedPostId': '10688314', 'LinkTypeId': '3'}
{'Id': '749740435', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '106

{'Id': '749743694', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '13443654', 'RelatedPostId': '9280921', 'LinkTypeId': '3'}
{'Id': '749743695', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12971310', 'RelatedPostId': '57483', 'LinkTypeId': '3'}
{'Id': '749743696', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12971310', 'RelatedPostId': '9488894', 'LinkTypeId': '3'}
{'Id': '749743697', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12974641', 'RelatedPostId': '2318650', 'LinkTypeId': '3'}
{'Id': '749743698', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12615287', 'RelatedPostId': '3647714', 'LinkTypeId': '3'}
{'Id': '749743699', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12619431', 'RelatedPostId': '12524012', 'LinkTypeId': '3'}
{'Id': '749743700', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '12620367', 'RelatedPostId': '6196526', 'LinkTypeId': '3'}
{'Id': '749743701', 'CreationDate': '2013-02-18T03:53:24.857', 'PostId': '126

{'Id': '749748440', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '12446794', 'RelatedPostId': '7740646', 'LinkTypeId': '3'}
{'Id': '749748441', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '12447237', 'RelatedPostId': '7022383', 'LinkTypeId': '3'}
{'Id': '749748442', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '13876551', 'RelatedPostId': '367786', 'LinkTypeId': '3'}
{'Id': '749748443', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '13880577', 'RelatedPostId': '7304257', 'LinkTypeId': '3'}
{'Id': '749748444', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '13882299', 'RelatedPostId': '7778392', 'LinkTypeId': '3'}
{'Id': '749748445', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '13882299', 'RelatedPostId': '9380594', 'LinkTypeId': '3'}
{'Id': '749748446', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '13887498', 'RelatedPostId': '1026584', 'LinkTypeId': '3'}
{'Id': '749748447', 'CreationDate': '2013-02-18T03:58:25.057', 'PostId': '132

{'Id': '749753047', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '12726033', 'RelatedPostId': '120228', 'LinkTypeId': '3'}
{'Id': '749753048', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '12728356', 'RelatedPostId': '906499', 'LinkTypeId': '3'}
{'Id': '749753049', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '12728406', 'RelatedPostId': '1733073', 'LinkTypeId': '3'}
{'Id': '749753050', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '12729827', 'RelatedPostId': '257288', 'LinkTypeId': '3'}
{'Id': '749753051', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '12773386', 'RelatedPostId': '6055347', 'LinkTypeId': '3'}
{'Id': '749753052', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '12779538', 'RelatedPostId': '7701771', 'LinkTypeId': '3'}
{'Id': '749753053', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '12780040', 'RelatedPostId': '273516', 'LinkTypeId': '3'}
{'Id': '749753054', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '127810

{'Id': '749755352', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '13560321', 'RelatedPostId': '949433', 'LinkTypeId': '3'}
{'Id': '749755353', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '14489645', 'RelatedPostId': '6841830', 'LinkTypeId': '3'}
{'Id': '749755354', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '14494632', 'RelatedPostId': '822599', 'LinkTypeId': '3'}
{'Id': '749755355', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '14495302', 'RelatedPostId': '10667605', 'LinkTypeId': '3'}
{'Id': '749755356', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '14495602', 'RelatedPostId': '2273691', 'LinkTypeId': '3'}
{'Id': '749755357', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '14496195', 'RelatedPostId': '3810570', 'LinkTypeId': '3'}
{'Id': '749755358', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '14141903', 'RelatedPostId': '451415', 'LinkTypeId': '3'}
{'Id': '749755359', 'CreationDate': '2013-02-18T04:03:30.310', 'PostId': '1414

{'Id': '749760128', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13997140', 'RelatedPostId': '720502', 'LinkTypeId': '3'}
{'Id': '749760129', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13997523', 'RelatedPostId': '1827314', 'LinkTypeId': '3'}
{'Id': '749760130', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13997523', 'RelatedPostId': '8028957', 'LinkTypeId': '3'}
{'Id': '749760131', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13993042', 'RelatedPostId': '2475652', 'LinkTypeId': '3'}
{'Id': '749760132', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13993076', 'RelatedPostId': '9707938', 'LinkTypeId': '3'}
{'Id': '749760133', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13993867', 'RelatedPostId': '1159227', 'LinkTypeId': '3'}
{'Id': '749760134', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13998815', 'RelatedPostId': '908672', 'LinkTypeId': '3'}
{'Id': '749760135', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '1398

{'Id': '749763381', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13333658', 'RelatedPostId': '185844', 'LinkTypeId': '3'}
{'Id': '749763382', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13334101', 'RelatedPostId': '81656', 'LinkTypeId': '3'}
{'Id': '749763383', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '13335087', 'RelatedPostId': '13277350', 'LinkTypeId': '3'}
{'Id': '749763384', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '14730993', 'RelatedPostId': '1483047', 'LinkTypeId': '3'}
{'Id': '749763385', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '14739481', 'RelatedPostId': '8447868', 'LinkTypeId': '3'}
{'Id': '749763386', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '14748242', 'RelatedPostId': '2973202', 'LinkTypeId': '3'}
{'Id': '749763387', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '14749813', 'RelatedPostId': '122942', 'LinkTypeId': '3'}
{'Id': '749763388', 'CreationDate': '2013-02-18T04:08:30.447', 'PostId': '14749

{'Id': '751634169', 'CreationDate': '2013-02-21T12:19:51.507', 'PostId': '15000629', 'RelatedPostId': '15000126', 'LinkTypeId': '3'}
{'Id': '751639899', 'CreationDate': '2013-02-21T12:32:36.413', 'PostId': '14999732', 'RelatedPostId': '9845795', 'LinkTypeId': '3'}
{'Id': '751641124', 'CreationDate': '2013-02-21T12:35:19.953', 'PostId': '15002270', 'RelatedPostId': '605497', 'LinkTypeId': '3'}
{'Id': '751642556', 'CreationDate': '2013-02-21T12:38:28.160', 'PostId': '14996265', 'RelatedPostId': '6714628', 'LinkTypeId': '3'}
{'Id': '751643730', 'CreationDate': '2013-02-21T12:41:12.710', 'PostId': '15002139', 'RelatedPostId': '275944', 'LinkTypeId': '3'}
{'Id': '751657913', 'CreationDate': '2013-02-21T13:11:57.617', 'PostId': '14988956', 'RelatedPostId': '3609231', 'LinkTypeId': '3'}
{'Id': '751658072', 'CreationDate': '2013-02-21T13:12:13.053', 'PostId': '15002994', 'RelatedPostId': '1125084', 'LinkTypeId': '3'}
{'Id': '751660487', 'CreationDate': '2013-02-21T13:16:56.950', 'PostId': '150

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [65]:
import pandas as pd
import sqlite3
# Base name of the SQLite database; the file opened is '<name>.db'.
name = 'StackOverflow2'

connection = sqlite3.connect('{}.db'.format(name))
# NOTE(review): this cursor is not used anywhere in this cell; pd.read_sql
# below works directly on the connection.
c = connection.cursor()

# Fetch the title of the posts row whose comment_id is 238452 and show it.
df = pd.read_sql("""SELECT title FROM posts WHERE comment_id == 238452""", connection)
df.title[0]

'C++ RTTI Viable Examples'

In [40]:
# Fetch every column of the posts row whose comment_id is 17434 and display it.
df2 = pd.read_sql("""SELECT * FROM posts WHERE comment_id == 17434""", connection)
df2

Unnamed: 0,comment_id,parent_id,comment,title,date,score,tags
0,17434,,<p>I have been reading through the <a href='ht...,When should you use 'friend' in C++?,2008-08-20,310,c++ oop encapsulation friend


In [52]:
# Normalize the title stored in the first row of df.
# NOTE(review): the recorded output of this cell does not match the title
# shown by the cell that builds df, and the execution counts are out of
# order - df was presumably re-queried between runs; confirm on a fresh kernel.
clean_text(df.title.values[0])

'When to use friend class in C++'

In [51]:
# Normalize the title stored in the first row of df2.
clean_text(df2.title.values[0])

"When should you use 'friend' in C++?"

In [48]:
import html
import re

re1 = re.compile(r'  +')  # runs of two or more spaces, collapsed to one below

# Entity fix-ups applied by clean_text, in order. The leading '&' is optional
# so that both well-formed entities ('&amp;') and fragments that already lost
# their ampersand ('amp;') are rewritten. Matching only the fragment, as the
# previous str.replace chain did, turned '&amp;' into '&&' and '&#39;' into
# "&'" because the original ampersand was left behind.
_ENTITY_FIXUPS = (
    (re.compile(r'&?#39;'), "'"),
    (re.compile(r'&?amp;'), '&'),
    (re.compile(r'&?#146;'), "'"),
    (re.compile(r'&?nbsp;'), ' '),
    (re.compile(r'&?#36;'), '$'),
    (re.compile(r'&?quot;'), "'"),
)

# Literal (old, new) substitutions applied by clean_text after the entity
# fix-ups, in this order.
_LITERAL_FIXUPS = (
    ('\\n', "\n"),      # a literal backslash + 'n' becomes a real newline
    ('<br />', "\n"),
    ('\\"', ''),
    ('<unk>', 'u_n'),
    (' @.@ ', '.'),
    (' @-@ ', '-'),
    ('\\', ' \\ '),     # pad remaining backslashes with spaces
    ('"', "'"),
    ('\n', ' '),        # finally flatten the text onto one line
    ('\r', ' '),
)


def clean_tags(x):
    """
    Turn an angle-bracket tag string into a space-separated one.

    '<' is removed, '>' becomes a space, double quotes become single quotes,
    the result is stripped, HTML entities are unescaped and runs of spaces
    are collapsed. Example: ``'<c++><oop>'`` -> ``'c++ oop'``.
    """
    x = x.replace('<', '').replace('>', ' ').replace('"', "'").strip()
    return re1.sub(' ', html.unescape(x))


def clean_text(x, remove_html=True):
    """
    Normalize a piece of post text into a single clean line.

    Parameters
    ----------
    x : str
        Raw text, possibly containing HTML markup and entities.
    remove_html : bool, optional
        When True (default), ``<code>...</code>`` spans whose content holds
        no '>' are removed together with their content, then every remaining
        ``<...>`` tag is stripped.

    Returns
    -------
    str
        The text with entities resolved, double quotes replaced by single
        quotes, newlines and carriage returns replaced by spaces, surrounding
        whitespace stripped and runs of spaces collapsed.
    """
    if remove_html:
        x = re.sub(r'<code>[^>]*</code>', '', x)
        x = re.sub(r'<[^>]*>', '', x)
    for pattern, replacement in _ENTITY_FIXUPS:
        x = pattern.sub(replacement, x)
    for old, new in _LITERAL_FIXUPS:
        x = x.replace(old, new)
    # Any entity not covered by the fix-ups above is resolved here.
    return re1.sub(' ', html.unescape(x.strip()))


In [71]:
%%time
import lxml.etree as ET


# Every 'Id' attribute value seen by fast_iter, in document order (as strings).
all_ids = []
def fast_iter(context, max_rows=1000000):
    """
    Scan an ``iterparse`` event stream, append each element's ``Id`` attribute
    to the module-level list ``all_ids``, and return the number of events
    consumed.

    Each visited element is cleared and the siblings preceding it (or
    preceding any of its ancestors) are removed from their parent, following
    Liza Daly's fast_iter pattern:
    http://lxml.de/parsing.html#modifying-the-tree
    http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
    See also http://effbot.org/zone/element-iterparse.htm

    Parameters
    ----------
    context : iterable of (event, element) pairs
        Typically the object returned by ``ET.iterparse(...)``.
    max_rows : int or None, optional
        Stop after this many events. The default, 1000000, is the limit that
        used to be hard-coded; ``None`` reads the stream to the end.

    Returns
    -------
    int
        Number of events consumed (elements without an ``Id`` are counted
        but contribute nothing to ``all_ids``).
    """
    from itertools import islice

    row_counter = 0
    for row_counter, (_event, elem) in enumerate(islice(context, max_rows), start=1):
        # .get() instead of [...]: an element without an Id attribute (for
        # instance the document root, which also produces an 'end' event)
        # must not abort the scan with a KeyError.
        row_id = elem.attrib.get('Id')
        if row_id is not None:
            all_ids.append(row_id)
        # Nothing below this element is read afterwards, so free its content.
        elem.clear()
        # Also drop the already-processed siblings in front of the element
        # and in front of each of its ancestors.
        for ancestor in elem.xpath('ancestor-or-self::*'):
            while ancestor.getprevious() is not None:
                del ancestor.getparent()[0]
        if row_counter % 1000000 == 0:
            print(row_counter)
    return row_counter


# Stream data/Comments.xml ('end' events only) through fast_iter, which fills
# all_ids as a side effect; row_counter receives the number of events read.
context = ET.iterparse('data/Comments.xml', events=('end', ))
row_counter = fast_iter(context)

1000000
CPU times: user 12.4 s, sys: 17.6 ms, total: 12.5 s
Wall time: 12.5 s


In [72]:
# Report, for each of these ids, whether it was collected in all_ids.
# The ids are compared as strings because all_ids holds attribute strings.
# all_ids is a list, so every `in` test is a linear scan.
for i in [94240, 280596, 353966, 630133, 1246697, 2047518, 2890958]:
    print(str(i) in all_ids)

True
False
True
True
True
False
False


In [69]:
# Same membership check as the cell above it, from a different execution.
# NOTE(review): the recorded results differ between the two runs, so all_ids
# presumably held ids from a different file or run here - confirm, and
# consider deleting this duplicate cell.
for i in [94240, 280596, 353966, 630133, 1246697, 2047518, 2890958]:
    print(str(i) in all_ids)

False
False
False
False
False
False
False


In [70]:
!ls data/

Answers.csv    Posts.xml      stackoverflow.com-Comments.7z  Tags.csv
PostLinks.xml  Questions.csv  stackoverflow.com-Posts.7z     Tags.xml
