### Parse TOI XML

In [None]:
import pandas as pd
from glob import glob
from zipfile import ZipFile
import xmltodict

In [9]:
output_file = 'toi_parsed_all.csv.gz'

### Step 1: Parse all XML files in each zip file to a csv file

In [11]:
cols = set()
for i, fn in enumerate(glob('./data/*.zip')):
    with ZipFile(fn, 'r') as zo:
        # Get list of files names in zip
        files = zo.namelist()
        rows = []
        for zfn in files:
            with zo.open(zfn) as f:
                d = xmltodict.parse(f)
                rows.append(d['Record'])
        df = pd.DataFrame(pd.json_normalize(rows))
        ofn = fn.replace('data', 'csv').replace('zip', 'csv.gz')
        cols.update(df.columns)
        df.to_csv(ofn, index=False, compression='gzip')
        print(i, ofn, len(df.columns))
    #break

0 ./csv/TimesOfIndia_20170928213619_00025.csv.gz 29
1 ./csv/TimesOfIndia_20170928222359_00036.csv.gz 29
2 ./csv/TimesOfIndia_20170928222610_00045.csv.gz 28
3 ./csv/TimesOfIndia_20170928214039_00039.csv.gz 28
4 ./csv/TimesOfIndia_20170928222609_00044.csv.gz 28
5 ./csv/TimesOfIndia_20170928222919_00005.csv.gz 28
6 ./csv/TimesOfIndia_20170928215024_00023.csv.gz 29
7 ./csv/TimesOfIndia_20170928221723_00014.csv.gz 28
8 ./csv/TimesOfIndia_20170928222141_00026.csv.gz 29
9 ./csv/TimesOfIndia_20170928214041_00040.csv.gz 29
10 ./csv/TimesOfIndia_20170928214811_00014.csv.gz 29
11 ./csv/TimesOfIndia_20170928223340_00019.csv.gz 29
12 ./csv/TimesOfIndia_20170928221407_00002.csv.gz 28
13 ./csv/TimesOfIndia_20170928222920_00006.csv.gz 28
14 ./csv/TimesOfIndia_20170928213413_00021.csv.gz 28
15 ./csv/TimesOfIndia_20170928223128_00011.csv.gz 29
16 ./csv/TimesOfIndia_20170928210518_00026.csv.gz 28
17 ./csv/TimesOfIndia_20170928213202_00014.csv.gz 29
18 ./csv/TimesOfIndia_20170928222504_00040.csv.gz 29
19 

In [14]:
# Show list of columns from all zip files
len(cols), cols

(29,
 {'Abstract',
  'ActionCode',
  'AlphaPubDate',
  'Contributor',
  'Contributor.ContribRole',
  'Contributor.FirstName',
  'Contributor.LastName',
  'Contributor.MiddleName',
  'Contributor.NameSuffix',
  'Contributor.OrganizationName',
  'Contributor.OriginalForm',
  'Contributor.PersonName',
  'Contributor.PersonTitle',
  'DateTimeStamp',
  'FullText',
  'LanguageCode',
  'NumericPubDate',
  'ObjectType',
  'Products.Product',
  'Publication.PublicationID',
  'Publication.Qualifier',
  'Publication.Title',
  'Publisher',
  'RecordID',
  'RecordTitle',
  'SourceType',
  'StartPage',
  'URLDocView',
  'Version'})

### Step 2: Merge all csv files to a csv file.

In [33]:
header = True
for i, fn in enumerate(glob('csv/*.csv.gz')):
    #df = pd.read_csv(fn, nrows=100)
    df = pd.read_csv(fn)
    dcols = list(cols - set(df.columns))
    for c in dcols:
        df[c] = None
    df[list(cols)].to_csv(output_file, mode='w' if header else 'a', header=header, index=False, compression='gzip')
    header = False
    print(i, fn, len(df.columns))

0 csv/TimesOfIndia_20170928220631_00024.csv.gz 29
1 csv/TimesOfIndia_20170928215232_00027.csv.gz 29
2 csv/TimesOfIndia_20170928210052_00009.csv.gz 29
3 csv/TimesOfIndia_20170928213935_00036.csv.gz 29
4 csv/TimesOfIndia_20170928214811_00014.csv.gz 29
5 csv/TimesOfIndia_20170928213156_00009.csv.gz 29
6 csv/TimesOfIndia_20170928220633_00025.csv.gz 29
7 csv/TimesOfIndia_20170928223548_00026.csv.gz 29
8 csv/TimesOfIndia_20170928221409_00004.csv.gz 29
9 csv/TimesOfIndia_20170928210047_00007.csv.gz 29
10 csv/TimesOfIndia_20170928213154_00008.csv.gz 29
11 csv/TimesOfIndia_20170928223652_00029.csv.gz 29
12 csv/TimesOfIndia_20170928213306_00017.csv.gz 29
13 csv/TimesOfIndia_20170928213259_00012.csv.gz 29
14 csv/TimesOfIndia_20170928221933_00020.csv.gz 29
15 csv/TimesOfIndia_20170928212320_00031.csv.gz 29
16 csv/TimesOfIndia_20170928215234_00029.csv.gz 29
17 csv/TimesOfIndia_20170928210259_00014.csv.gz 29
18 csv/TimesOfIndia_20170928212947_00004.csv.gz 29
19 csv/TimesOfIndia_20170928213304_00015.

### Step 3: Check output file

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("toi_parsed_all.csv.gz", nrows = 10000)
df

Unnamed: 0,SourceType,Publication.PublicationID,Abstract,Contributor.PersonName,Contributor.LastName,URLDocView,DateTimeStamp,Publication.Qualifier,AlphaPubDate,ActionCode,...,Contributor.NameSuffix,Contributor.MiddleName,FullText,ObjectType,Contributor.OrganizationName,Publication.Title,Publisher,Contributor.OriginalForm,LanguageCode,NumericPubDate
0,Historical Newspapers,54644,,,,http://search.proquest.com/docview/608500584/,20170928172148,"New Delhi, India","Feb 24, 1998",change,...,,,SJqSwldni8 vold apf f181 CflSyZTS Theleadergua...,Credit/Acknowledgement,,The Times of India (1861-current),"Bennett, Coleman & Company Limited",,ENG,19980224
1,Historical Newspapers,54644,,,,http://search.proquest.com/docview/608869332/,20170928172126,"New Delhi, India","May 22, 1996",change,...,,,JHHQk guest Akshay Page 3,"['Table of Contents', 'Front Matter']",,The Times of India (1861-current),"Bennett, Coleman & Company Limited",,ENG,19960522
2,Historical Newspapers,54644,,,,http://search.proquest.com/docview/608623689/,20170928172151,"New Delhi, India","Sep 17, 1997",change,...,,,UB Baseball basketball and football have their...,Advertisement,,The Times of India (1861-current),"Bennett, Coleman & Company Limited",,ENG,19970917
3,Historical Newspapers,54644,MUMBAI : Chief minister Manohar Joshi will see...,,,http://search.proquest.com/docview/608797730/,20170928172250,"New Delhi, India","Sep 16, 1998",change,...,,,By Vldyadhar Date The Times of India News Serv...,"['Feature', 'Article']",,The Times of India (1861-current),"Bennett, Coleman & Company Limited",Vidyadhar Date The Times of India News Service,ENG,19980916
4,Historical Newspapers,54694,,,,http://search.proquest.com/docview/233771878/,20170928172211,"Bombay, India","Aug 3, 1839",change,...,,,Printed and Published for the Bombay Times by ...,General Information,,The Bombay Times and Journal of Commerce (1838...,"Bennett, Coleman & Company Limited",,ENG,18390803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,Historical Newspapers,54644,,,,http://search.proquest.com/docview/741170447/,20170928173348,"New Delhi, India","Jan 14, 1969",change,...,,,German team on way to Rourkela ...,Front Page/Cover Story,,The Times of India (1861-current),"Bennett, Coleman & Company Limited",,ENG,19690114
9996,Historical Newspapers,54644,MUMBAI: An unbeaten halt-century by Yogesh Raj...,,,http://search.proquest.com/docview/741230807/,20170928173243,"New Delhi, India","Feb 10, 2000",change,...,,,CITY SPORT BTTTCxnnrm ...,"['Feature', 'Article']",,The Times of India (1861-current),"Bennett, Coleman & Company Limited",Our Correspondents,ENG,20000210
9997,Historical Newspapers,54644,,,,http://search.proquest.com/docview/614079844/,20170928173321,"New Delhi, India","Jul 17, 1967",change,...,,,AfFnrmijiffJriii- BOMBAY DELHI MAURA,Advertisement,,The Times of India (1861-current),"Bennett, Coleman & Company Limited",,ENG,19670717
9998,Historical Newspapers,54644,,,,http://search.proquest.com/docview/741150898/,20170928173354,"New Delhi, India","Oct 1, 1966",change,...,,,TTie Statesman of Calcutta and Delhi in statem...,"['Feature', 'Article']",,The Times of India (1861-current),"Bennett, Coleman & Company Limited",,ENG,19661001
