## metadata out to CSV

Extracts metadata from trafilatura's XML and JSON conversions of the scraped HTML files from Clarkesworld.
The CSV can then be edited and cleaned up manually, and the metadata put back into the filesystem.
See also: meta-in.ipynb

In [14]:
import glob, os
import pandas as pd

In [15]:
# Collect every path that sits one directory level below ./data.
# NOTE: without recursive=True, '**' behaves like a plain '*'.
data_glob = './data/**/*'
files = glob.glob(data_glob)
files[:5]

['./data/1/21QgJXT6wVnb-zv2by+b1DVhlI4.xml',
 './data/1/MyOyM9pq+yOEP9mNAfO2VjYVBp4.json',
 './data/1/+B+tTSkvZzVVhXr1YTtbktWOxhY.json',
 './data/1/7VkxKi84dUugRxCOaQ3q2e2VBCM.json',
 './data/1/jIyDx-OHlrPUFTAg5SyvoJX6KEY.txt']

In [16]:
# The trafilatura ids are deliberately not written to the CSV file.
# The fingerprint hash in each trafilatura-embedded metadata record does match,
# so the fingerprint is used to merge/lookup/join instead.

# Pair each path with its trafilatura id -- the file's basename up to the
# first '.' -- and order the pairs by id (ties broken by path).
traf_ids = [(os.path.basename(path).partition('.')[0], path) for path in files]
traf_ids.sort()
traf_ids[:5]

[('++M+2YuuVIycIfW6FiYV87BbjRA', './data/2/++M+2YuuVIycIfW6FiYV87BbjRA.txt'),
 ('+3YBUN7z1a90VaWdoXze4QMmcPY', './data/1/+3YBUN7z1a90VaWdoXze4QMmcPY.txt'),
 ('+3xiKoo7QkBjqEP9MIhtCyxTLCI', './data/1/+3xiKoo7QkBjqEP9MIhtCyxTLCI.xml'),
 ('+49NksTYUKxEKz8i6xv3zblj1xY', './data/1/+49NksTYUKxEKz8i6xv3zblj1xY.xml'),
 ('+6uMh33KnQ-w3K3w5H06U1vopDU', './data/1/+6uMh33KnQ-w3K3w5H06U1vopDU.txt')]

In [17]:
%%time
# generate dataframe from XML sources
frames = []

for tid, fn in [f for f in traf_ids if f[1].endswith('xml')]:
    frame = pd.read_xml(fn, xpath='//doc', attrs_only=True)
    frames.append(frame)

meta = pd.concat(frames)

CPU times: user 1.56 s, sys: 112 ms, total: 1.68 s
Wall time: 1.76 s


In [18]:
# Show the column labels of the XML-derived metadata frame.
meta.columns

Index(['sitename', 'title', 'author', 'date', 'source', 'hostname', 'excerpt',
       'categories', 'tags', 'fingerprint'],
      dtype='object')

In [19]:
%%time

# generate dataframe from JSON sources
frames = []

for tid, fn in [f for f in traf_ids if f[1].endswith('json')]:
    frame = pd.read_json(fn, lines=True)
    del frame['raw_text']
    del frame['text']
    frames.append(frame)

meta = meta.append(pd.concat(frames))

CPU times: user 4.82 s, sys: 242 ms, total: 5.07 s
Wall time: 5.15 s




In [20]:
# Show the column labels again now that the JSON-derived rows have been added.
meta.columns

Index(['sitename', 'title', 'author', 'date', 'source', 'hostname', 'excerpt',
       'categories', 'tags', 'fingerprint', 'id', 'license', 'comments',
       'source-hostname'],
      dtype='object')

### Do some data alterations

- Fill empties from each other's values. (sitename is in traffy's XML, source-hostname in its JSON)
- Use 'excerpt' to get the story 'title' for all entries.
- Group by/deduplicate by 'fingerprint'.

In [21]:
# Let 'sitename' and 'source-hostname' fill each other's gaps
# (sitename is in traffy's XML, source-hostname in its JSON).
site = meta['sitename']
host = meta['source-hostname']
meta['sitename'] = site.fillna(value=host)
# The second fill reads the already-filled 'sitename' column.
meta['source-hostname'] = host.fillna(value=meta['sitename'])

In [22]:
# Use 'excerpt' to get the story 'title' for all entries.
# Some basic pattern matching works for the clarkesworld website <excerpt> strings:
# the title is the piece that follows the 'This page: ' marker.
titles = meta['excerpt'].str.split('This page: ').str.get(1)
# .str.get(1) yields NaN (instead of raising KeyError) when the marker is absent;
# in that case keep the title that was already there rather than blanking it.
meta['title'] = titles.fillna(meta['title'])

In [23]:
# Deduplicate by 'fingerprint': keep only the first row seen for each value.
# Rows with a missing fingerprint compare equal to each other here,
# so at most one of them is kept.
is_repeat = meta.duplicated(subset=['fingerprint'])
meta = meta[~is_repeat]

## Save to csv

In [24]:
# Write the metadata to CSV without the dataframe index column.
meta.to_csv('./traffy-meta.csv', index=False)