## Metadata into XML and JSON

This is a two-step process, but the origin is the CSV file which was built from the trafilatura output.

First, we need to arrange the text corpus into a well-organized file system.
Second, we need to put the updated metadata into the text-file corpus headers and link these files back to the CSV.

This keeps the embedded metadata up to date with the CSV metadata.

In [1]:
import glob, os, json, re
import pandas as pd
import xml.etree.ElementTree as ET
import xml

# Matches a run of one or more literal space characters. Despite the name,
# tabs and newlines are NOT matched, so only runs of spaces are affected when
# this pattern is used for substitution.
_RE_COMBINE_WHITESPACE = re.compile(r" +")

In [2]:
def getCorpus(globpath):
    '''
    Load the metadata of every XML and JSON file matching a glob into one dataframe.

    Parameters
    ----------
    globpath : str
        Pattern handed to ``glob.glob``. Matches whose extension is neither
        ``.xml`` nor ``.json`` are skipped.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in glob order. XML files contribute
        the attributes of their ``doc`` element(s); JSON files contribute their
        fields minus ``raw_text`` and ``text``. Every row carries a
        ``filename`` column with the path it was read from. The index is not
        reset, so index values repeat across files.

    Raises
    ------
    ValueError
        If the pattern matches no XML or JSON file.
    '''
    frames = []

    for fn in glob.glob(globpath):
        # Compare the real extension (with its dot) rather than a bare suffix,
        # so a name that merely ends in the letters "xml"/"json" is not parsed.
        extension = os.path.splitext(fn)[1]

        if extension == '.xml':
            frame = pd.read_xml(fn, xpath='//doc', attrs_only=True)
        elif extension == '.json':
            frame = pd.read_json(fn, lines=True)
            # Only metadata is wanted here. Not every file is guaranteed to
            # have both body columns, so a missing one must not abort the load.
            frame = frame.drop(columns=['raw_text', 'text'], errors='ignore')
        else:
            continue

        frame['filename'] = fn
        frames.append(frame)

    if not frames:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say what actually went wrong.
        raise ValueError('No XML or JSON files matched: {!r}'.format(globpath))

    return pd.concat(frames)

## Load the data

In [3]:
# Primary data source: the CSV of metadata, keyed by fingerprint.
# Missing ids are filled with 0 so the column can be held as integers.
meta = (
    pd.read_csv('./traffy-meta.csv')
    .set_index('fingerprint')
    .assign(id=lambda frame: frame['id'].fillna(0).astype(int))
)
meta.head(2)

Unnamed: 0_level_0,sitename,title,author,date,source,hostname,excerpt,categories,tags,id,license,comments,source-hostname
fingerprint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
/6eHxEPR+q2YoP1tDcWkWESJjHk=,Clarkesworld Magazine,A Stopped Clock,Madeline Ashby,2015-11-09,https://clarkesworldmagazine.com/ashby_08_16_r...,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,Short fiction,Cyberwarfare,2027533,,,Clarkesworld Magazine
/6wwXCKld7x8aiwwTeiGscyqFf4=,Clarkesworld Magazine,Staying Loose: A Conversation with Max Gladsto...,Arley Sorg,2022-02-00,https://clarkesworldmagazine.com/gladstone_int...,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,Interview,,3004850,,,Clarkesworld Magazine


In [4]:
%%time

# Map each fingerprint found in the corpus files to the path it was read from.
filemeta = (
    getCorpus('./data/**/*')
    .loc[:, ['fingerprint', 'filename']]
    .set_index('fingerprint')
)
filemeta.head()

CPU times: user 7.44 s, sys: 205 ms, total: 7.64 s
Wall time: 7.81 s


Unnamed: 0_level_0,filename
fingerprint,Unnamed: 1_level_1
EreEXVl04TMXumeyideOBeIxoPM=,./data/1/21QgJXT6wVnb-zv2by+b1DVhlI4.xml
AIIZGz2/wU9mFu8wIsVZ9YWTHWU=,./data/1/MyOyM9pq+yOEP9mNAfO2VjYVBp4.json
S5o7E0U97anmTY/+VHEnnI5o2W4=,./data/1/+B+tTSkvZzVVhXr1YTtbktWOxhY.json
iRusfgy2RbSqjShLaVWJQ21sh4E=,./data/1/7VkxKi84dUugRxCOaQ3q2e2VBCM.json
UiQ0PhcBxUAaRlZNQdTeuo0FBEI=,./data/1/GiCvmlE-6touLM7OHLCXxMRgGKw.xml


In [5]:
# Attach each record's corpus filename by left-joining on the fingerprint
# index, then turn the fingerprint index back into an ordinary column.
# NOTE: this rebinds `meta`, so the cell is not safe to re-run on its own;
# re-run the CSV load cell first.
meta = (
    meta
    .join(filemeta, how='left')
    .reset_index()
)

## Convert row into destination filename string

In [6]:
%%time

corpusdir = './corpus'

# Create the output directory; exist_ok keeps the cell safe to re-run.
os.makedirs(corpusdir, exist_ok=True)

for _, row in meta.iterrows():
    # Destination name: whatever follows the first "m/" in the source URL
    # (in practice the ".com/" that ends the host), with slashes removed.
    # Splitting only ONCE matters: a slug that itself ends in "m"
    # (".../graham/") would otherwise leave an empty last piece and every such
    # record would be written to the same "./corpus/.json".
    sourcename = str(row['source']).split('m/', 1)[-1].replace('/', '')

    # Rows that matched no corpus file in the join carry NaN instead of a path.
    # Treat them like any other unrecognised extension: the record is still
    # written, just without a 'text' field.
    has_file = isinstance(row['filename'], str)
    file_extension = os.path.splitext(row['filename'])[1] if has_file else ''

    # Convert CSV record into output JSON Doc and setup outfile name
    outfile = os.path.join(corpusdir, sourcename + '.json')
    doc = json.loads(row.to_json(orient='index'))

    if file_extension == '.json':
        # JSON source: take the text field from the trafilatura output as-is.
        with open(row['filename'], 'r', encoding='utf-8') as f:
            data = json.load(f)
        doc['text'] = data['text']

    elif file_extension == '.xml':
        # XML source: flatten the document to plain text and collapse runs of
        # spaces.
        # NOTE(review): the text of the whole document root is used. A lookup
        # of the <main> element existed here but its result was never used;
        # confirm whether only <main> was intended.
        root = ET.parse(row['filename']).getroot()
        text = ET.tostring(root, method='text', encoding='unicode')
        doc['text'] = _RE_COMBINE_WHITESPACE.sub(' ', text).strip()

    # The old source path is not part of the output record.
    doc.pop('filename', None)

    # Output the Doc to JSON file
    with open(outfile, 'w', encoding='utf-8') as f:
        json.dump(doc, f)

CPU times: user 1.77 s, sys: 397 ms, total: 2.17 s
Wall time: 2.35 s
