## metadata Into XML and JSON

This is a two step process, but the origin is the CSV file which was build from the trafilatura output.

First, we need to prepare the text corpus into a good arranged filesystem.
Second, we then need to put updated metadata into the textfile corpus headers, and link these files back into the CSV.

This keeps the embdedded metadata up to date with the CSV metadata.

In [2]:
import glob, os, json, re
import pandas as pd
import xml.etree.ElementTree as ET
import xml

_RE_COMBINE_WHITESPACE = re.compile(r" +")

In [4]:
def getCorpus(globpath):
    '''
    Give a globpath, load all the XML and JSON files into a dataframe and return.
    '''
    files = glob.glob(globpath)

    frames = []
    
    for fn in files:
        if fn.endswith('xml'):
            frame = pd.read_xml(fn, xpath='//doc', attrs_only=True)
            frame['filename'] = fn
            frames.append(frame)
            
        elif fn.endswith('json'):
            frame = pd.read_json(fn, lines=True)
            frame['filename'] = fn
            # del frame['raw_text']
            # del frame['text']
            frames.append(frame)
    
    return pd.concat(frames)

## Load the data

In [5]:
# load csv primary data source
meta = pd.read_csv('./traffy-meta.csv').set_index('fingerprint')
meta.id = meta['id'].fillna(0).astype(int)
meta.head(2)

Unnamed: 0_level_0,sitename,title,author,date,source,hostname,excerpt,categories,tags,id,license,comments,source-hostname
fingerprint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
O7W5T1VCmbyT5iH9cE8XySFVptI=,Clarkesworld Magazine,Blue Grey Blue by Yukimi Ogawa,Yukimi Ogawa,2022-01-01,https://clarkesworldmagazine.com/ogawa_12_16/,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,,,0,,,Clarkesworld Magazine
9Mn+AhnUtRqq5BUc8sar9HI9nlM=,Clarkesworld Magazine,The Future is Blue by Catherynne M. Valente,Catherynne M Valente,2022-01-01,https://clarkesworldmagazine.com/valente_03_19...,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,,,0,,,Clarkesworld Magazine


In [27]:
%%time

filemeta = getCorpus('./data/**/*')
# filemeta = filemeta[['fingerprint', 'filename']].set_index('fingerprint')
filemeta = filemeta.set_index('fingerprint')
filemeta.to_csv('./filemeta.csv')
filemeta.head()

CPU times: user 12.5 s, sys: 808 ms, total: 13.3 s
Wall time: 14 s


Unnamed: 0_level_0,sitename,title,author,date,source,hostname,excerpt,categories,tags,filename,id,license,comments,source-hostname
fingerprint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
EreEXVl04TMXumeyideOBeIxoPM=,Clarkesworld Magazine,Clarkesworld Magazine - Science Fiction & Fantasy,Arula Ratnakar,2022-01-01,https://clarkesworldmagazine.com/ratnakar_05_19/,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,,,./data/1/21QgJXT6wVnb-zv2by+b1DVhlI4.xml,,,,
AIIZGz2/wU9mFu8wIsVZ9YWTHWU=,,Clarkesworld Magazine - Science Fiction & Fantasy,Geoff Ryman,2022-01-01 00:00:00,https://clarkesworldmagazine.com/ryman_06_14_r...,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,,,./data/1/MyOyM9pq+yOEP9mNAfO2VjYVBp4.json,,,,Clarkesworld Magazine
S5o7E0U97anmTY/+VHEnnI5o2W4=,,Clarkesworld Magazine - Science Fiction & Fantasy,Genevieve Valentine,2022-01-01 00:00:00,https://clarkesworldmagazine.com/valentine_10_17/,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,,,./data/1/+B+tTSkvZzVVhXr1YTtbktWOxhY.json,,,,Clarkesworld Magazine
iRusfgy2RbSqjShLaVWJQ21sh4E=,,Clarkesworld Magazine - Science Fiction & Fantasy,Bogi Takács,2022-01-01 00:00:00,https://clarkesworldmagazine.com/takacs_04_17/,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,,,./data/1/7VkxKi84dUugRxCOaQ3q2e2VBCM.json,,,,Clarkesworld Magazine
UiQ0PhcBxUAaRlZNQdTeuo0FBEI=,Clarkesworld Magazine,Clarkesworld Magazine - Science Fiction & Fantasy,Rich Larson,2022-01-01,https://clarkesworldmagazine.com/larson_02_15/,clarkesworldmagazine.com,Clarkesworld Science Fiction and Fantasy Magaz...,,,./data/1/GiCvmlE-6touLM7OHLCXxMRgGKw.xml,,,,


In [28]:
# Join to get filenames
meta = meta.join(filemeta, on='fingerprint', rsuffix='_files').reset_index()
# meta.head()

ValueError: cannot insert level_0, already exists

## Convert row into destination filename string

In [6]:
%%time

corpusdir = './corpus'

if not os.path.exists(corpusdir):
    os.mkdir(corpusdir)

for index, row in meta.iterrows():
    sourcename = str(row.source).split('m/')[-1].replace('/', '')
    filename, file_extension = os.path.splitext(row.filename)
    
    # Convert CSV record into output JSON Doc and setup outfile name
    outfile = os.path.join(corpusdir, sourcename + '.json')    
    doc = json.loads(row.to_json(orient='index'))

    # #########
    # Process JSON file and put text into the Doc.
    if file_extension == '.json':
        with open(row.filename, 'r') as f:
            data = json.load(f)

        # Take raw text from trafilatura source
        doc['text'] = data['text']

            
    # #########
    # Process XML files into JSON files.
    # load the old xml and get the text into the output JSON
    if file_extension == '.xml':
        # Load the XML file and get the text out, clean it a bit to make it plain text.
        root = ET.parse(row.filename).getroot()
        main = root.find('main')
        text = xml.etree.ElementTree.tostring(root, method='text', encoding="unicode")
        text = _RE_COMBINE_WHITESPACE.sub(' ', text).strip()
        
        # Take raw text from trafilatura source
        doc['text'] = text

    # Output the Doc to JSON file
    with open(outfile, 'w') as f: #  Save to JSON files
        # print('Saving file: ', outfile, file_extension)
        del doc['filename']  #  remove old filename
        json.dump(doc, f)

CPU times: user 1.77 s, sys: 397 ms, total: 2.17 s
Wall time: 2.35 s
