In [1]:
import pathlib
import pandas as pd
import numpy as np
import ijson
from typing import List
from concurrent.futures import ProcessPoolExecutor
from resources import OPEN_TENDER_JSONS, OPEN_TENDER_EXTRACTION_OUTPUT

In [2]:
# Every regular file directly inside OPEN_TENDER_JSONS whose name ends in ".json".
OPEN_TENDER_JSONS_PATHS = [
    candidate
    for candidate in OPEN_TENDER_JSONS.iterdir()
    if candidate.is_file() and candidate.name.endswith(".json")
]

In [3]:
def flatten_data(y, prefix=""):
    """Flatten nested dicts into one flat dict with dot-joined keys.

    Args:
        y: The value to flatten. Only exact ``dict`` instances are descended
            into; anything else (lists, scalars, dict subclasses) is stored
            as a leaf value unchanged.
        prefix: Text prepended to every key. The last character of the
            accumulated path is always dropped when a leaf is stored, so a
            non-empty prefix is expected to end with the ``.`` separator.

    Returns:
        A new dict mapping dotted paths to leaf values, in traversal order.
        Empty dicts contribute no keys; if two leaves share a path the later
        one wins.
    """
    def walk(node, path):
        if type(node) is not dict:
            # Leaf: strip the trailing separator appended by the parent level.
            yield path[:-1], node
            return
        for key, child in node.items():
            yield from walk(child, path + key + '.')

    return dict(walk(y, prefix))

In [4]:
def merge_results(folder_path: pathlib.Path, result_file_name: str, src_name_prefix: str = "",
                  src_name_suffix: str = "", remove_src_files: bool = True):
    """Concatenate the parquet files in a folder into a single parquet file.

    Args:
        folder_path: Directory scanned (non-recursively) for source files and
            into which the merged file is written.
        result_file_name: Name of the merged parquet file, relative to
            ``folder_path``.
        src_name_prefix: Only files whose name starts with this are merged.
        src_name_suffix: Only files whose name ends with this are merged.
        remove_src_files: When true, delete the source files after the merged
            file has been written.

    Raises:
        ValueError: If no source file matches the prefix/suffix filter.
    """
    result_path = folder_path / result_file_name
    # The result file lives in the same folder and may itself match the
    # filter (e.g. suffix ".parquet"). It must never be treated as a source:
    # otherwise a re-run would merge the previous output back in (duplicating
    # rows) and, with remove_src_files=True, delete the file just written.
    # Sorting makes the row order independent of the directory listing order.
    selected_paths = sorted(
        file_path for file_path in folder_path.iterdir()
        if file_path.is_file()
        and file_path.name.startswith(src_name_prefix)
        and file_path.name.endswith(src_name_suffix)
        and file_path != result_path
    )
    if not selected_paths:
        raise ValueError(
            f"No source files in {folder_path} match prefix {src_name_prefix!r} "
            f"and suffix {src_name_suffix!r}"
        )

    # ignore_index=True already yields a fresh 0..n-1 index.
    merged_df = pd.concat([pd.read_parquet(file_path) for file_path in selected_paths], axis=0,
                          ignore_index=True)
    merged_df.to_parquet(result_path, index=None)
    if remove_src_files:
        for file_path in selected_paths:
            file_path.unlink()

In [5]:
def extract_fields_from_file(json_file_path: pathlib.Path):
    """Extract every bidder record from one JSON file into a parquet file.

    The file is parsed with ``ijson.items`` using the prefix
    ``item.lots.item.bids.item.bidders.item``; each yielded object is
    flattened with ``flatten_data`` under the ``bidder.`` key prefix. The
    rows are written to ``<stem>.parquet`` inside
    ``OPEN_TENDER_EXTRACTION_OUTPUT``, where ``<stem>`` is the input file's
    name without its final extension.

    Args:
        json_file_path: Path of the JSON file to read.
    """
    # ijson reads lazily from the handle, so the rows must be consumed inside
    # the ``with`` block; the context manager also guarantees the file is
    # closed if parsing or flattening raises part-way through.
    with json_file_path.open(mode="rb") as json_file:
        documents = ijson.items(json_file, 'item.lots.item.bids.item.bidders.item')
        rows = [flatten_data(document, "bidder.") for document in documents]
    result_path = OPEN_TENDER_EXTRACTION_OUTPUT / f"{json_file_path.stem}.parquet"
    pd.DataFrame(rows).to_parquet(result_path)


In [6]:
def extract_fields_from_multiple_files(json_file_paths: List[pathlib.Path], max_workers: int = 24):
    """Run ``extract_fields_from_file`` on each path in a pool of worker processes.

    Args:
        json_file_paths: JSON files to process; one task is submitted per path.
        max_workers: Upper bound on worker processes, passed to
            ``ProcessPoolExecutor``. Defaults to 24, the value that was
            previously hard-coded.

    Raises:
        Exception: Whatever a worker raised, re-raised here by
            ``Future.result()`` for the first failing task in submission order.
    """
    json_file_paths = list(json_file_paths)
    if not json_file_paths:
        # Nothing to process: do not create a process pool for no work.
        return
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(extract_fields_from_file, json_file_path)
                   for json_file_path in json_file_paths]
        for future in futures:
            # result() blocks until the task finishes and surfaces worker errors.
            future.result()

In [7]:
%%time
extract_fields_from_multiple_files(OPEN_TENDER_JSONS_PATHS)

CPU times: user 26.2 ms, sys: 48.3 ms, total: 74.4 ms
Wall time: 2min 37s


In [8]:
%%time
merge_results(folder_path=OPEN_TENDER_EXTRACTION_OUTPUT,
              result_file_name=f"merged_results.parquet",
              src_name_suffix=".parquet",
              remove_src_files=False
              )

CPU times: user 36.6 s, sys: 7.99 s, total: 44.5 s
Wall time: 47.6 s


In [9]:
# Load the merged bidder table back from disk.
merged_info_df = pd.read_parquet(
    OPEN_TENDER_EXTRACTION_OUTPUT.joinpath("merged_results.parquet")
)

In [10]:
# Show the column labels of the merged bidder table.
merged_info_df.columns

Index(['bidder.processingOrder', 'bidder.name', 'bidder.address.city',
       'bidder.address.country', 'bidder.address.rawAddress', 'bidder.id',
       'bidder.address.street', 'bidder.address.postcode',
       'bidder.address.nuts', 'bidder.address.ot.nutscode', 'bidder.email',
       'bidder.phone', 'bidder.isSme', 'bidder.address.url',
       'bidder.mainActivities', 'bidder.buyerType', 'bidder.contactPoint',
       'bidder.contactName', 'bidder.address.state'],
      dtype='object')

In [11]:
# Share of bidder rows flagged as SME. Rows where "bidder.isSme" is missing or
# False both count as non-SME, because the denominator is the full row count.
number_of_bidders = len(merged_info_df)
# Summing the boolean mask counts the matches without materialising a
# filtered copy of the whole frame just to take its length.
is_sme_mask = merged_info_df["bidder.isSme"].eq(True)
number_of_sme_bidders = int(is_sme_mask.sum())
# Guard against an empty table so the cell reports 0.0 instead of raising ZeroDivisionError.
percentage_of_sme_bidders = (
    round(100 * number_of_sme_bidders / number_of_bidders, 2) if number_of_bidders else 0.0
)
print("Total number of Bidders: ", number_of_bidders)
print("Total number of Bidders as SME: ", number_of_sme_bidders)
print("Percentage of SME Bidders:", percentage_of_sme_bidders, "%")

Total number of Bidders:  16168260
Total number of Bidders as SME:  937629
Percentage of SME Bidders: 5.8 %
