
# Build a database of potential customers from TED 2014-2022 data
### Query the Contract Award Notices and get:
* Name of the Contracting Authority/Entity
* Title and scope of the procurement (procedure and lot)
* The total value of the contract
* CPV Codes + Additional CPV codes
* Name of the Winner/Contractor
	* Company name
	* URL address
	* NUTS Code
	* Email
	* is SME

### Statistical Information about the tenders
* number of tenders received
* number of tenders received from SMEs
### Date of the conclusion of the contract

# Import libraries

In [1]:
import pathlib
import pandas as pd
import numpy as np
import ijson
from typing import List
from concurrent.futures import ProcessPoolExecutor
from resources import OPEN_TENDER_JSONS, OPEN_TENDER_EXTRACTION_OUTPUT

# Define constants for required fields per structure level

In [2]:
# Keys read from each tender document; stored in a row with a "tender_" prefix.
TENDER_FIELDS = [
    "title",
    "estimatedPrice",
    "finalPrice",
    "country",
    "procedureType",
    "contractSignatureDate",
]

# Keys read from each lot; stored with a "lot_" prefix.
LOT_FIELDS = [
    "title",
    "contractSignatureDate",
    "estimatedPrice",
]

# Keys read from each bid; stored with a "bid_" prefix.
BID_FIELDS = [
    "isWinning",
    "price",
    "unitPrices",
    "isDisqualified",
    "disqualificationReason",
    "isConsortium",
]

# Bidder-level keys of interest.
# NOTE(review): no code visible in this notebook references this list - confirm
# whether it is still needed.
BIDDERS_FIELDS = [
    "name",
    "address",
    "email",
    "contactPoint",
    "contactName",
    "phone",
    "isLeader",
    "isSme",
]

# Keys read from each buyer; stored with a "buyer_" prefix.
BUYER_FIELDS = [
    "name",
    "address",
    "email",
    "contactPoint",
    "contactName",
    "phone",
]

# Flattened column names that must be non-null for a row to be kept
# (applied by filter_dataframe_fields to the columns that are present).
NOT_NULL_FIELDS = [
    "bidder_email",
    "buyer_name",
    "tender_title",
    "tender_finalPrice_netAmountEur",
]

In [3]:
# Regular files directly inside OPEN_TENDER_JSONS whose name ends in ".json"
# (sub-directories are not descended into).
OPEN_TENDER_JSONS_PATHS = [
    entry
    for entry in OPEN_TENDER_JSONS.iterdir()
    if entry.is_file() and entry.name.endswith(".json")
]

# Define additional functions

In [4]:
def flatten_data(y):
    """Collapse nested dicts into one flat dict with "_"-joined keys.

    Only values whose exact type is ``dict`` are descended into; every other
    value (lists included) is stored unchanged under the joined key path.
    An empty nested dict contributes no entry, and a non-dict ``y`` is stored
    under the empty-string key.
    """

    def leaf_items(node, prefix):
        # Yield (flat_key, value) pairs depth-first, in dict insertion order.
        if type(node) is not dict:
            # The prefix always ends with the "_" separator; strip it.
            yield prefix[:-1], node
            return
        for key in node:
            yield from leaf_items(node[key], prefix + key + '_')

    # On a key collision the later value wins, as with repeated assignment.
    return dict(leaf_items(y, ''))

In [5]:
def extract_fields_from_document(document: dict) -> list:
    """Expand one tender document into flat rows, one per bidder.

    A row is produced for every buyer x lot x bid x bidder combination, so a
    document lacking any of those four levels yields no rows. Each row holds:

    * ``tender_<field>`` for every name in ``TENDER_FIELDS``,
    * ``tender_cpvs_<n>`` for the n-th CPV entry (1-based) and
      ``tender_cpv_main`` for a CPV flagged ``isMain`` that has a code,
    * ``buyer_<field>``, ``lot_<field>``, ``bid_<field>`` for the names in
      ``BUYER_FIELDS``, ``LOT_FIELDS`` and ``BID_FIELDS``,
    * ``bidder_<key>`` for every key present on the bidder.

    Fields missing at the tender/buyer/lot/bid level are stored as ``None``.
    Nested dict values are flattened with ``flatten_data``.

    Each level works on its own copy of the parent row, so keys that only an
    earlier bidder had can no longer leak into the rows of later bidders.

    :param document: one parsed tender document.
    :return: list of flat dicts (possibly empty).
    """
    tender_row = {"tender_" + field: document.get(field) for field in TENDER_FIELDS}

    # ``or []`` treats both a missing key and an explicit null as "no entries".
    for position, cpv in enumerate(document.get("cpvs") or [], start=1):
        # .get() keeps the numbering stable for a CPV entry without a code
        # instead of raising KeyError.
        code = cpv.get("code")
        if cpv.get("isMain") and "code" in cpv:
            tender_row["tender_cpv_main"] = code
        tender_row[f"tender_cpvs_{position}"] = code

    rows = []
    for buyer in document.get("buyers") or []:
        buyer_row = dict(tender_row)
        for field in BUYER_FIELDS:
            buyer_row["buyer_" + field] = buyer.get(field)

        for lot in document.get("lots") or []:
            lot_row = dict(buyer_row)
            for field in LOT_FIELDS:
                lot_row["lot_" + field] = lot.get(field)

            for bid in lot.get("bids") or []:
                bid_row = dict(lot_row)
                for field in BID_FIELDS:
                    bid_row["bid_" + field] = bid.get(field)

                for bidder in bid.get("bidders") or []:
                    # Fresh copy per bidder: only this bidder's keys are added.
                    bidder_row = dict(bid_row)
                    for key, value in bidder.items():
                        bidder_row["bidder_" + key] = value
                    rows.append(flatten_data(bidder_row))
    return rows

In [6]:
def filter_dataframe_fields(data: pd.DataFrame) -> pd.DataFrame:
    """Drop rows that have a null in any mandatory column.

    The mandatory columns are those named in ``NOT_NULL_FIELDS``; names that
    are not columns of ``data`` are ignored. When none of them is present the
    input frame itself is returned, otherwise a filtered copy.
    """
    mandatory_columns = [name for name in NOT_NULL_FIELDS if name in data.columns]
    for column in mandatory_columns:
        # Filtering keeps the column set unchanged, so the list stays valid.
        data = data[data[column].notnull()].copy()
    return data

In [7]:
def save_list_of_rows(list_of_rows: list, save_path: pathlib.Path, save_header: bool):
    """Filter, type-cast and write a batch of flat rows to a parquet file.

    The rows are loaded into a DataFrame, rows with a null in a mandatory
    column are removed by ``filter_dataframe_fields``, the known price
    columns are cast to ``float32`` and every other column is cast to ``str``.

    :param list_of_rows: flat dicts, one per output row.
    :param save_path: destination parquet file.
    :param save_header: when False, ``append=True`` is passed to the writer.
    """
    price_columns = {
        "bid_price_netAmount",
        "bid_price_netAmountEur",
        "bid_price_netAmountNational",
        "bid_price_amountWithVat",
        "bid_price_minNetAmount",
        "bid_price_maxNetAmount",
        "bid_price_vat",
        "lot_estimatedPrice_netAmount",
        "lot_estimatedPrice_netAmountEur",
        "lot_estimatedPrice_netAmountNational",
        "lot_estimatedPrice_amountWithVat",
        "tender_estimatedPrice_netAmount",
        "tender_estimatedPrice_netAmountEur",
        "tender_estimatedPrice_netAmountNational",
        "tender_estimatedPrice_amountWithVat",
        "tender_finalPrice_netAmount",
        "tender_finalPrice_netAmountEur",
        "tender_finalPrice_netAmountNational",
        "tender_finalPrice_amountWithVat",
    }

    frame = filter_dataframe_fields(pd.DataFrame(list_of_rows))
    for name in frame.columns:
        target_dtype = np.float32 if name in price_columns else str
        frame[name] = frame[name].astype(dtype=target_dtype)

    # NOTE(review): ``append`` is forwarded to the parquet engine; presumably
    # this relies on an engine that accepts that keyword (e.g. fastparquet) -
    # confirm against the installed engine.
    frame.to_parquet(save_path, index=None, append=not save_header)

In [8]:
def merge_results(folder_path: pathlib.Path, result_file_name: str, src_name_prefix: str = "",
                  src_name_suffix: str = "", remove_src_files: bool = True):
    """Concatenate parquet files from a folder into a single parquet file.

    Source files are the regular files directly inside ``folder_path`` whose
    name starts with ``src_name_prefix`` and ends with ``src_name_suffix``.
    The destination file itself is never treated as a source: otherwise a
    result left over from a previous run would be merged into itself
    (duplicating its rows) and, with ``remove_src_files=True``, the freshly
    written result would be deleted again.

    :param folder_path: folder holding the sources and receiving the result.
    :param result_file_name: name of the merged file, relative to the folder.
    :param src_name_prefix: required prefix of source file names.
    :param src_name_suffix: required suffix of source file names.
    :param remove_src_files: delete the source files after a successful merge.
    :raises ValueError: if no source file matches.
    """
    result_path = folder_path / result_file_name
    # Sorted so the row order of the result does not depend on directory order.
    selected_paths = sorted(
        file_path for file_path in folder_path.iterdir()
        if file_path.is_file()
        and file_path != result_path
        and file_path.name.startswith(src_name_prefix)
        and file_path.name.endswith(src_name_suffix)
    )
    if not selected_paths:
        raise ValueError(
            f"No files to merge in {folder_path} "
            f"(prefix={src_name_prefix!r}, suffix={src_name_suffix!r})"
        )

    merged_df = pd.concat([pd.read_parquet(file_path) for file_path in selected_paths],
                          axis=0, ignore_index=True)
    merged_df.to_parquet(result_path, index=None)
    del merged_df  # release the merged frame before the cleanup step

    if remove_src_files:
        for file_path in selected_paths:
            file_path.unlink()


In [9]:
def extract_fields_from_file(json_file_path: pathlib.Path):
    """Stream one JSON file of tender documents into ``<stem>.parquet``.

    Documents are read incrementally with ``ijson`` and expanded into rows
    by ``extract_fields_from_document``. Whenever more than ``max_rows`` rows
    have accumulated they are written to
    ``OPEN_TENDER_EXTRACTION_OUTPUT/<stem>_part_<n>.parquet``. At the end the
    part files are merged into ``<stem>.parquet`` and removed.

    :param json_file_path: JSON file whose top level is an array of documents.
    """
    max_rows = 100000
    stem = json_file_path.stem
    # Match only this file's own parts when merging. The bare stem would also
    # match parts of any file whose stem merely starts with this one, and a
    # "<stem>.parquet" left over from an earlier run.
    part_prefix = f"{stem}_part_"

    def write_part(rows: list, part_index: int) -> None:
        # Every part is a separate file, so it is always written fresh.
        part_path = OPEN_TENDER_EXTRACTION_OUTPUT / f"{part_prefix}{part_index}.parquet"
        save_list_of_rows(rows, part_path, True)

    info_results = []
    file_part = 0
    # The context manager closes the handle even if parsing or saving fails.
    with json_file_path.open(mode="rb") as json_file:
        for document in ijson.items(json_file, 'item'):
            info_results.extend(extract_fields_from_document(document))
            if len(info_results) > max_rows:
                write_part(info_results, file_part)
                file_part += 1
                info_results = []

    # Write the remainder. An empty remainder is written only when no part
    # exists yet, so the merge step always has at least one source file.
    if info_results or file_part == 0:
        write_part(info_results, file_part)

    merge_results(folder_path=OPEN_TENDER_EXTRACTION_OUTPUT,
                  result_file_name=f"{stem}.parquet",
                  src_name_prefix=part_prefix,
                  src_name_suffix=".parquet")

In [10]:
def extract_fields_from_multiple_files(json_file_paths: List[pathlib.Path], max_workers: int = 24):
    """Run ``extract_fields_from_file`` on every path in a process pool.

    :param json_file_paths: JSON files to process, one task per file.
    :param max_workers: upper bound on worker processes (default 24, the
        value previously hard-coded).
    :raises Exception: re-raises the exception of the first task, in
        submission order, that failed.
    """
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(extract_fields_from_file, json_file_path)
                   for json_file_path in json_file_paths]
        for future in futures:
            # result() blocks until the task finishes and surfaces its error.
            future.result()

# Extract required fields from OpenTender JSONs

In [11]:
%%time
# Process every discovered JSON file in a process pool; each file ends up as
# "<stem>.parquet" inside OPEN_TENDER_EXTRACTION_OUTPUT.
extract_fields_from_multiple_files(OPEN_TENDER_JSONS_PATHS)

CPU times: user 18.6 ms, sys: 64.3 ms, total: 82.9 ms
Wall time: 5min


# Merge result into a single file

In [None]:
# Combine the parquet files in the output folder into one file, keeping the
# individual files in place.
merge_results(
    folder_path=OPEN_TENDER_EXTRACTION_OUTPUT,
    result_file_name="merged_results.parquet",
    src_name_suffix=".parquet",
    remove_src_files=False,
)

In [None]:
# Load the merged extraction result for analysis.
merged_info_df = pd.read_parquet(
    OPEN_TENDER_EXTRACTION_OUTPUT.joinpath("merged_results.parquet")
)

# Compute SME bidders statistics:

In [None]:
# Every row of the merged frame is one bidder entry.
number_of_bidders = len(merged_info_df)
# save_list_of_rows stores every non-price column as text, so the SME flag
# holds strings such as "True" / "False" / "None". Comparing with the boolean
# True would never match; casting to str first also works on a real bool column.
is_sme_bidder = merged_info_df["bidder_isSme"].astype(str) == "True"
number_of_sme_bidders = int(is_sme_bidder.sum())
# Guard against an empty result set to avoid ZeroDivisionError.
percentage_of_sme_bidders = (
    round(100 * number_of_sme_bidders / number_of_bidders, 2) if number_of_bidders else 0.0
)
print("Total number of Bidders: ", number_of_bidders)
print("Total number of Bidders as SME: ", number_of_sme_bidders)
print("Percentage of SME Bidders:", percentage_of_sme_bidders, "%")

In [None]:
# Number of columns in the merged frame (displayed as the cell result).
merged_info_df.shape[1]