
# Build a database of potential customers from TED 2014-2022 data
### Query the Contract Award Notices and get:
* Name of the Contracting Authority/Entity
* Title Scope of the procurement (Procedure & Lot)
* The total value of the contract
* CPV Codes + Additional CPV codes
* Name of the Winner/Contractor
	* Company name
	* URL address
	* NUTS Code
	* Email
	* is SME

### Statistical Information about the tenders
* number of tenders received
* number of tenders received from SMEs
### Data of the conclusion of the contract

# Import libraries

In [1]:
import pathlib
import pandas as pd
import ijson
from typing import List
from concurrent.futures import ProcessPoolExecutor
from resources import OPEN_TENDER_JSONS, OPEN_TENDER_EXTRACTION_OUTPUT

# Define constants for required fields per structure level

In [2]:
TENDER_FIELDS = ["title", "estimatedPrice", "finalPrice", "country", "procedureType", "contractSignatureDate"]
LOT_FIELDS = ["title", "contractSignatureDate", "estimatedPrice"]
BID_FIELDS = ["isWinning", "price", "unitPrices", "isDisqualified", "disqualificationReason", "isConsortium"]
BIDDERS_FIELDS = ["name", "address", "email", "contactPoint", "contactName", "phone", "isLeader", "isSME"]
BUYER_FIELDS = ["name", "address", "email", "contactPoint", "contactName", "phone"]
NOT_NULL_FIELDS = ["bidder_email", "buyer_name", "tender_title", "tender_finalPrice_netAmountEur"]

In [3]:
OPEN_TENDER_JSONS_PATHS = [file_path for file_path in OPEN_TENDER_JSONS.iterdir() if
                           file_path.is_file() and file_path.name.endswith(".json")]

# Define additional functions

In [4]:
def flatten_data(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

In [5]:
def extract_fields_from_document(document: dict) -> list:
    rows = []
    row = {}
    for field in TENDER_FIELDS:
        row["tender_" + field] = document.get(field, None)
    if "cpvs" in document.keys():
        row["tender_cpvs"] = []
        for cpv in document["cpvs"]:
            if ("isMain" in cpv.keys()) and ("code" in cpv.keys()):
                if cpv["isMain"]:
                    row["tender_cpv_main"] = cpv["code"]
                else:
                    row["tender_cpvs"].append(cpv["code"])
    if "buyers" in document.keys():
        for buyer in document["buyers"]:
            for field in BUYER_FIELDS:
                row["buyer_" + field] = buyer.get(field, None)
            if "lots" in document.keys():
                for lot in document["lots"]:
                    for field in LOT_FIELDS:
                        row["lot_" + field] = lot.get(field, None)
                    if "bids" in lot.keys():
                        for bid in lot["bids"]:
                            for field in BID_FIELDS:
                                row["bid_" + field] = bid.get(field, None)
                            if "bidders" in bid.keys():
                                for bidder in bid["bidders"]:
                                    for field in bidder.keys():
                                        row["bidder_" + field] = bidder.get(field, None)
                                    rows.append(flatten_data(row))
    return rows

In [6]:
def filter_dataframe_fields(data: pd.DataFrame) -> pd.DataFrame:
    for not_null_field in NOT_NULL_FIELDS:
        if not_null_field in data.columns:
            tmp_data = data[data[not_null_field].notnull()].copy()
            del data
            data = tmp_data
    return data

In [7]:
def save_list_of_rows(list_of_rows: list, save_path: pathlib.Path, save_header: bool):
    mode = 'a' if not save_header else 'w'
    result_df = filter_dataframe_fields(pd.DataFrame(list_of_rows))
    result_df.to_csv(save_path, index=False, header=save_header, mode=mode, sep=";")
    del result_df


In [8]:
def extract_fields_from_file(json_file_path: pathlib.Path):
    json_file = json_file_path.open(mode="rb")
    documents = ijson.items(json_file, 'item')
    info_results = []
    write_first_chunk = True
    max_rows = 30000
    result_csv_path = OPEN_TENDER_EXTRACTION_OUTPUT / f"{json_file_path.stem}.csv"
    for document in documents:
        result = extract_fields_from_document(document)
        info_results.extend(result)
        if len(info_results) > max_rows:
            save_list_of_rows(info_results, result_csv_path, write_first_chunk)
            write_first_chunk = False
            del info_results
            info_results = []
    save_list_of_rows(info_results, result_csv_path, write_first_chunk)
    json_file.close()

In [9]:
def extract_fields_from_multiple_files(json_file_paths: List[pathlib.Path]):
    with ProcessPoolExecutor(max_workers=24) as executor:
        futures = [executor.submit(extract_fields_from_file, json_file_path) for json_file_path in json_file_paths]
        for future in futures:
            future.result()

# Extract required fields from OpenTender JSONs

In [10]:
%%time
extract_fields_from_multiple_files(OPEN_TENDER_JSONS_PATHS)

CPU times: user 22.8 ms, sys: 60.2 ms, total: 83.1 ms
Wall time: 4min 19s


# Merge result CSVs into a single CSV

In [11]:
OPEN_TENDER_CSV_PATHS = [file_path for file_path in OPEN_TENDER_EXTRACTION_OUTPUT.iterdir() if
                         file_path.is_file() and file_path.name.endswith(".csv")]

In [12]:
merged_info_df = pd.concat(
    [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
    ignore_index=True).reset_index(drop=True)

  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV_PATHS], axis=0,
  [pd.read_csv(csv_file_path, on_bad_lines='skip', sep=";") for csv_file_path in OPEN_TENDER_CSV

In [13]:
merged_info_df.to_csv(OPEN_TENDER_EXTRACTION_OUTPUT / "merged_results.csv")

# Compute SME bidders statistics:

In [38]:
number_of_bidders = len(merged_info_df)
number_of_sme_bidders = len(merged_info_df[merged_info_df["bidder_isSme"]==True])
percentage_of_sme_bidders = round(100*number_of_sme_bidders/number_of_bidders,2)
print("Total number of Bidders: ",number_of_bidders)
print("Total number of Bidders as SME: ",number_of_sme_bidders)
print("Percentage of SME Bidders:", percentage_of_sme_bidders, "%")

Total number of Bidders:  4465528
Total number of Bidders as SME:  18531
Percentage of SME Bidders: 0.41 %
