# Imports

In [None]:
import json
from collections import Counter
from itertools import chain
from pathlib import Path

import numpy as np
import pandas as pd
import regex
import unidecode

from src.utils.utils import evaluate_cell, fill_na, unify_colname

In [None]:
# Load data
# Read the insiders, outsiders and minors tables, in that order, from their
# parquet files.
parquet_paths = (
    Path("data/metadata/insiders.parquet"),
    Path("data/metadata/outsiders.parquet"),
    Path("data/metadata/minors.parquet"),
)
df_in, df_ou, df_mi = (pd.read_parquet(path) for path in parquet_paths)
# Keep `dir_df` bound to the last path that was read.
dir_df = parquet_paths[-1]

In [None]:
# Show the (rows, columns) shape of each loaded table, one per line.
print(df_in.shape, df_ou.shape, df_mi.shape, sep="\n")

In [None]:
# Load info from gencat
gencat_csv = "data/metadata/gencat.csv"
tend_cat = pd.read_csv(gencat_csv)

# Aux functions

In [None]:
def process_str(el: str):
    """Return a cleaned, lower-cased copy of ``el`` with collapsed whitespace.

    Steps, in order:

    1. Delete the leading run of characters that are neither word characters
       nor an opening bracket, the trailing run of characters that are neither
       word characters nor a closing bracket, and any other character outside
       the allowed set of the third alternative of the pattern.
    2. Lower-case the result and strip surrounding whitespace.
    3. Replace every run of whitespace with a single space.
    """
    # NOTE(review): the `/-\s` part of the last character class is presumably
    # read by `regex` as the three members "/", "-" and whitespace rather than
    # as a range -- confirm against the regex documentation.
    stripped = regex.sub(r"(^[^\w({[]*)|([^\w)}\]]*$)|([^\w:)(/-\s])", "", el)
    normalised = stripped.lower().strip()
    return regex.sub(r"\s+", " ", normalised)


def process_cpv(el: str):
    """Return ``el`` with every non-digit character removed."""
    digits_only = regex.sub(r"\D", "", el)
    return digits_only

In [None]:
# Get general tender info (title, object, winner, etc)
# Pass every column name of the three loaded tables through unify_colname.
for _frame in (df_in, df_ou, df_mi):
    _frame.columns = [unify_colname(name) for name in _frame.columns]

In [None]:
# Columns kept from df_in / df_ou when the combined frame `df` is built (a
# column is only selected if the table actually has it). Entries that are
# commented out are currently excluded from the selection.
use_cols = [
    "id",
    "summary",
    "title",
    # "updated",
    "ContractFolderStatus.ContractFolderID",
    # "ContractFolderStatus.ContractFolderStatusCode",
    # "ContractFolderStatus.LocatedContractingParty.Party.PartyIdentification.ID",
    # "ContractFolderStatus.LocatedContractingParty.Party.PartyName.Name",
    "ContractFolderStatus.ProcurementProject.Name",
    # "ContractFolderStatus.ProcurementProject.TypeCode",
    "ContractFolderStatus.ProcurementProject.RequiredCommodityClassification.ItemClassificationCode",
    # "ContractFolderStatus.ProcurementProject.RealizedLocation.CountrySubentityCode",
    # "ContractFolderStatus.ProcurementProject.PlannedPeriod.DurationMeasure",
    # "ContractFolderStatus.ProcurementProject.PlannedPeriod.StartDate",
    # "ContractFolderStatus.ProcurementProject.PlannedPeriod.EndDate",
    "ContractFolderStatus.TenderResult.WinningParty.PartyIdentification.ID",
    # "ContractFolderStatus.TenderResult.WinningParty.PartyName.Name",
    "ContractFolderStatus.ProcurementProject.RealizedLocation.CountrySubentity",
    # "ContractFolderStatus.TenderingTerms.FundingProgramCode",
    # "ContractFolderStatus.TenderingTerms.FundingProgram",
    # "ContractFolderStatus.TenderResult.WinningParty.PartyLegalEntity.CompanyTypeCode",
]

In [None]:
# Mapping from source column names to the short names used downstream
# (title, summary, id, project, winner, cpv, location). The same mapping is
# passed to `.rename(columns=...)` for both the combined `df` frame and the
# gencat frame; the last four keys are the gencat column names.
rename_columns = {
    "title": "title",
    "summary": "summary",
    "ContractFolderStatus.ContractFolderID": "id",
    "ContractFolderStatus.ProcurementProject.Name": "project",
    "ContractFolderStatus.TenderResult.WinningParty.PartyIdentification.ID": "winner",
    "ContractFolderStatus.ProcurementProject.RequiredCommodityClassification.ItemClassificationCode": "cpv",
    "ContractFolderStatus.ProcurementProject.RealizedLocation.CountrySubentity": "location",
    "denominacio": "title",
    "objecte_contracte": "summary",
    "codi_expedient": "id",
    "codi_cpv": "cpv",
}

# PLACE

In [None]:
# Stack the insiders and outsiders tables, keeping only the columns listed in
# use_cols that each table actually has (minors are currently left out).
place_frames = [
    df_in[[c for c in use_cols if c in df_in.columns]],
    df_ou[[c for c in use_cols if c in df_ou.columns]],
    # df_mi[[c for c in use_cols if c in df_mi.columns]],
]
df = pd.concat(place_frames)

# Turn the index levels into columns and join them with "/" into a single
# string key, which becomes the new index.
index_names = df.index.names
df = df.reset_index()
df["identifier"] = df[index_names].astype(str).agg("/".join, axis=1)
# df.drop(index_names, inplace=True, axis=1)
df = df.set_index("identifier")

# Run every cell through fill_na, with NaN as the fill value.
df = df.applymap(fill_na, fill=np.nan)
df.shape

### Get relevant data

In [None]:
# Get only relevant data
# Columns taken from the combined frame; the commented-out ones (project name
# and winner) are currently not used.
relevant_cols = [
    "title",
    "summary",
    "ContractFolderStatus.ContractFolderID",
    # "ContractFolderStatus.ProcurementProject.Name",
    # "ContractFolderStatus.TenderResult.WinningParty.PartyIdentification.ID",
    "ContractFolderStatus.ProcurementProject.RequiredCommodityClassification.ItemClassificationCode",
    "ContractFolderStatus.ProcurementProject.RealizedLocation.CountrySubentity",
]
tender_filt = df[relevant_cols].rename(columns=rename_columns)
# Drop rows with no data at all, then give each cpv entry its own row.
tender_filt = tender_filt.dropna(how="all").explode("cpv")


def _normalise_place_id(raw_id):
    """Replace runs of non-letter, non-digit characters with "-" and transliterate to ASCII; NaN stays NaN."""
    if pd.isna(raw_id):
        return np.nan
    return unidecode.unidecode(regex.sub(r"[^\p{L}\d]+", "-", raw_id))


def _normalise_location(raw_location):
    """Lower-case and transliterate a location to ASCII; NaN stays NaN."""
    if pd.isna(raw_location):
        return np.nan
    return unidecode.unidecode(raw_location.lower())


# Clean columns
tender_filt["id_orig"] = tender_filt["id"]
tender_filt["id_norm"] = tender_filt["id"].apply(_normalise_place_id)
# Evaluate the cpv cells and explode again so every code gets its own row,
# then keep only the digits of each code.
tender_filt["cpv"] = tender_filt["cpv"].apply(evaluate_cell)
tender_filt = tender_filt.explode("cpv")
tender_filt["cpv"] = tender_filt["cpv"].astype(str).apply(process_cpv)
tender_filt["location"] = tender_filt["location"].apply(_normalise_location)

# Every column except "id" is stringified and cleaned with process_str.
non_id_cols = [c for c in tender_filt.columns if c != "id"]
tender_filt[non_id_cols] = tender_filt[non_id_cols].astype(str).applymap(process_str)

# Create a cpv_div column for matching
# Empty strings and "0" count as missing; cpv_div is the first two characters
# of the cpv code.
tender_filt = tender_filt.replace({"": np.nan, "0": np.nan})
tender_filt["cpv_div"] = tender_filt["cpv"].apply(
    lambda code: code[:2] if not pd.isna(code) else np.nan
)
tender_filt = tender_filt.reset_index()

### Get data from cat

In [None]:
# Filter using only those from Cat
# Substrings that mark a location as accepted. "nan" matches rows whose
# location was missing before the cleaning step stringified the column.
cat_location_terms = [
    "nan",
    "espana",
    "cataluna",
    "catalunya",
    "barcelona",
    "tarragona",
    "girona",
    "gerona",
    "lleida",
    "lerida",
]
# The cleaning step replaces "" and "0" with NaN, so the location column can
# hold real NaN values. Without `na=`, str.contains returns NaN for them and
# the boolean indexing below raises a ValueError. `na=True` keeps those rows,
# in line with the "nan" term above that already keeps missing locations.
in_cat_mask = tender_filt["location"].str.contains(
    "|".join(cat_location_terms), na=True
)

# Collect, per original id, the list of values seen in every column.
agg_tender_orig = (
    tender_filt[in_cat_mask]
    .drop("identifier", axis=1)
    .groupby(["id_orig"])
    .agg(list)
)
# Keep only ids whose rows all share one single title.
valid_agg_tender_orig = agg_tender_orig[
    agg_tender_orig["title"].apply(lambda x: len(set(x)) == 1)
]
# Collapse each list to its most frequent value.
valid_agg_tender_orig = valid_agg_tender_orig.applymap(
    lambda x: Counter(x).most_common()[0][0]
)
# The second reset_index adds a positional "index" column, which is then
# wrapped in one-element lists so that indices can be concatenated later.
valid_agg_tender_orig = valid_agg_tender_orig.reset_index().reset_index()
valid_agg_tender_orig["index"] = valid_agg_tender_orig["index"].apply(lambda x: [x])
print(valid_agg_tender_orig.shape)
display(valid_agg_tender_orig.head())

# GENCAT

In [None]:
# Clean tenders cat
# Keep the four gencat columns of interest, rename them to the shared short
# names and stringify everything.
gencat_cols = ["denominacio", "objecte_contracte", "codi_expedient", "codi_cpv"]
tend_cat_filt = tend_cat[gencat_cols].rename(columns=rename_columns).astype(str)


def _normalise_gencat_id(raw_id):
    """Replace runs of non-letter, non-digit characters with "-" and transliterate to ASCII; NaN stays NaN."""
    if pd.isna(raw_id):
        return np.nan
    return unidecode.unidecode(regex.sub(r"[^\p{L}\d]+", "-", raw_id))


tend_cat_filt["id_orig"] = tend_cat_filt["id"]
tend_cat_filt["id_norm"] = tend_cat_filt["id"].apply(_normalise_gencat_id)
# Keep only the digits of each cpv code.
tend_cat_filt["cpv"] = tend_cat_filt["cpv"].astype(str).apply(process_cpv)

# Every column except "id" is stringified and cleaned with process_str.
text_cols = [c for c in tend_cat_filt if c != "id"]
tend_cat_filt[text_cols] = tend_cat_filt[text_cols].astype(str).applymap(process_str)

# Empty strings and "0" count as missing; cpv_div is the first two characters
# of the cpv code.
tend_cat_filt = tend_cat_filt.replace({"": np.nan, "0": np.nan})
tend_cat_filt["cpv_div"] = tend_cat_filt["cpv"].apply(
    lambda code: code[:2] if not pd.isna(code) else np.nan
)
tend_cat_filt = tend_cat_filt.reset_index(drop=True)

In [None]:
# Aggregate by original ID
# Collect, per original id, the list of values seen in every column.
grouped_cat = tend_cat_filt.groupby("id_orig").agg(list)
# Keep only ids whose rows all share one single title.
has_single_title = grouped_cat["title"].apply(lambda titles: len(set(titles)) == 1)
valid_tend_cat_orig = grouped_cat[has_single_title]
# Collapse each list to its most frequent value.
valid_tend_cat_orig = valid_tend_cat_orig.applymap(
    lambda values: Counter(values).most_common()[0][0]
)
# The second reset_index adds a positional "index" column, which is then
# wrapped in one-element lists.
valid_tend_cat_orig = valid_tend_cat_orig.reset_index().reset_index()
valid_tend_cat_orig["index"] = valid_tend_cat_orig["index"].apply(
    lambda position: [position]
)
print(len(valid_tend_cat_orig))
display(valid_tend_cat_orig.head())

# TO DO
- Match with budget

# Match PLACE & GENCAT

In [None]:
# Match tenders using orig id
# Both sides are reduced to the same four columns used for matching.
match_cols = ["id_orig", "index", "title", "cpv_div"]
use_tend_agg_orig = valid_agg_tender_orig[match_cols]
use_tend_cat_orig = valid_tend_cat_orig[match_cols]

### By ID

In [None]:
# Match tenders by ID orig
# Inner join: keep only ids present on both the PLACE and the GENCAT side.
matched_id_orig = use_tend_agg_orig.merge(
    use_tend_cat_orig,
    how="inner",
    left_on=["id_orig"],
    right_on=["id_orig"],
    suffixes=["_agg", "_cat"],
).reset_index(drop=True)


def _count_pair(pair):
    """Count the values of one row holding the PLACE and the GENCAT value."""
    return Counter(list(pair))


# Fold the two titles and the two cpv divisions of each match into a Counter.
title_pairs = matched_id_orig[["title_agg", "title_cat"]]
matched_id_orig["title"] = title_pairs.apply(_count_pair, axis=1)
cpv_div_pairs = matched_id_orig[["cpv_div_agg", "cpv_div_cat"]]
matched_id_orig["cpv_div"] = cpv_div_pairs.apply(_count_pair, axis=1)

kept_cols = ["id_orig", "title", "cpv_div", "index_agg", "index_cat"]
matched_id_orig = matched_id_orig[kept_cols]

# Distinct positional indices matched on each side.
idx_agg_orig = set(chain.from_iterable(matched_id_orig["index_agg"]))
idx_cat_orig = set(chain.from_iterable(matched_id_orig["index_cat"]))

# Stats
num_agg_idx_original = len(use_tend_agg_orig)
num_cat_idx_original = len(use_tend_cat_orig)
num_agg_idx_found = len(idx_agg_orig)
num_cat_idx_found = len(idx_cat_orig)

print("Indices from PLACE matched with id:")
print(
    f"{num_agg_idx_found} out of {num_agg_idx_original} ({num_agg_idx_found/num_agg_idx_original*100:.3f}%)"
)
print("Indices from GENCAT matched with id:")
print(
    f"{num_cat_idx_found} out of {num_cat_idx_original} ({num_cat_idx_found/num_cat_idx_original*100:.3f}%)"
)
display(matched_id_orig.head())

### By title and CPV

In [None]:
# Match tenders by title and cpv orig
# Inner join on (title, cpv_div): pairs a PLACE row with a GENCAT row when
# both the cleaned title and the cpv division are equal.
matched_title_orig = pd.merge(
    use_tend_agg_orig,
    use_tend_cat_orig,
    how="inner",
    left_on=[
        "title",
        "cpv_div",
    ],
    right_on=[
        "title",
        "cpv_div",
    ],
    suffixes=["_agg", "_cat"],
).reset_index(drop=True)

# Keep only the pairs whose ids differ between the two sides.
matched_title_orig = matched_title_orig[
    matched_title_orig["id_orig_agg"] != matched_title_orig["id_orig_cat"]
]

# Split to get those ids that only appear in agg or cat
# m_x: PLACE ids that never show up among the GENCAT ids of these pairs.
m_x = matched_title_orig.loc[
    ~matched_title_orig["id_orig_agg"].isin(matched_title_orig["id_orig_cat"]),
    ["id_orig_agg", "index_agg", "title", "cpv_div"],
].rename({"id_orig_agg": "id_orig"}, axis=1)
# m_y: GENCAT ids that never show up among the PLACE ids of these pairs.
m_y = matched_title_orig.loc[
    ~matched_title_orig["id_orig_cat"].isin(matched_title_orig["id_orig_agg"]),
    ["id_orig_cat", "index_cat", "title", "cpv_div"],
].rename({"id_orig_cat": "id_orig"}, axis=1)

# Group by id
# Titles and cpv divisions are counted per id; the index lists are combined
# with `sum`. m_x rows have no index_cat and m_y rows no index_agg, so those
# cells are missing after the concat.
# NOTE(review): `.replace({0: None})` presumably turns the 0 that `sum` yields
# for a group without any list into None -- confirm with the pandas version
# in use.
matched_title_orig = (
    pd.concat([m_x, m_y])
    .groupby("id_orig")
    .agg(
        {
            "title": Counter,
            "cpv_div": Counter,
            "index_agg": sum,
            "index_cat": sum,
        }
    )
    .reset_index()
    .replace({0: None})
)

# Stats
# Number of non-missing indices on each side after flattening the lists.
num_agg_title_found = len(matched_title_orig["index_agg"].explode().dropna())
num_cat_title_found = len(matched_title_orig["index_cat"].explode().dropna())

print("Indices from PLACE matched with title:")
print(
    f"{num_agg_title_found} out of {num_agg_idx_original} ({num_agg_title_found/num_agg_idx_original*100:.3f}%)"
)
print("Indices from GENCAT matched with title:")
print(
    f"{num_cat_title_found} out of {num_cat_idx_original} ({num_cat_title_found/num_cat_idx_original*100:.3f}%)"
)
print(
    "The found indices are only present in PLACE or GENCAT, but they match by title, so we asume they are the same and assign the id."
)
display(matched_title_orig.head())

### All (id, name, cpv)

In [None]:
# Combine both
# Outer join of the id matches (suffix _x) with the title matches (suffix _y)
# so that an id found by either method ends up in the result.
matched_final_orig = pd.merge(
    matched_id_orig,
    matched_title_orig,
    how="outer",
    left_on="id_orig",
    right_on="id_orig",
)


def _merge_counters(*counters):
    """Merge dict-like counters left to right, skipping missing (NaN/None) cells.

    A key present in several counters keeps the count of the last one.
    """
    merged = dict()
    for counter in counters:
        if not pd.isna(counter):
            merged.update(counter)
    return merged


def _concat_index_lists(*index_lists):
    """Concatenate the cells that are lists; any other cell is ignored."""
    merged = []
    for cell in index_lists:
        if isinstance(cell, list):
            merged.extend(cell)
    return merged


# Each output column pairs its own _x and _y source columns.
title = [
    _merge_counters(tx, ty)
    for tx, ty in zip(matched_final_orig["title_x"], matched_final_orig["title_y"])
]
cpv_div = [
    _merge_counters(cx, cy)
    for cx, cy in zip(matched_final_orig["cpv_div_x"], matched_final_orig["cpv_div_y"])
]
index_agg = [
    _concat_index_lists(iax, iay)
    for iax, iay in zip(
        matched_final_orig["index_agg_x"], matched_final_orig["index_agg_y"]
    )
]
index_cat = [
    _concat_index_lists(icx, icy)
    for icx, icy in zip(
        matched_final_orig["index_cat_x"], matched_final_orig["index_cat_y"]
    )
]

matched_final_orig["title"] = title
matched_final_orig["cpv_div"] = cpv_div
matched_final_orig["index_agg"] = index_agg
matched_final_orig["index_cat"] = index_cat
# Drop duplicated indices contributed by both match methods.
matched_final_orig["index_agg"] = matched_final_orig["index_agg"].apply(
    lambda x: list(set(x))
)
matched_final_orig["index_cat"] = matched_final_orig["index_cat"].apply(
    lambda x: list(set(x))
)

matched_final_orig = matched_final_orig[
    ["id_orig", "title", "cpv_div", "index_agg", "index_cat"]
]


# Stats
# These counts cover the combined result, so the labels say "id or title"
# (they were previously printed as "matched with id").
num_matched = len(matched_final_orig)
num_agg_idx_found = len(matched_final_orig["index_agg"].explode().dropna())
num_cat_idx_found = len(matched_final_orig["index_cat"].explode().dropna())

print(f"Total number of matches: {num_matched}")
print("Indices from PLACE matched with id or title:")
print(
    f"{num_agg_idx_found} out of {num_agg_idx_original} ({num_agg_idx_found/num_agg_idx_original*100:.3f}%)"
)
print("Indices from GENCAT matched with id or title:")
print(
    f"{num_cat_idx_found} out of {num_cat_idx_original} ({num_cat_idx_found/num_cat_idx_original*100:.3f}%)"
)

display(matched_final_orig.head())

In [None]:
# Get elements that appear in gencat but not in place
# Flatten the matched GENCAT indices and show the gencat rows whose index is
# not among them.
cat_indices = matched_final_orig["index_cat"].explode().dropna().tolist()
is_unmatched = ~use_tend_cat_orig.index.isin(cat_indices)
use_tend_cat_orig[is_unmatched]