# Merging of ``DataFrames``

## Loading the Source Files

In [1]:
from pandas import DataFrame, concat
from viedoors import CADLoader, NPALoader, FileMerger, HMLoader, count_duplicates
from viedoors import BSTLoader, FLTLoader, FMLoader, eliminate_duplicates

In [2]:
# Identifier of the object to process; it selects the sub-folder data/<obj>/.
obj = "420"

# One loader per source file. The title is presumably what ends up as the
# column prefix when data is fetched with prefixed=True -- confirm in viedoors.
cad = CADLoader(title="CAD", file=f"data/{obj}/cad.xlsx")
npa = NPALoader(title="NPA", file=f"data/{obj}/npa.xlsx")
hm = HMLoader(title="HM", file=f"data/{obj}/hm.xls")
bst = BSTLoader(title="BST", file=f"data/{obj}/bst.xlsx")
flt = FLTLoader(title="FLT", file=f"data/{obj}/flt.xlsx")
fm = FMLoader()  # constructed without file/title; relies on the loader's defaults

# Fetch every source with prefixed=True (NPA first, then CAD, HM, BST, FLT, FM).
df_npa, df_cad, df_hm, df_bst, df_flt, df_fm = (
    loader.get_data(prefixed=True) for loader in (npa, cad, hm, bst, flt, fm)
)

## Merge Files

In [3]:
# Combine all sources with how="left"; df_cad is listed first and is
# presumably the base table of the merge -- confirm in FileMerger.
merger = FileMerger(
    how="left",
    files=[df_cad, df_npa, df_hm, df_bst, df_flt, df_fm],
)
merge = merger.get_data_merge()

## Test Consolidation

In [4]:
# Column pairs passed to eliminate_duplicates, applied one after another in
# this order; each call receives the result of the previous one.
column_pairs = [
    ("CAD___gar_tuernummer_alt", "NPA___alte_tuernummer"),
    ("CAD___gar_tuernummer_alt", "HM___tuer_nr_alt"),
    ("CAD___gar_flucht_tuer_nr", "NPA___fluchtwegs_tuer_nr"),
    # TODO Dev: Untested
    ("NPA___alte_tuernummer", "FM___brandmeldernr"),
]

for first_col, second_col in column_pairs:
    merge = eliminate_duplicates(merge, first_col, second_col)

In [5]:
# Show the rows whose value in the "merge" column already appeared in an
# earlier row (the first occurrence of each value is not flagged).
is_repeated = merge["merge"].duplicated()
merge.loc[is_repeated]

Unnamed: 0,CAD___gar_tuernummer_bauteil,CAD___gar_tuernummer_ebene,CAD___gar_tuernummer_modul,CAD___gar_tuernummer_aks_nr,CAD___gar_tuernummer_nummer,CAD___gar_tuernummer_alt,CAD___gar_flucht_tuer_nr,CAD___gar_tuer_breite,CAD___gar_tuer_hoehe,CAD___gar_bsk,...,FM___nettacount,FM___tax,FM___gesuanbot,FM___arbeitszeit,FM___material,FM___schadensmeldung,FM___aufnahme,FM___kleinregie_summe,FM___zyklus,FM___integration_aks
14,420,0,A,205,1,036E,--,90,200,---,...,,,,,,,,,,
76,420,0,A,1205,2,012,--,85,200,T30,...,,,,,,,,,,
111,420,0,A,2001,2,052,55,78,220,---,...,,,,,,,,,,
116,420,0,A,2001,3,052E,--,250,300,---,...,,,,,,,,,,
164,420,0,A,2601,3,045A,--,750,452,---,...,,,,,,,,,,
304,420,0,B,1303,2,---,--,90,200+OL70,---,...,,,,,,,,,,
315,420,0,B,1406,1,---,--,60,198,---,...,,,,,,,,,,
316,420,0,B,1406,1,---,--,60,198,---,...,,,,,,,,,,
317,420,0,B,1406,1,---,--,60,198,---,...,,,,,,,,,,
318,420,0,B,1406,1,---,--,60,198,---,...,,,,,,,,,,


## Analysis and Storing

In [6]:
# Store the consolidated merge result. The file name is derived from `obj`
# so that processing a different object does not overwrite this one's file.
merge.to_excel(f"matching/{obj}_match_file.xlsx")

In [7]:
# For every non-CAD source, report how many rows an inner merge with the CAD
# data yields, relative to the number of records in that source.
for dataset in [df_npa, df_hm, df_bst, df_flt, df_fm]:
    print("---")
    # Column names look like "<SOURCE>___<field>", so the part before "___"
    # in the first column name identifies the source.
    print(dataset.columns[0].split("___")[0]+"-Datenfile")

    # Deliberately not called `fm`: that name holds the FMLoader instance.
    pair_merger = FileMerger(files=[df_cad, dataset], how="inner")

    n_records = len(dataset)
    n_matches = len(pair_merger.get_data_merge())

    # Guard against an empty source, which would otherwise raise
    # ZeroDivisionError. NOTE(review): the share can exceed 100 % because the
    # inner merge may return more rows than the source has records --
    # presumably when merge keys are not unique; verify against FileMerger.
    share = round(n_matches/n_records*100,0) if n_records else float("nan")

    print(f"Datensätze: {n_records}.")
    print(f"Übereinstimmungen mit CAD-Datenfile: {n_matches} ({share}%)")


---
NPA-Datenfile
Datensätze: 416.
Übereinstimmungen mit CAD-Datenfile: 428 (103.0%)
---
HM-Datenfile
Datensätze: 42.
Übereinstimmungen mit CAD-Datenfile: 30 (71.0%)
---
BST-Datenfile
Datensätze: 74.
Übereinstimmungen mit CAD-Datenfile: 75 (101.0%)
---
FLT-Datenfile
Datensätze: 107.
Übereinstimmungen mit CAD-Datenfile: 68 (64.0%)
---
FM-Datenfile
Datensätze: 6438.
Übereinstimmungen mit CAD-Datenfile: 29 (0.0%)


In [9]:
# For every source except FM, write two Excel reports: the rows that
# FileMerger reports as non-matching against CAD, and the result of
# count_duplicates for the source itself.
for dataset in [df_npa, df_hm, df_bst, df_flt, df_fm]:
    # Column names look like "<SOURCE>___<field>"; the prefix names the source.
    name = dataset.columns[0].split("___")[0]+"-Datenfile"

    # Skip only the FM source. `continue` (rather than `break`) keeps the
    # remaining sources processed even if the list order changes.
    if name == "FM-Datenfile":
        continue

    print("---")
    print(name)

    # Deliberately not called `fm`: that name holds the FMLoader instance.
    pair_merger = FileMerger(files=[df_cad, dataset], how="inner")
    non_matching = pair_merger.find_non_matching_rows()
    non_matching.to_excel(f"non_matching/CAD_versus_{name}.xlsx")

    # NOTE(review): count_duplicates only receives the source itself, yet the
    # file is named "CAD_versus_..." -- confirm the naming is intended.
    duplicate_counts = count_duplicates(dataset)
    duplicate_counts.to_excel(f"duplicates/CAD_versus_{name}.xlsx")


---
NPA-Datenfile
---
HM-Datenfile
---
BST-Datenfile
---
FLT-Datenfile
