# Merging of ``DataFrames``

## Merging

In [1]:
from pandas import DataFrame, concat
from viedoors import CADLoader, NPALoader, FileMerger, HMLoader
from viedoors import BSTLoader, FLTLoader, FMLoader, eliminate_duplicates

In [2]:
obj = "420"

cad = CADLoader(file=f"data/{obj}/cad.xlsx", title="CAD")
npa = NPALoader(file=f"data/{obj}/npa.xlsx", title="NPA")
hm = HMLoader(file=f"data/{obj}/hm.xls", title="HM")
bst = BSTLoader(file=f"data/{obj}/bst.xlsx", title="BST")
flt = FLTLoader(file=f"data/{obj}/flt.xlsx", title="FLT")
fm = FMLoader()

df_npa = npa.get_data(prefixed=True)
df_cad = cad.get_data(prefixed=True)
df_hm = hm.get_data(prefixed=True)
df_bst = bst.get_data(prefixed=True)
df_flt = flt.get_data(prefixed=True)
df_fm = fm.get_data(prefixed=True)

# Merge Files

In [9]:
merger = FileMerger(files=[df_cad, df_npa, df_hm, df_bst, df_flt, df_fm], how="left")
merge = merger.get_data_merge()

## Test Consolidation

In [None]:
merge = eliminate_duplicates(merge, "CAD___gar_tuernummer_alt", "NPA___alte_tuernummer")
merge = eliminate_duplicates(merge, "CAD___gar_tuernummer_alt", "HM___tuer_nr_alt")
merge = eliminate_duplicates(merge, "CAD___gar_flucht_tuer_nr", "NPA___fluchtwegs_tuer_nr")

In [12]:
merge[merge["merge"].duplicated()]

Unnamed: 0,CAD___gar_tuernummer_bauteil,CAD___gar_tuernummer_ebene,CAD___gar_tuernummer_modul,CAD___gar_tuernummer_aks_nr,CAD___gar_tuernummer_nummer,CAD___gar_tuernummer_alt,CAD___gar_flucht_tuer_nr,CAD___gar_tuer_breite,CAD___gar_tuer_hoehe,CAD___gar_bsk,...,FM___nettacount,FM___tax,FM___gesuanbot,FM___arbeitszeit,FM___material,FM___schadensmeldung,FM___aufnahme,FM___kleinregie_summe,FM___zyklus,FM___integration_aks
14,420,0,A,205,1,036E,--,90,200,---,...,,,,,,,,,,
76,420,0,A,1205,2,012,--,85,200,T30,...,,,,,,,,,,
111,420,0,A,2001,2,052,55,78,220,---,...,,,,,,,,,,
116,420,0,A,2001,3,052E,--,250,300,---,...,,,,,,,,,,
164,420,0,A,2601,3,045A,--,750,452,---,...,,,,,,,,,,
304,420,0,B,1303,2,---,--,90,200+OL70,---,...,,,,,,,,,,
315,420,0,B,1406,1,---,--,60,198,---,...,,,,,,,,,,
316,420,0,B,1406,1,---,--,60,198,---,...,,,,,,,,,,
317,420,0,B,1406,1,---,--,60,198,---,...,,,,,,,,,,
400,420,2,A,416,1,217,5,85+45,210+OL55,T30,...,,,,,1.0,0.0,0.0,1.0,589.0,420 02A0416.01


## Analysis and Storing

In [18]:
merge.to_excel("matching/420_match_file.xlsx")

OSError: Cannot save file into a non-existent directory: 'matching'

In [13]:
for dataset in [df_npa, df_hm, df_bst, df_flt, df_fm]:
    print("---")
    print(dataset.columns[0].split("___")[0]+"-Datenfile")

    fm = FileMerger(files=[df_cad, dataset], how="inner")

    a = len(dataset)
    b = len(fm.get_data_merge())

    print(f"Datensätze: {a}.")
    print(f"Übereinstimmungen mit CAD-Datenfile: {b} ({round(b/a*100,0)}%)")


---
NPA-Datenfile
Datensätze: 346.
Übereinstimmungen mit CAD-Datenfile: 323 (93.0%)
---
HM-Datenfile
Datensätze: 33.
Übereinstimmungen mit CAD-Datenfile: 0 (0.0%)
---
BST-Datenfile
Datensätze: 96.
Übereinstimmungen mit CAD-Datenfile: 93 (97.0%)
---
FLT-Datenfile
Datensätze: 106.
Übereinstimmungen mit CAD-Datenfile: 0 (0.0%)
---
FM-Datenfile
Datensätze: 6438.
Übereinstimmungen mit CAD-Datenfile: 0 (0.0%)


In [8]:
for dataset in [df_npa, df_hm, df_bst, df_flt, df_fm]:
    name = dataset.columns[0].split("___")[0]+"-Datenfile"
    print("---")
    print(name)

    fm = FileMerger(files=[df_cad, dataset], how="inner")
    nm = fm.find_non_matching_rows()
    nm.to_excel(f"non_matching/CAD_versus_{name}.xlsx")

    dp = fm.find_duplicates()
    dp.to_excel(f"duplicates/CAD_versus_{name}.xlsx")



---
NPA-Datenfile
    CAD___gar_tuernummer_bauteil CAD___gar_tuernummer_ebene  \
31                           420                         00   
83                           420                         00   
84                           420                         00   
85                           420                         00   
86                           420                         00   
..                           ...                        ...   
578                          NaN                        NaN   
579                          NaN                        NaN   
602                          NaN                        NaN   
603                          NaN                        NaN   
604                          NaN                        NaN   

    CAD___gar_tuernummer_modul CAD___gar_tuernummer_aks_nr  \
31                           A                        0408   
83                           A                        2601   
84                           A         

KeyboardInterrupt: 