# Merging of ``DataFrames``

## Merging

In [1]:
from pathlib import Path

from pandas import DataFrame
from viedoors import CADLoader, NPALoader, FileMerger, HMLoader
from viedoors import BSTLoader, FLTLoader, FMLoader

In [2]:
# One loader per source file of object 420; the FM loader is created
# without a file argument.
cad = CADLoader(
    file="data/420/420_gesamt_20250122.xlsx",
    title="CAD",
)
npa = NPALoader(
    file="data/420/Tür Aufnahme Obj. 420.xlsx",
    title="NPA",
)
hm = HMLoader(
    file="data/420/Schrack_HM_Obj.420.xls",
    title="HM",
)
bst = BSTLoader(
    file="data/420/Sisando_BST_Obj.420.xlsx",
    title="BST",
)
flt = FLTLoader(
    file="data/420/Sisando_FLT_Obj.420.xlsx",
    title="FLT",
)
fm = FMLoader()

# Pull one frame per loader with prefixed=True (the merged output shows
# column names of the form "<TITLE>___<column>"). The loaders are queried
# in the order NPA, CAD, HM, BST, FLT, FM.
df_npa, df_cad, df_hm, df_bst, df_flt, df_fm = (
    source.get_data(prefixed=True)
    for source in (npa, cad, hm, bst, flt, fm)
)

In [3]:
# Combine all six frames with how="left"; df_cad is listed first
# (presumably it is the base of the left merge -- confirm against FileMerger).
merger = FileMerger(
    files=[df_cad, df_npa, df_hm, df_bst, df_flt, df_fm],
    how="left",
)
merge = merger.get_data_merge()

In [4]:
# Display the rows whose "merge" value already occurred in an earlier row
# (the first occurrence of each value is not flagged).
repeated_key = merge.duplicated(subset="merge")
merge.loc[repeated_key]

Unnamed: 0,CAD___gar_tuernummer_bauteil,CAD___gar_tuernummer_ebene,CAD___gar_tuernummer_modul,CAD___gar_tuernummer_aks_nr,CAD___gar_tuernummer_nummer,CAD___gar_tuernummer_alt,CAD___gar_flucht_tuer_nr,CAD___gar_tuer_breite,CAD___gar_tuer_hoehe,CAD___gar_bsk,...,FM___nettacount,FM___tax,FM___gesuanbot,FM___arbeitszeit,FM___material,FM___schadensmeldung,FM___aufnahme,FM___kleinregie_summe,FM___zyklus,FM___integration_aks
13,420,0,A,205,1,002,--,165,200+OL82,T30,...,,,,,,,,,,
14,420,0,A,205,1,036E,--,90,200,---,...,,,,,,,,,,
15,420,0,A,205,1,036E,--,90,200,---,...,,,,,,,,,,
27,420,0,A,408,1,0A0408.01,--,110,225,---,...,,,,,,,,,,
74,420,0,A,1205,2,010A,--,157,200,T30,...,,,,,,,,,,
75,420,0,A,1205,2,012,--,85,200,T30,...,,,,,,,,,,
76,420,0,A,1205,2,012,--,85,200,T30,...,,,,,,,,,,
110,420,0,A,2001,2,052B,55,---,---,---,...,,,,,,,,,,
111,420,0,A,2001,2,052,55,78,220,---,...,,,,,,,,,,
112,420,0,A,2001,2,052,55,78,220,---,...,,,,,,,,,,


In [5]:
# Display the frame with only the first row kept for every "merge" value.
merge.drop_duplicates(subset="merge")

Unnamed: 0,CAD___gar_tuernummer_bauteil,CAD___gar_tuernummer_ebene,CAD___gar_tuernummer_modul,CAD___gar_tuernummer_aks_nr,CAD___gar_tuernummer_nummer,CAD___gar_tuernummer_alt,CAD___gar_flucht_tuer_nr,CAD___gar_tuer_breite,CAD___gar_tuer_hoehe,CAD___gar_bsk,...,FM___nettacount,FM___tax,FM___gesuanbot,FM___arbeitszeit,FM___material,FM___schadensmeldung,FM___aufnahme,FM___kleinregie_summe,FM___zyklus,FM___integration_aks
0,420,00,A,0104,01,036D,--,90,200,---,...,,,,,,,,,,
1,420,00,A,0106,01,001,65,170,200,---,...,,,,,,,,,,
2,420,00,A,0106,02,004,63,157,200,T30,...,,,1,1,1,0,0,0,589,420 00A0106.02
3,420,00,A,0108,01,029A,--,165,200,T30,...,,,,,,,,,,
4,420,00,A,0108,02,003A,--,85,200,---,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438,420,K1,A,2108,01,5K/1,--,80,200,Nein,...,,,,,,,,,,
439,420,K1,A,2205,01,5K/2,--,80,200,Nein,...,,,,,,,,,,
440,420,K1,A,2308,01,01A,--,112,243,Nein,...,,,,,,,,,,
441,420,K1,A,2506,01,5K/3,--,80,200,Nein,...,,,,,,,,,,


## Analysis and Storing

In [6]:
# Write the merged frame to an Excel file. The target folder is created
# first because to_excel does not create missing parent directories and
# would otherwise fail on a fresh checkout.
match_file = Path("matching") / "420_match_file.xlsx"
match_file.parent.mkdir(parents=True, exist_ok=True)
merge.to_excel(match_file)

In [7]:
# For every non-CAD source: inner-merge it with the CAD frame and report
# how many rows the merge yields relative to the size of the source.
for dataset in [df_npa, df_hm, df_bst, df_flt, df_fm]:
    print("---")
    # The source name is the part of the first column name before "___".
    print(dataset.columns[0].split("___")[0]+"-Datenfile")

    # Deliberately NOT called `fm`: that name holds the FMLoader instance
    # created earlier, and rebinding it here would break a re-run of the
    # loading cell.
    pair_merger = FileMerger(files=[df_cad, dataset], how="inner")

    n_records = len(dataset)
    n_matches = len(pair_merger.get_data_merge())

    # Guard against an empty source, which would otherwise raise
    # ZeroDivisionError.
    # NOTE(review): the merged row count can exceed the record count
    # (a share above 100%), presumably because of repeated merge keys --
    # confirm against FileMerger.
    share = round(n_matches / n_records * 100, 0) if n_records else 0.0

    print(f"Datensätze: {n_records}.")
    print(f"Übereinstimmungen mit CAD-Datenfile: {n_matches} ({share}%)")


---
NPA-Datenfile
Datensätze: 417.
Übereinstimmungen mit CAD-Datenfile: 416 (100.0%)
---
HM-Datenfile
Datensätze: 42.
Übereinstimmungen mit CAD-Datenfile: 30 (71.0%)
---
BST-Datenfile
Datensätze: 74.
Übereinstimmungen mit CAD-Datenfile: 75 (101.0%)
---
FLT-Datenfile
Datensätze: 100.
Übereinstimmungen mit CAD-Datenfile: 68 (68.0%)
---
FM-Datenfile
Datensätze: 6438.
Übereinstimmungen mit CAD-Datenfile: 29 (0.0%)


In [8]:
# to_excel does not create missing folders, so make sure both report
# directories exist before the loop starts writing into them.
for folder in ("non_matching", "duplicates"):
    Path(folder).mkdir(parents=True, exist_ok=True)

# For every non-CAD source: inner-merge it with the CAD frame, then export
# the non-matching rows and the duplicates reported by the merger.
for dataset in [df_npa, df_hm, df_bst, df_flt, df_fm]:
    # The source name is the part of the first column name before "___".
    name = dataset.columns[0].split("___")[0]+"-Datenfile"
    print("---")
    print(name)

    # Deliberately NOT called `fm`: that name holds the FMLoader instance
    # created earlier, and rebinding it here would break a re-run of the
    # loading cell.
    pair_merger = FileMerger(files=[df_cad, dataset], how="inner")

    non_matching = pair_merger.find_non_matching_rows()
    non_matching.to_excel(f"non_matching/CAD_versus_{name}.xlsx")

    duplicates = pair_merger.find_duplicates()
    duplicates.to_excel(f"duplicates/CAD_versus_{name}.xlsx")



---
NPA-Datenfile
    CAD___gar_tuernummer_bauteil CAD___gar_tuernummer_ebene  \
31                           420                         00   
83                           420                         00   
84                           420                         00   
85                           420                         00   
86                           420                         00   
..                           ...                        ...   
578                          NaN                        NaN   
579                          NaN                        NaN   
602                          NaN                        NaN   
603                          NaN                        NaN   
604                          NaN                        NaN   

    CAD___gar_tuernummer_modul CAD___gar_tuernummer_aks_nr  \
31                           A                        0408   
83                           A                        2601   
84                           A         

KeyboardInterrupt: 