In [38]:
import pandas as pd
import numpy as np
import os
import glob
import datetime

In [39]:
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)
pd.set_option('display.min_rows', 100)
pd.set_option('display.expand_frame_repr', True)

## Read SFTP File

In [40]:
import pandas as pd
from google.cloud import storage
import io

class Storage_Bucket_Operations:

    def __init__(self):
        self.bucket_name = "miag-m360-test-bucket"
        self.download_files_path = "Downloaded Files"

    def readFromBucket(self, sftp_file):
        client = storage.Client(project='cf-hada-bsc-mcctk-mia-kg')
        bucket = client.get_bucket(self.bucket_name)
        blob = bucket.blob(f"{sftp_file}")
        csv_data = blob.download_as_text()
        sftp_df = pd.read_csv(io.StringIO(csv_data), index_col=False, dtype = {"Store":str,"Supplier number (MIAG)":str,"Remittance advice number":str,"Supplier number (Sales Line)":str,"Document number":str, "Invoice number":str})
        return sftp_df

storage_bucket_ops = Storage_Bucket_Operations()
sftp_df = storage_bucket_ops.readFromBucket("miag.35.258800.20240920.620.csv")

In [41]:
sftp_df.shape

(26, 18)

## Update SDP

In [42]:
import pandas as pd
from sqlalchemy import create_engine, text
import pandasql as ps

class DB_Instance_Operations:

    def __init__(self):
        self.db_url = (
            "postgresql+psycopg2://postgres:9rk$Y}gib9kZEucj@10.32.111.54:5432/MIAG-M360_UAT"
            "?sslmode=require"
            "&sslrootcert=C:/Users/sappidi.reddy/Downloads/hada-bsc-miag-m360-psql-pp-server-ca.pem"
            "&sslcert=C:/Users/sappidi.reddy/Downloads/hada-bsc-miag-m360-psql-pp-client-cert.pem"
            "&sslkey=C:/Users/sappidi.reddy/Downloads/hada-bsc-miag-m360-psql-pp-client-key.pem"
        )
        self.engine = create_engine(self.db_url)

    def readSDPTable(self):
        # query = "Delete from sdp_pool;"
        # with self.engine.connect() as connection:
        #     connection.execute(text(query))
        #     connection.commit()
        # print("Rows deleted")
        query = "select * from sdp_pool"
        sdp_df = pd.read_sql_query(query, self.engine)
        return sdp_df

    def updateSDP(self, sdp, sftp_df):
        sq1 = "SELECT DISTINCT `Supplier number (Sales Line)`, `Supplier number (MIAG)`, `Supplier name`, `Contract area` FROM sftp_df"
        miag2 = ps.sqldf(sq1, locals())
        if sdp.shape[0] == 0:
            sdp = miag2.copy()
        else:
            sdpq = "Select distinct * from sdp"
            sdp_dist_df = ps.sqldf(sdpq)
            new_supp_in_sftp_query = "SELECT * from sftp_df where `Supplier number (Sales Line)` not in (SELECT `Supplier_Number_Sales` FROM sdp)"
            new_supp_df = ps.sqldf(new_supp_in_sftp_query)
            push_to_sdp_query = "Select `Supplier number (Sales Line)`, `Supplier number (MIAG)`, `Supplier name`, `Contract area` from new_supp_df union Select `Supplier_Number_Sales`, `Supplier_Number_MIAG`, `Supplier_Name`, `Contract_Area` from sdp"
            sdp = ps.sqldf(push_to_sdp_query)
        return sdp

    def writeSDPTable(self, sdp_df):
        column_mapping = {
            'Supplier number (Sales Line)': 'Supplier_Number_Sales',
            'Supplier number (MIAG)': 'Supplier_Number_MIAG',
            'Supplier name': 'Supplier_Name',
            'Contract area': 'Contract_Area'
        }
        sdp_df.rename(columns=column_mapping, inplace=True)
        with self.engine.begin() as connection:
            delete_query = text("Delete from sdp_pool")
            connection.execute(delete_query)
            sdp_df.to_sql('sdp_pool', connection, if_exists='append', index=False)
        print("Written back to SDP Table of DB Instance...")

    def getSupplierNumberForMMSIC(self):
        new_supplier_list_for_mmsic = []
        supplier_list_for_mmsic = self.readSDPTable()['Supplier_Number_Sales'].to_list()
        for i in range(len(supplier_list_for_mmsic)):
            new_supplier_list_for_mmsic.append(str(0) + supplier_list_for_mmsic[i][1:])
        return new_supplier_list_for_mmsic

    def getSupplierNumberForSISIC(self):
        new_supplier_list_for_sisic = []
        supplier_list_for_sisic = self.readSDPTable()['Supplier_Number_Sales'].to_list()
        for i in range(len(supplier_list_for_sisic)):
            new_supplier_list_for_sisic.append(str(1) + supplier_list_for_sisic[i][1:])
        return new_supplier_list_for_sisic

    def getSupplierNumberForFI(self):
        new_supplier_list_for_fi = []
        supplier_list_for_fi = self.readSDPTable()['Supplier_Number_Sales'].to_list()
        for i in range(len(supplier_list_for_fi)):
            new_supplier_list_for_fi.append(supplier_list_for_fi[i][5:])
        return new_supplier_list_for_fi

    def writeRawICTable(self, extracted_ic_df):
        column_mapping = {
            'LIFNR': 'LIFNR',
            'BELNR': 'BELNR',
            'RENR': 'RENR',
            'REDAT': 'REDAT',
            'LFSNR': 'LFSNR',
            'GEBRF': 'GEBRF',
            'GSMWB': 'GSMWB',
            'GSMWF': 'GSMWF',
            'WAERS': 'WAERS',
            'WENUM': 'WENUM',
            'RGDAT': 'RGDAT',
            'ABGST': 'ABGST',
            'AUFNR': 'AUFNR',
            'VORGN': 'VORGN',
            'GJAHR': 'GJAHR',
            'WEDAT': 'WEDAT',
            'DEBNOTNO': 'DEBNOTNO'
        }
        extracted_ic_df.rename(columns=column_mapping, inplace=True)
        with self.engine.connect() as connection:
            delete_query = text("Delete from raw_ic")
            connection.execute(delete_query)
            connection.commit()
            extracted_ic_df.to_sql('raw_ic', self.engine, if_exists='append', index=False)
        print("Written to Intermediate IC Table of DB Instance...")

    def writeRawFITable(self, extracted_fi_df):
        column_mapping = {
                'MANDT': 'MANDT',
                'Document_type': 'Document_type',
                'document_type_desc': 'document_type_desc',
                'GJAHR': 'GJAHR',
                'BUKRS': 'BUKRS',
                'GSBER': 'GSBER',
                'PRCTR': 'PRCTR',
                'store_or_dc': 'store_or_dc',
                'KOSTL': 'KOSTL',
                'month_in_fin_year': 'month_in_fin_year',
                'BELNR': 'BELNR',
                'XBLNR': 'XBLNR',
                'AUGBL': 'AUGBL',
                'AUGDT': 'AUGDT',
                'ZFBDT': 'ZFBDT',
                'ZBD1T': 'ZBD1T',
                'ZBD2T': 'ZBD2T',
                'NETDT': 'NETDT',
                'BUZEI': 'BUZEI',
                'altkt': 'altkt',
                'hkont': 'hkont',
                'suppl_no': 'suppl_no',
                'BLDAT': 'BLDAT',
                'BUDAT': 'BUDAT',
                'CPUDT': 'CPUDT',
                'partition_date': 'partition_date',
                'dana_ingestion_date': 'dana_ingestion_date',
                'shkzg': 'shkzg',
                'Amount_in_local_currency': 'Amount_in_local_currency',
                'Amount_in_document_currency': 'Amount_in_document_currency',
                'Tax_in_local_currency': 'Tax_in_local_currency',
                'Tax_in_document_currency': 'Tax_in_document_currency',
                'WAERS': 'WAERS',
                'Batch_Input_session_name': 'Batch_Input_session_name',
                'sgtxt': 'sgtxt'
}

        extracted_fi_df.rename(columns=column_mapping, inplace=True)
        with self.engine.connect() as connection:
            delete_query = text("Delete from raw_fi")
            connection.execute(delete_query)
            connection.commit()
        extracted_fi_df.to_sql('raw_fi', self.engine, if_exists='append', index=False)
        print("Written to Intermediate FI Table of DB Instance...")

    def readRawICTable(self):
        query = "select * from raw_ic"
        df_ic = pd.read_sql_query(query, self.engine)
        return df_ic

    def readRawFITable(self):
        query = "select * from raw_fi"
        df_fi = pd.read_sql_query(query, self.engine)
        return df_fi

db_instance_ops = DB_Instance_Operations()
sdp_df = db_instance_ops.readSDPTable()
sdp_df = db_instance_ops.updateSDP(sdp_df, sftp_df)
db_instance_ops.writeSDPTable(sdp_df)
sdp_supplier_list_for_mmsic = db_instance_ops.getSupplierNumberForMMSIC()
sdp_supplier_list_for_sisic = db_instance_ops.getSupplierNumberForSISIC()
sdp_supplier_list_for_fi = db_instance_ops.getSupplierNumberForFI()

Written back to SDP Table of DB Instance...


## Get Supplier Numbers

In [43]:
sdp_supplier_list_for_mmsic

['0000010074', '0000010095']

In [44]:
sdp_supplier_list_for_sisic

['1000010074', '1000010095']

In [45]:
sdp_supplier_list_for_fi

['10074', '10095']

## Extract MMSIC, SIS, FI data from BigQuery

In [151]:
from google.cloud import bigquery


class BigQuery_Operations:
    def __init__(self):
        self.client = bigquery.Client()

    def extract_MMSIC(self, sdp_supplier_list_for_mmsic):
        add_string = ""
        for i in range(len(sdp_supplier_list_for_mmsic)):
            add_string += "'"
            add_string += str(sdp_supplier_list_for_mmsic[i])
            add_string += "'"
            add_string += ", "
        add_string = add_string[:-2]
        query = f"""
                select LIFNR,BELNR,RENR, REDAT, LFSNR, GEBRF, GSMWB, GSMWF,
              WAERS,WENUM,RGDAT,ABGST,AUFNR,T1.VORGN, T1.GJAHR,T2.WEDAT, DEBNOTNO 
              from 
              `metro-bi-dl-tur-prod.ingest_fgtf_mmsic.mmsic_to_dana_gr_invoice_header` AS T1
            LEFT JOIN 
            (SELECT DISTINCT  VORGN , WEDAT, GJAHR 
            from
            `metro-bi-dl-tur-prod.ingest_fgtf_mmsic.mmsic_to_dana_gr_table_header`) AS T2
            ON 
            T1.VORGN = T2.VORGN
            WHERE
            T1.LIFNR IN ({add_string})
            and
            T1.REDAT >= '20230901'
            ORDER BY 
            T1.dana_ingestion_timestamp ;
                """
        extracted_mmsic_df = self.client.query(query).to_dataframe()
        return extracted_mmsic_df

    def extract_SISIC(self, sdp_supplier_list_for_sisic):
        add_string = ""
        for i in range(len(sdp_supplier_list_for_sisic)):
            add_string += "'"
            add_string += str(sdp_supplier_list_for_sisic[i])
            add_string += "'"
            add_string += ", "
        add_string = add_string[:-2]
        query = f"""
                select LIFNR,BELNR,RENR, REDAT, LFSNR, GEBRF, GSMWB, GSMWF,
              WAERS,WENUM,RGDAT,ABGST,AUFNR,T1.VORGN, T1.GJAHR,T2.WEDAT, DEBNOTNO 
              from 
              `metro-bi-dl-tur-prod.ingest_fgtf_mmsic.sis_to_dana_gr_invoice_header` AS T1
            LEFT JOIN 
            (SELECT DISTINCT  VORGN , WEDAT, GJAHR 
            from
            `metro-bi-dl-tur-prod.ingest_fgtf_mmsic.sis_to_dana_gr_table_header`) AS T2
            ON 
            T1.VORGN = T2.VORGN
            WHERE
            T1.LIFNR IN ({add_string})
            and
            T1.REDAT >= '20230901'
            ORDER BY 
            T1.dana_ingestion_timestamp ;
                """
        extracted_sisic_df = self.client.query(query).to_dataframe()
        return extracted_sisic_df

    def extract_FI(self, sdp_supplier_list_for_fi):
        add_string = ""
        for i in range(len(sdp_supplier_list_for_fi)):
            add_string += str(sdp_supplier_list_for_fi[i])
            add_string += ", "
        add_string = add_string[:-2]
        query = f"""
                  DECLARE country STRING DEFAULT 'tur';
        DECLARE current_fiscal_year INT64 DEFAULT 2024;
        DECLARE store_flag STRING DEFAULT 'prctr';-- or 'gsber';
        DECLARE end_month_id INT64 DEFAULT EXTRACT(YEAR FROM DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH)) * 100 + EXTRACT(MONTH FROM DATE_SUB(CURRENT_DATE(), INTERVAL 1 MONTH));    
        DECLARE start_year INT64 DEFAULT 2024;
        DECLARE end_year INT64 DEFAULT 2024;
        CREATE OR REPLACE TABLE metro-bi-wb-mag-figov-s00.data_integrity_proj.sap_tur_360data_BELNR_testdoc
        AS(
        WITH fidoc AS (
        SELECT * ,
        MAX  (dana_ingestion_timestamp) over (PARTITION BY MANDT, BELNR, GJAHR, BUKRS, bseg.BUZEI) as max_timestamp,
        ROW_NUMBER () over (PARTITION BY MANDT, BELNR, GJAHR, BUKRS, bseg.BUZEI order by dana_ingestion_timestamp DESC) as rn
        FROM metro-bi-dl-tur-prod.ingest_fgtf_sap.fidoc fi,
        UNNEST (zbseg) AS bseg
        WHERE 1=1
        AND gjahr BETWEEN start_year AND end_year
        ),
        fidoc_unique AS (
        SELECT *
        , CASE  WHEN store_flag = 'gsber' THEN fi.gsber
            WHEN store_flag = 'prctr' THEN fi.prctr
        END AS business_area
        FROM fidoc fi
        WHERE fi.dana_ingestion_timestamp  = max_timestamp
        AND rn = 1
        )
        SELECT
        fi.MANDT
        , zbkpf.blart AS Document_type
        , doc_type.ltext as document_type_desc
        , GJAHR
        , BUKRS
        , GSBER
        , PRCTR
        , cast(prctr as int64)-cast(bukrs as int64)*10000 as store_or_dc
        , KOSTL
        , zbkpf.monat as month_in_fin_year
        , BELNR
        , zbkpf.XBLNR
        --, AUFNR
        , AUGBL
        , AUGDT
        , ZFBDT
        , ZBD1T
        , ZBD2T
        , NETDT
        , BUZEI
        , altkt
        , hkont
        , MOD(SAFE_CAST(fi.z_dana_lfa1.lifnr AS int64), 100000) as suppl_no
        , zbkpf.BLDAT
        , zbkpf.BUDAT
        , zbkpf.CPUDT
        , date(fi.PARTITIONTIME) partition_date
        , date(fi.dana_ingestion_timestamp) dana_ingestion_date
        , shkzg
        ,      CASE WHEN shkzg = 'H' THEN (-1) * fi.dmbtr
                    ELSE fi.dmbtr
                    END                 as Amount_in_local_currency
        ,      CASE WHEN shkzg = 'H' THEN (-1) * fi.wrbtr
                    ELSE fi.wrbtr
                    END                 as Amount_in_document_currency
        ,      CASE WHEN shkzg = 'H' THEN (-1) * fi.mwsts
                    ELSE fi.mwsts
                    END                 as Tax_in_local_currency
        ,      CASE WHEN shkzg = 'H' THEN (-1) * fi.wmwst
                    ELSE fi.wmwst
                    END                 as Tax_in_document_currency
        , zbkpf.WAERS
        , ZBKPF.GRPID AS Batch_Input_session_name
        , sgtxt
        -- *
        FROM fidoc_unique fi
        LEFT JOIN
        ( select * from metro-bi-dl-tur-prod.ingest_fgtf_sap.t003t AS doc_type
            WHERE 1=1
            AND doc_type.spras = 'EN'
            qualify dana_ingestion_timestamp = max(dana_ingestion_timestamp) over (partition by BLART, MANDT, SPRAS, SYSID)
            order by doc_type.blart
        ) AS doc_type
            ON zbkpf.blart = doc_type.blart
        where 1=1    
        and MOD(SAFE_CAST(fi.z_dana_lfa1.lifnr AS int64), 100000) IN ({add_string}) 
        )
                """
        extracted_fi_full_df = self.client.query(query).to_dataframe()
        query2 = f"select * from `metro-bi-wb-mag-figov-s00.data_integrity_proj.sap_tur_360data_BELNR_testdoc`"
        extracted_fi_df = self.client.query(query2).to_dataframe()
        # extracted_fi_df = extracted_fi_df.drop('BUZEI', axis=1)
        return extracted_fi_df

bq_ops = BigQuery_Operations()
mmsic_df = bq_ops.extract_MMSIC(sdp_supplier_list_for_mmsic)
sisic_df = bq_ops.extract_SISIC(sdp_supplier_list_for_sisic)
df_fi = bq_ops.extract_FI(sdp_supplier_list_for_fi)



## Suitable conversion of MMSIC

In [125]:
mmsic_df.shape

(31286, 17)

In [128]:
# 1. Strip first 5 digits from the value in column LIFNR
# Strip leading zeros from the LIFNR column
mmsic_df['LIFNR'] = mmsic_df['LIFNR'].apply(lambda x: str(int(x)) if pd.notna(x) else x)

# 2. Clean BELNR: Strip spaces, replace '' with NaN, and remove leading 0s
mmsic_df['BELNR'] = mmsic_df['BELNR'].str.strip()  # Remove leading/trailing spaces
mmsic_df['BELNR'] = mmsic_df['BELNR'].replace(r'^\s*$', np.nan, regex=True)  # Replace empty strings/spaces with NaN
mmsic_df['BELNR'] = mmsic_df['BELNR'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)  # Remove leading 0s

# 3. Remove leading 0s from LFSNR
mmsic_df['LFSNR'] = mmsic_df['LFSNR'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)

# 4. Clean WENUM: Replace all 0's with one '0', else remove leading zeros
mmsic_df['WENUM'] = mmsic_df['WENUM'].apply(
    lambda x: '0' if x == '0'*len(str(x)) else str(x).lstrip('0') if pd.notna(x) else x
)

# 5. Remove leading 0s from ABGST
mmsic_df['ABGST'] = mmsic_df['ABGST'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)

# 6. Remove leading 0s from AUFNR
mmsic_df['AUFNR'] = mmsic_df['AUFNR'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)

# 7. Replace None with NaN in WEDAT
mmsic_df['WEDAT'] = mmsic_df['WEDAT'].fillna(value=np.nan)

# Replace '' or consecutive spaces with NaN and remove leading zeros in column DEBNOTNO
mmsic_df['DEBNOTNO'] = mmsic_df['DEBNOTNO'].replace(r'^\s*$', np.nan, regex=True)  # Replace '' or spaces with NaN
mmsic_df['DEBNOTNO'] = mmsic_df['DEBNOTNO'].apply(lambda x: str(int(x)) if pd.notna(x) else x)  # Remove leading zeros

In [129]:
mmsic_df

Unnamed: 0,LIFNR,BELNR,RENR,REDAT,LFSNR,GEBRF,GSMWB,GSMWF,WAERS,WENUM,RGDAT,ABGST,AUFNR,VORGN,GJAHR,WEDAT,DEBNOTNO
0,10074,,AIB2023000019333,20230905,26474,106864.80,0.00,17810.80,TRY,249942,20230906,52,45296886,20230000386564,2023,,
1,10074,,AIB2023000019333,20230905,26474,106864.80,0.00,17810.80,TRY,249942,20230906,52,45296886,20230000386564,2023,,
2,10074,,AIB2023000019917,20230912,27386,7008.00,0.00,1168.00,TRY,251671,20230913,52,45296591,20230000397341,2023,,
3,10074,,AIB2023000019918,20230912,27387,15768.00,0.00,2628.00,TRY,251672,20230913,52,45296886,20230000397342,2023,,
4,10074,,AIB2023000019919,20230912,27388,282409.20,0.00,47068.20,TRY,251673,20230913,52,45297916,20230000397343,2023,,
5,10074,,AIB2023000019917,20230912,27386,7008.00,0.00,1168.00,TRY,251671,20230913,52,45296591,20230000397341,2023,,
6,10074,,AIB2023000019918,20230912,27387,15768.00,0.00,2628.00,TRY,251672,20230913,52,45296886,20230000397342,2023,,
7,10074,,AIB2023000019919,20230912,27388,282409.20,0.00,47068.20,TRY,251673,20230913,52,45297916,20230000397343,2023,,
8,10074,,AIB2023000020499,20230919,28204,192368.40,0.00,32061.40,TRY,253530,20230922,52,45299011,20230000412848,2023,,
9,10074,,AIB2023000020499,20230919,28204,192368.40,0.00,32061.40,TRY,253530,20230922,52,45299011,20230000412848,2023,,


In [131]:
mmsic_df.dtypes

LIFNR       object
BELNR       object
RENR        object
REDAT       object
LFSNR       object
GEBRF       object
GSMWB       object
GSMWF       object
WAERS       object
WENUM       object
RGDAT       object
ABGST       object
AUFNR       object
VORGN       object
GJAHR       object
WEDAT       object
DEBNOTNO    object
dtype: object

##  Suitable conversion of SISIC

In [145]:
sisic_df.shape

(22626, 17)

In [146]:
# 2. Clean BELNR: Strip spaces, replace '' with NaN, and remove leading 0s
sisic_df['BELNR'] = sisic_df['BELNR'].str.strip()  # Remove leading/trailing spaces
sisic_df['BELNR'] = sisic_df['BELNR'].replace(r'^\s*$', np.nan, regex=True)  # Replace empty strings/spaces with NaN
sisic_df['BELNR'] = sisic_df['BELNR'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)  # Remove leading 0s

# 3. Remove leading 0s from LFSNR
sisic_df['LFSNR'] = sisic_df['LFSNR'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)

# 4. Clean WENUM: Replace all 0's with one '0', else remove leading zeros
sisic_df['WENUM'] = sisic_df['WENUM'].apply(
    lambda x: '0' if x == '0'*len(str(x)) else str(x).lstrip('0') if pd.notna(x) else x
)

# 5. Remove leading 0s from ABGST
sisic_df['ABGST'] = sisic_df['ABGST'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)

# 6. Remove leading 0s from AUFNR
sisic_df['AUFNR'] = sisic_df['AUFNR'].apply(lambda x: x.lstrip('0') if pd.notna(x) else x)

# 7. Replace None with NaN in WEDAT
sisic_df['WEDAT'] = sisic_df['WEDAT'].fillna(value=np.nan)

# Replace empty strings, single space, or consecutive spaces in 'VORGN' and 'DEBNOTNO' with NaN
sisic_df['VORGN'] = sisic_df['VORGN'].replace(r'^\s*$', np.nan, regex=True)  # For VORGN
sisic_df['DEBNOTNO'] = sisic_df['DEBNOTNO'].replace(r'^\s*$', np.nan, regex=True)  # For DEBNOTNO

In [147]:
sisic_df

Unnamed: 0,LIFNR,BELNR,RENR,REDAT,LFSNR,GEBRF,GSMWB,GSMWF,WAERS,WENUM,RGDAT,ABGST,AUFNR,VORGN,GJAHR,WEDAT,DEBNOTNO
0,1000010074,810290849,AIF2024000000533,20240517,14582,358313.28,0.00,59718.88,TRY,316899,20240527,258,45333498,20240003353472,2024,20240517,
1,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20240411,
2,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20240617,
3,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20240522,
4,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20240912,
5,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20231011,
6,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20231028,
7,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20231103,
8,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20231107,
9,1000010095,,PI02024000031396,20240531,1,1525.19,0.00,138.65,TRY,0,20240531,1,888888888,,2024,20240511,


In [148]:
sisic_df.dtypes

LIFNR       object
BELNR       object
RENR        object
REDAT       object
LFSNR       object
GEBRF       object
GSMWB       object
GSMWF       object
WAERS       object
WENUM       object
RGDAT       object
ABGST       object
AUFNR       object
VORGN       object
GJAHR       object
WEDAT       object
DEBNOTNO    object
dtype: object

## Combining MMSIC, SSIC into IC

In [149]:
df_ic = pd.concat([mmsic_df, sisic_df], ignore_index=True)

In [150]:
df_ic

Unnamed: 0,LIFNR,BELNR,RENR,REDAT,LFSNR,GEBRF,GSMWB,GSMWF,WAERS,WENUM,RGDAT,ABGST,AUFNR,VORGN,GJAHR,WEDAT,DEBNOTNO
0,10074,810290849,AIB2023000019333,20230905,26474,106864.80,0.00,17810.80,TRY,249942,20230906,52,45296886,20230000386564,2023,,
1,10074,,AIB2023000019333,20230905,26474,106864.80,0.00,17810.80,TRY,249942,20230906,52,45296886,20230000386564,2023,,
2,10074,,AIB2023000019917,20230912,27386,7008.00,0.00,1168.00,TRY,251671,20230913,52,45296591,20230000397341,2023,,
3,10074,,AIB2023000019918,20230912,27387,15768.00,0.00,2628.00,TRY,251672,20230913,52,45296886,20230000397342,2023,,
4,10074,,AIB2023000019919,20230912,27388,282409.20,0.00,47068.20,TRY,251673,20230913,52,45297916,20230000397343,2023,,
5,10074,,AIB2023000019917,20230912,27386,7008.00,0.00,1168.00,TRY,251671,20230913,52,45296591,20230000397341,2023,,
6,10074,,AIB2023000019918,20230912,27387,15768.00,0.00,2628.00,TRY,251672,20230913,52,45296886,20230000397342,2023,,
7,10074,,AIB2023000019919,20230912,27388,282409.20,0.00,47068.20,TRY,251673,20230913,52,45297916,20230000397343,2023,,
8,10074,,AIB2023000020499,20230919,28204,192368.40,0.00,32061.40,TRY,253530,20230922,52,45299011,20230000412848,2023,,
9,10074,,AIB2023000020499,20230919,28204,192368.40,0.00,32061.40,TRY,253530,20230922,52,45299011,20230000412848,2023,,


##  Suitable conversion of FI

In [153]:
df_fi.dtypes

MANDT                           object
Document_type                   object
document_type_desc              object
GJAHR                            Int64
BUKRS                           object
GSBER                           object
PRCTR                           object
store_or_dc                      Int64
KOSTL                           object
month_in_fin_year                Int64
BELNR                           object
XBLNR                           object
AUGBL                           object
AUGDT                           dbdate
ZFBDT                           dbdate
ZBD1T                          float64
ZBD2T                          float64
NETDT                           dbdate
BUZEI                            Int64
altkt                           object
hkont                           object
suppl_no                         Int64
BLDAT                           dbdate
BUDAT                           dbdate
CPUDT                           dbdate
partition_date           

In [163]:
dtype_conversion = {
    "MANDT": "int64",
    "Document_type": "object",
    "document_type_desc": "object",
    "GJAHR": "object",
    "BUKRS": "object",
    "GSBER": "float64",
    "PRCTR": "float64",
    "store_or_dc": "object",
    "KOSTL": "float64",
    "month_in_fin_year": "int64",
    "BELNR": "object",
    "XBLNR": "object",
    "AUGBL": "float64",
    "AUGDT": "datetime64[ns]",
    "ZFBDT": "datetime64[ns]",
    "ZBD1T": "Int64",
    "ZBD2T": "float64",
    "NETDT": "float64",
    "BUZEI": "object",
    "altkt": "int64",
    "hkont": "int64",
    "suppl_no": "object",
    "BLDAT": "datetime64[ns]",
    "BUDAT": "datetime64[ns]",
    "CPUDT": "datetime64[ns]",
    "partition_date": "datetime64[ns]",
    "dana_ingestion_date": "datetime64[ns]",
    "shkzg": "object",
    "Amount_in_local_currency": "float64",
    "Amount_in_document_currency": "float64",
    "Tax_in_local_currency": "float64",
    "Tax_in_document_currency": "float64",
    "WAERS": "object",
    "Batch_Input_session_name": "object",
    "sgtxt": "object"
}

# Convert the columns to the specified data types
for column, dtype in dtype_conversion.items():
    if "datetime" in dtype:
        df_fi[column] = pd.to_datetime(df_fi[column], errors='coerce')  # Handle invalid dates gracefully
    else:
        df_fi[column] = df_fi[column].astype(dtype, errors='ignore')

df_fi['store_or_dc'] = pd.to_numeric(df_fi['store_or_dc'], errors='coerce')

# Convert to nullable integer type
df_fi['store_or_dc'] = df_fi['store_or_dc'].astype('Int64')
df_fi['store_or_dc'] = df_fi['store_or_dc'].astype('object')

# 1. Remove leading 0's from BELNR column
df_fi['BELNR'] = df_fi['BELNR'].apply(lambda x: str(int(x)) if pd.notna(x) else x)

# 2. Convert NETDT from NaT to NaN and change its datatype from dbdate to float64
df_fi['NETDT'] = pd.to_numeric(df_fi['NETDT'], errors='coerce')  # Coerce NaT to NaN
df_fi['NETDT'] = df_fi['NETDT'].astype('float64')

# 3. Replace None with NaN in Batch_Input_session_name and sgtxt columns
df_fi['Batch_Input_session_name'] = df_fi['Batch_Input_session_name'].replace({None: np.nan})
df_fi['sgtxt'] = df_fi['sgtxt'].replace({None: np.nan})

In [164]:
df_fi

Unnamed: 0,MANDT,Document_type,document_type_desc,GJAHR,BUKRS,GSBER,PRCTR,store_or_dc,KOSTL,month_in_fin_year,BELNR,XBLNR,AUGBL,AUGDT,ZFBDT,ZBD1T,ZBD2T,NETDT,BUZEI,altkt,hkont,suppl_no,BLDAT,BUDAT,CPUDT,partition_date,dana_ingestion_date,shkzg,Amount_in_local_currency,Amount_in_document_currency,Tax_in_local_currency,Tax_in_document_currency,WAERS,Batch_Input_session_name,sgtxt
0,100,WE,Goods receiving (inv,2024,3142,,31426091.0,6091,,1,810008260,AIB2023000021730,,NaT,NaT,,,,3,1562110000,1136105000,10074,2023-10-03,2023-10-03,2023-10-09,2023-11-01,2023-11-01,S,28879.40,28879.40,,,TRY,3142_MMSIC,
1,100,WE,Goods receiving (inv,2024,3142,,31426091.0,6091,,1,810031055,AIB2023000022668,,NaT,NaT,,,,3,1562110000,1136105000,10074,2023-10-17,2023-10-18,2023-10-26,2023-11-01,2023-11-01,S,41147.40,41147.40,,,TRY,3142_MMSIC,
2,100,WE,Goods receiving (inv,2024,3142,,31426091.0,6091,,1,810034656,AIB2023000022938,,NaT,NaT,,,,3,1562110000,1136105000,10074,2023-10-19,2023-10-25,2023-10-30,2023-11-01,2023-11-01,S,3144.00,3144.00,,,TRY,3142_MMSIC,
3,100,WE,Goods receiving (inv,2024,3142,,31426091.0,6091,,2,810040668,AIB2023000023347,,NaT,NaT,,,,3,1562110000,1136105000,10074,2023-10-25,2023-11-01,2023-11-02,2023-11-06,2023-11-06,S,36809.20,36809.20,,,TRY,3142_MMSIC,
4,100,WE,Goods receiving (inv,2024,3142,,31426091.0,6091,,2,810043353,AIA2023000012474,,NaT,NaT,,,,3,1562110000,1136105000,10074,2023-10-31,2023-11-01,2023-11-06,2023-11-16,2023-11-16,S,23050.40,23050.40,,,TRY,3142_MMSIC,
5,100,WE,Goods receiving (inv,2024,3142,,31426091.0,6091,,2,810043369,AID2023000008457,,NaT,NaT,,,,3,1562110000,1136105000,10074,2023-10-13,2023-11-01,2023-11-06,2023-11-16,2023-11-16,S,16714.60,16714.60,,,TRY,3142_MMSIC,
6,100,WE,Goods receiving (inv,2024,3142,,31421021.0,1021,,2,810043501,AIB2022000027094,,NaT,NaT,,,,3,1562110000,1136105000,10074,2022-12-13,2023-11-01,2023-11-06,2023-11-16,2023-11-16,S,220.76,220.76,,,TRY,3142_MMSIC,
7,100,WE,Goods receiving (inv,2024,3142,,31421039.0,1039,,2,810043549,AIB2022000027096,,NaT,NaT,,,,3,1562110000,1136105000,10074,2022-12-13,2023-11-01,2023-11-06,2023-11-16,2023-11-16,S,235.58,235.58,,,TRY,3142_MMSIC,
8,100,WE,Goods receiving (inv,2024,3142,,31421041.0,1041,,2,810043550,AIB2022000027090,,NaT,NaT,,,,3,1562110000,1136105000,10074,2022-12-13,2023-11-01,2023-11-06,2023-11-16,2023-11-16,S,900.48,900.48,,,TRY,3142_MMSIC,
9,100,WE,Goods receiving (inv,2024,3142,,31421047.0,1047,,2,810043560,AIB2022000027054,,NaT,NaT,,,,3,1562110000,1136105000,10074,2022-12-13,2023-11-01,2023-11-06,2023-11-16,2023-11-16,S,51.84,51.84,,,TRY,3142_MMSIC,


In [165]:
df_fi.dtypes

MANDT                                   int64
Document_type                          object
document_type_desc                     object
GJAHR                                  object
BUKRS                                  object
GSBER                                 float64
PRCTR                                 float64
store_or_dc                            object
KOSTL                                 float64
month_in_fin_year                       int64
BELNR                                  object
XBLNR                                  object
AUGBL                                 float64
AUGDT                          datetime64[ns]
ZFBDT                          datetime64[ns]
ZBD1T                                   Int64
ZBD2T                                 float64
NETDT                                 float64
BUZEI                                  object
altkt                                   int64
hkont                                   int64
suppl_no                          

## Pre-processing Code from here

In [None]:
df_ic.to_csv(r'C:\Users\sappidi.reddy\Documents\Dummy\combined_ic.csv', index=False)

In [169]:
df_fi.shape

(545, 35)

In [170]:
df_ic.shape

(53912, 17)

In [171]:
sftp_df.shape

(26, 18)

In [172]:
df_fi = df_fi[df_fi['Document_type'] != 'PM']

In [173]:
df_fi['XBLNR'] = df_fi['XBLNR'].astype(str).apply(lambda x: x.zfill(10) if x.isnumeric() else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fi['XBLNR'] = df_fi['XBLNR'].astype(str).apply(lambda x: x.zfill(10) if x.isnumeric() else x)


In [174]:
df_fi.shape

(479, 35)

In [175]:
df_ic.shape

(53912, 17)

In [176]:
sftp_df.shape

(26, 18)

In [179]:
doc_to_type_ic = dict(zip(df_ic['BELNR'], df_ic['RENR']))

    # Fill missing values in FI_df['XBLNR'] using the mapping and FI_df['BELNR']
df_fi['XBLNR'] = df_fi['XBLNR'].fillna(df_fi['BELNR'].map(doc_to_type_ic))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_fi['XBLNR'] = df_fi['XBLNR'].fillna(df_fi['BELNR'].map(doc_to_type_ic))


In [180]:
doc_to_type_fi = dict(zip(df_fi['XBLNR'], df_fi['BELNR']))

df_ic['BELNR'] = df_ic['BELNR'].str.strip().replace('', np.nan)
    # Fill missing values in CI_df['BELNR'] using the mapping and CI_df['RENR']
df_ic['BELNR'] = df_ic['BELNR'].fillna(df_ic['RENR'].map(doc_to_type_fi))

In [184]:
grouped_fi = df_fi.groupby(['BELNR', 'suppl_no'], as_index=False)

# Step 2: Populate values for ZFBDT, ZBD1T, ZBD2T
def populate_values(group):
    group['ZFBDT'] = group['ZFBDT'].fillna(method='ffill').fillna(method='bfill')
    group['ZBD1T'] = group['ZBD1T'].fillna(method='ffill').fillna(method='bfill')
    group['ZBD2T'] = group['ZBD2T'].fillna(method='ffill').fillna(method='bfill')
    return group

df_fi_filled = grouped_fi.apply(populate_values)

# Reset index to avoid ambiguity
df_fi_filled.reset_index(drop=True, inplace=True)

# Step 3: Find the record with the highest absolute TDC, then lowest BUZEI in case of ties
def select_record(group):
    max_abs_tdc = group['Amount_in_local_currency'].abs().max()
    filtered = group[group['Amount_in_local_currency'].abs() == max_abs_tdc]
    return filtered.loc[filtered['BUZEI'].idxmin()]

df_fi = df_fi_filled.groupby(['BELNR', 'suppl_no'], as_index=False).apply(select_record).reset_index(drop=True)



  group['ZFBDT'] = group['ZFBDT'].fillna(method='ffill').fillna(method='bfill')
  group['ZBD1T'] = group['ZBD1T'].fillna(method='ffill').fillna(method='bfill')
  group['ZBD2T'] = group['ZBD2T'].fillna(method='ffill').fillna(method='bfill')
  group['ZBD1T'] = group['ZBD1T'].fillna(method='ffill').fillna(method='bfill')
  df_fi_filled = grouped_fi.apply(populate_values)
  df_fi = df_fi_filled.groupby(['BELNR', 'suppl_no'], as_index=False).apply(select_record).reset_index(drop=True)


In [185]:
df_fi.shape

(132, 35)

In [186]:
def get_max_populated_row(group):
    # Count non-null values for each row
    non_null_counts = group.notnull().sum(axis=1)
    # Get the index of the row with the maximum count
    max_index = non_null_counts.idxmax()
    return group.loc[max_index]

In [187]:
df_ic = df_ic.groupby('RENR').apply(get_max_populated_row).reset_index(drop=True)

  df_ic = df_ic.groupby('RENR').apply(get_max_populated_row).reset_index(drop=True)


In [188]:
df_ic.shape

(107, 17)

In [189]:
df_fi.shape

(132, 35)

In [190]:
sftp_df.shape

(26, 18)

## standarized the data

In [191]:
#preprocess ic/FI/sftp document numbers. make the length 10 or above

df_ic['LIFNR'] = df_ic['LIFNR'].apply(
    lambda x: '10000' + str(x) if not pd.isna(x) and len(str(x)) < 10 else str(x) if not pd.isna(x) else '0' * 10
)

df_fi['suppl_no'] = df_fi['suppl_no'].apply(
    lambda x: '10000' + str(x) if not pd.isna(x) and len(str(x)) < 10 else str(x) if not pd.isna(x) else '0' * 10
)

df_ic['BELNR'] = df_ic['BELNR'].apply(
     lambda x: '0' * (10 - len(str(x))) + str(x) if not pd.isna(x) else '0' * 10)

df_fi['BELNR'] = df_fi['BELNR'].apply(
     lambda x: '0' * (10 - len(str(x))) + str(x) if not pd.isna(x) else '0' * 10)

sftp_df['Document number'] = sftp_df['Document number'].apply(
     lambda x: '0' * (10 - len(str(x))) + str(x) if not pd.isna(x) else '0' * 10)
     


In [192]:
def transform_number(num):
    if pd.isna(num):  # Check for NaN values
        return np.nan   # Return NaN if the input is NaN
    num_str = str(num)  # Convert to string

    # If the number already starts with '10000', return it as is
    if num_str.startswith('10000'):
        return num_str
    
    stripped_num = num_str.lstrip('0')  # Remove leading zeros
    final_num = stripped_num.zfill(6)  # Ensure it has at least 6 digits
    return '1000' + final_num  # Prepend '10000'

# Apply the function to the relevant column

df_ic['LIFNR'] = df_ic['LIFNR'].apply(transform_number)

In [193]:
#PREPROCESS DATE TIME. make all %d.%m.%Y

for col in df_ic.select_dtypes(include=['datetime64[ns]', 'datetime']):
    df_ic[col] = df_ic[col].dt.strftime('%d.%m.%Y')

for col in df_fi.select_dtypes(include=['datetime64[ns]', 'datetime']):
    df_fi[col] = df_fi[col].dt.strftime('%d.%m.%Y')
    

In [194]:
df_fi['BLDAT']

0      02.10.2023
1      02.10.2023
2      02.10.2023
3      02.10.2023
4      02.10.2023
5      03.11.2023
6      31.10.2023
7      31.10.2023
8      30.11.2023
9      30.11.2023
10     31.12.2023
11     31.01.2024
12     29.02.2024
13     29.02.2024
14     31.03.2024
15     31.03.2024
16     31.03.2024
17     31.03.2024
18     30.04.2024
19     31.05.2024
20     30.06.2024
21     12.07.2024
22     31.07.2024
23     31.07.2024
24     15.08.2024
          ...    
107    22.08.2024
108    29.08.2024
109    05.09.2024
110    09.09.2024
111    20.09.2024
112    03.11.2023
113    03.11.2023
114    03.11.2023
115    03.11.2023
116    04.11.2023
117    04.11.2023
118    15.11.2023
119    23.11.2023
120    18.01.2024
121    19.01.2024
122    29.01.2024
123    18.03.2024
124    18.03.2024
125    09.05.2024
126    10.05.2024
127    18.07.2024
128    18.07.2024
129    31.07.2024
130    31.07.2024
131    15.07.2024
Name: BLDAT, Length: 132, dtype: object

In [195]:
#convert string types into %d.%m.%Y. IC frame
for col in df_ic.select_dtypes(include=['object']):
    try:
        # Convert to datetime, assuming the format is 'YYYYMMDD'
        df_ic[col] = pd.to_datetime(df_ic[col], format='%Y%m%d')
        # Optionally, you can convert it to DMY format
        df_ic[col] = df_ic[col].dt.strftime('%d.%m.%Y')
    except ValueError:
        # If conversion fails, just continue
        pass

In [196]:
#convert string types into %d.%m.%Y. IC frame
for col in df_fi.select_dtypes(include=['object']):
    try:
        # Convert to datetime, assuming the format is 'YYYYMMDD'
        df_fi[col] = pd.to_datetime(df_fi[col], format='%Y%m%d')
        # Optionally, you can convert it to DMY format
        df_fi[col] = df_fi[col].dt.strftime('%d.%m.%Y')
    except ValueError:
        # If conversion fails, just continue
        pass

In [197]:
df_ic.to_csv(r"C:\Users\sappidi.reddy\Documents\Dummy\ic_data_cleaned.csv", index=False)

In [198]:
df_fi.to_csv(r"C:\Users\sappidi.reddy\Documents\Dummy\fi_data_cleaned.csv", index=False)

In [199]:
df_fi['Amount_in_local_currency'] = df_fi['Amount_in_local_currency'] * -1

In [200]:
#merge FI and IC

merge_fi_ic = df_fi.merge(df_ic, left_on=[ "BELNR","XBLNR"], right_on=[
         "BELNR","RENR"], how="outer")


In [201]:
merge_fi_ic.to_csv(r"C:\Users\sappidi.reddy\Documents\Dummy\merge1_check.csv", index=False)

In [202]:
print(merge_fi_ic.shape)

(172, 51)


In [203]:
#check lengths and dtyes of merged frame
measurer = np.vectorize(len)
res = measurer(merge_fi_ic.values.astype(str)).max(axis=0)
res = dict(zip(merge_fi_ic, measurer(merge_fi_ic.values.astype(str)).max(axis=0)))
print(res)

{'MANDT': np.int64(5), 'Document_type': np.int64(3), 'document_type_desc': np.int64(20), 'GJAHR_x': np.int64(6), 'BUKRS': np.int64(4), 'GSBER': np.int64(3), 'PRCTR': np.int64(10), 'store_or_dc': np.int64(6), 'KOSTL': np.int64(3), 'month_in_fin_year': np.int64(4), 'BELNR': np.int64(10), 'XBLNR': np.int64(16), 'AUGBL': np.int64(12), 'AUGDT': np.int64(10), 'ZFBDT': np.int64(10), 'ZBD1T': np.int64(4), 'ZBD2T': np.int64(3), 'NETDT': np.int64(3), 'BUZEI': np.int64(3), 'altkt': np.int64(12), 'hkont': np.int64(12), 'suppl_no': np.int64(10), 'BLDAT': np.int64(10), 'BUDAT': np.int64(10), 'CPUDT': np.int64(10), 'partition_date': np.int64(10), 'dana_ingestion_date': np.int64(10), 'shkzg': np.int64(3), 'Amount_in_local_currency': np.int64(10), 'Amount_in_document_currency': np.int64(10), 'Tax_in_local_currency': np.int64(3), 'Tax_in_document_currency': np.int64(3), 'WAERS_x': np.int64(3), 'Batch_Input_session_name': np.int64(12), 'sgtxt': np.int64(25), 'LIFNR': np.int64(10), 'RENR': np.int64(16), '

In [204]:
### sftp with merged, ic and fi data
merged_df = merge_fi_ic.merge(
        sftp_df, left_on=["BELNR", "XBLNR"],
        right_on=["Document number", "Invoice number"],
        how="outer")

In [205]:
merged_df[['BLDAT', 'RGDAT']]

Unnamed: 0,BLDAT,RGDAT
0,,07.09.2023
1,,07.09.2023
2,,07.09.2023
3,,07.09.2023
4,,07.09.2023
5,,06.06.2024
6,,06.06.2024
7,,06.06.2024
8,,13.09.2023
9,,13.09.2023


In [206]:
df_fi['BLDAT']

0      02.10.2023
1      02.10.2023
2      02.10.2023
3      02.10.2023
4      02.10.2023
5      03.11.2023
6      31.10.2023
7      31.10.2023
8      30.11.2023
9      30.11.2023
10     31.12.2023
11     31.01.2024
12     29.02.2024
13     29.02.2024
14     31.03.2024
15     31.03.2024
16     31.03.2024
17     31.03.2024
18     30.04.2024
19     31.05.2024
20     30.06.2024
21     12.07.2024
22     31.07.2024
23     31.07.2024
24     15.08.2024
          ...    
107    22.08.2024
108    29.08.2024
109    05.09.2024
110    09.09.2024
111    20.09.2024
112    03.11.2023
113    03.11.2023
114    03.11.2023
115    03.11.2023
116    04.11.2023
117    04.11.2023
118    15.11.2023
119    23.11.2023
120    18.01.2024
121    19.01.2024
122    29.01.2024
123    18.03.2024
124    18.03.2024
125    09.05.2024
126    10.05.2024
127    18.07.2024
128    18.07.2024
129    31.07.2024
130    31.07.2024
131    15.07.2024
Name: BLDAT, Length: 132, dtype: object

In [207]:
merged_df.to_csv(r"C:\Users\sappidi.reddy\Documents\Dummy\merge2_check_sftp.csv", index=False)

In [208]:
merged_df.shape

(172, 69)

In [209]:
# Split the ARKTX column by '#'
split_columns = merged_df['ARKTX'].str.split('#', expand=True)

# Extract Business Year and Line Item No., ensuring they stay as integers if ARKTX is not blank
# merged_df['Business Year SFTP'] = split_columns[3].apply(lambda x: int(x) if pd.notnull(x) and x.isdigit() else pd.NA).astype('Int64')
# merged_df['Line Item No. SFTP'] = split_columns[4].apply(lambda x: int(x) if pd.notnull(x) and x.isdigit() else pd.NA).astype('Int64')

merged_df['Business Year SFTP'] = split_columns[3].apply(
    lambda x: str(int(x)) if pd.notnull(x) and x.isdigit() else pd.NA
).astype('string')

merged_df['Line Item No. SFTP'] = split_columns[4].apply(
    lambda x: str(int(x)) if pd.notnull(x) and x.isdigit() else pd.NA
).astype('string')




In [210]:
merged_df.shape

(172, 71)

In [211]:
merged_df['Doc no. combined'] = merged_df['BELNR'].fillna(merged_df['Document number'])

In [212]:
merged_df.shape

(172, 72)

## populate ARKTX

In [214]:
# Assuming merged_df is your DataFrame
merged_df["year"] = merged_df.BLDAT.apply(lambda x: x.split('.')[2] if isinstance(x, str) and len(x.split('.')) > 2 else None)
merged_df["month"] = merged_df.BLDAT.apply(lambda x: x.split('.')[1] if isinstance(x, str) and len(x.split('.')) > 1 else None)
merged_df["day"] = merged_df.BLDAT.apply(lambda x: x.split('.')[0] if isinstance(x, str) and len(x.split('.')) > 0 else None)

mask_empty_ARKTX = merged_df["ARKTX"].isna()

# Apply the logic only on those rows
merged_df.loc[mask_empty_ARKTX, "ARKTX"] = merged_df[mask_empty_ARKTX].apply(
    lambda row: (
        f"{row['Document_type']}#{row['BELNR']}#{row['year']}{row['month']}{row['day']}#{row['year']}#00{row['BUZEI']}"
        if all(pd.notna([row['Document_type'], row['BELNR'], row['year'], row['month'], row['day'], row['BUZEI']]))
        else np.nan
    ),
    axis=1
)

## net due calculations

In [215]:
merged_df['ZFBDT'] = pd.to_datetime(merged_df['ZFBDT'], format='%d.%m.%Y')

merged_df['ZBD1T'] = pd.to_numeric(merged_df['ZBD1T'], errors='coerce')
merged_df['ZBD2T'] = pd.to_numeric(merged_df['ZBD2T'], errors='coerce')

merged_df['NET_DUE_DATE'] = merged_df.apply(
    lambda row: row['ZFBDT'] + pd.Timedelta(days=row['ZBD1T'] if pd.notna(row['ZBD1T']) else (row['ZBD2T'] if pd.notna(row['ZBD2T']) else 0)),
    axis=1
)
merged_df['NET_DUE_DATE'] = merged_df['NET_DUE_DATE'].dt.strftime('%d.%m.%Y')

## Invoice status assignment

In [216]:
ic_tran_status_df = pd.read_csv(r"C:\Users\sappidi.reddy\Downloads\Sample MIAG 3\ic_transaction_status_R.csv", encoding='windows-1252', dtype={'TRANSACTION STATUS (ABGST)': 'str', 'LBL - \nTRANSACTION STATUS (ABGST)':'str', 'VIPA \nTRANSACTION STATUS (ABGST)':'str',"360 invoice status proposal (25.4.)":'str'})
ic_tran_status_df = ic_tran_status_df[["TRANSACTION STATUS (ABGST)", "LBL - \nTRANSACTION STATUS (ABGST)", "VIPA \nTRANSACTION STATUS (ABGST)", "360 invoice status proposal (25.4.)"]]
ic_tran_status_df["ABGST"] = ic_tran_status_df["TRANSACTION STATUS (ABGST)"].fillna(ic_tran_status_df['LBL - \nTRANSACTION STATUS (ABGST)']).fillna(ic_tran_status_df["VIPA \nTRANSACTION STATUS (ABGST)"])
ic_tran_status_df = ic_tran_status_df[["ABGST", "360 invoice status proposal (25.4.)"]]
abgst_status_dict = ic_tran_status_df.set_index('ABGST')['360 invoice status proposal (25.4.)'].to_dict()
len(abgst_status_dict)

197

In [217]:
# Step 1: Populate 'Inv Stat' with "cleared-MIAG" for rows where 'RAN' is present
merged_df.loc[merged_df['Remittance advice number'].notna(), 'INVOICE_STATUS'] = "cleared-MIAG"

# Step 2: Populate 'Inv Stat' with "cleared-FI" for rows where 'Inv Stat' is null and 'AUGBL' is present
merged_df.loc[merged_df['INVOICE_STATUS'].isna() & merged_df['AUGBL'].notna(), 'INVOICE_STATUS'] = "cleared-FI"

# Step 3: Populate 'Inv Stat' with "Invoice Approval completed" for rows where 'Inv Stat' is null and 'BLDAT' is present
merged_df.loc[merged_df['INVOICE_STATUS'].isna() & merged_df['BLDAT'].notna(), 'INVOICE_STATUS'] = "Invoice approval completed"

# Step 4: Populate 'Inv Stat' based on mapping from abgst_status_dict for rows where 'Inv Stat' is null and 'ABGST' is present
merged_df.loc[merged_df['INVOICE_STATUS'].isna() & merged_df['ABGST'].notna(), 'INVOICE_STATUS'] = merged_df['ABGST'].map(abgst_status_dict)

# Step 5: Populate remaining 'Inv Stat' as "In progress" where 'Inv Stat' is still null
merged_df['INVOICE_STATUS'].fillna("In progress", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['INVOICE_STATUS'].fillna("In progress", inplace=True)


In [218]:
merged_df.shape

(172, 77)

## GR Invoice Implementation

In [219]:
# Separate rows with Doc no. combined having all 0's
df_all_zeros = merged_df[merged_df['Doc no. combined'] == '0000000000']

# Exclude rows with Doc no. combined having all 0's for further grouping
df_non_zero = merged_df[merged_df['Doc no. combined'] != '0000000000']

# Group by Doc no. combined
grouped = df_non_zero.groupby('Doc no. combined')

# Filter groups based on their lengths
df_group_len_2 = grouped.filter(lambda x: len(x) == 2)
df_group_len_1 = grouped.filter(lambda x: len(x) == 1)
df_group_len_3 = grouped.filter(lambda x: len(x) > 2)

# Now we have the three dataframes
# df_all_zeros: rows with Doc no. combined having all 0's
# df_group_len_1: groups with length 1
# df_group_len_2: groups with length 2
# df_group_len_3: groups with greater than length 2


In [220]:
print("All zeros : ", len(df_all_zeros))
print("Length 1 : ", len(df_group_len_1))
print("Length 2 : ", len(df_group_len_2))
print("Length 3 : ", len(df_group_len_3))

All zeros :  19
Length 1 :  143
Length 2 :  10
Length 3 :  0


In [222]:
df_group_len_2.to_csv(r'C:\Users\sappidi.reddy\Documents\Dummy\gr_invs_len_2.csv', index=False)

In [223]:
df_group_len_3.to_csv(r'C:\Users\sappidi.reddy\Documents\Dummy\gr_invs_len_3.csv', index=False)

In [224]:
# Group the dataframe by 'Doc no. combined'
grouped = df_group_len_2.groupby('Doc no. combined')

# Initialize empty dataframes
both_cleared_or_one_progress = pd.DataFrame()
gr_invoice_records = pd.DataFrame()
not_cleared_miag_records = pd.DataFrame()

# Iterate over each group
for doc_no, group in grouped:
    statuses = group['INVOICE_STATUS'].tolist()
    
    # Check conditions for both_cleared_or_one_progress
    if statuses == ['cleared-MIAG', 'cleared-MIAG'] or \
       ('cleared-MIAG' in statuses and 'In progress' in statuses):
        both_cleared_or_one_progress = pd.concat([both_cleared_or_one_progress, group])
    # Check conditions for gr_invoice_records
    elif 'cleared-MIAG' in statuses and any(status in ['cleared-FI', 'Invoice approval completed'] for status in statuses):
        gr_invoice_records = pd.concat([gr_invoice_records, group])
    elif all(status != 'cleared-MIAG' for status in statuses):
        not_cleared_miag_records = pd.concat([not_cleared_miag_records, group])

# Reset index for the new DataFrames if needed
both_cleared_or_one_progress.reset_index(drop=True, inplace=True)
gr_invoice_records.reset_index(drop=True, inplace=True)
not_cleared_miag_records.reset_index(drop=True, inplace=True)

# both_cleared_or_one_progress and gr_invoice_records now contain the desired groups


In [225]:
print("Length 2 : ", len(df_group_len_2))
print("Both cleared or one progress : ", len(both_cleared_or_one_progress))
print("GR Invoice Records : ", len(gr_invoice_records))
print("not_cleared_miag_records ", len(not_cleared_miag_records))

Length 2 :  10
Both cleared or one progress :  4
GR Invoice Records :  0
not_cleared_miag_records  6


In [226]:
both_cleared_or_one_progress

Unnamed: 0,MANDT,Document_type,document_type_desc,GJAHR_x,BUKRS,GSBER,PRCTR,store_or_dc,KOSTL,month_in_fin_year,BELNR,XBLNR,AUGBL,AUGDT,ZFBDT,ZBD1T,ZBD2T,NETDT,BUZEI,altkt,hkont,suppl_no,BLDAT,BUDAT,CPUDT,...,Supplier number (Sales Line),Supplier number (MIAG),Supplier name,VAT number,Document number,Invoice number,Document type,Document date,Remittance advice number,Value date,Currency,Gross amount,Description,Contract indicator,Store,Company code,ARKTX,Business Year SFTP,Line Item No. SFTP,Doc no. combined,year,month,day,NET_DUE_DATE,INVOICE_STATUS
0,100.0,WE,Goods receiving (inv,2024.0,3142.0,,31426091.0,6091.0,,8.0,810290849,AIF2024000000533,,,2024-05-17,70.0,,,1.0,2521100000.0,1332110000.0,1000010074.0,17.05.2024,17.05.2024,28.05.2024,...,1000010074.0,3500000073.0,DİVERSEY KİMYA SAN VE TİC A.Ş.,2950016000.0,810290849.0,AIF2024000000533,WE,17.05.2024,3517071238.0,08.08.2024,TRY,358313.28,cleared,MIAG contract,6091.0,3142.0,WE#0810290849#20240517#2024#001,2024.0,1.0,810290849,2024.0,5.0,17.0,26.07.2024,cleared-MIAG
1,,,,,,,,,,,810290849,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810290849,,,,,In progress
2,100.0,WE,Goods receiving (inv,2024.0,3142.0,,31426091.0,6091.0,,9.0,810300358,AIB2024000010311,,,2024-05-28,70.0,,,1.0,2521100000.0,1332110000.0,1000010074.0,28.05.2024,01.06.2024,04.06.2024,...,1000010074.0,3500000073.0,DİVERSEY KİMYA SAN VE TİC A.Ş.,2950016000.0,810300358.0,AIB2024000010311,WE,28.05.2024,3517071238.0,08.08.2024,TRY,508680.12,cleared,MIAG contract,6091.0,3142.0,WE#0810300358#20240528#2024#001,2024.0,1.0,810300358,2024.0,5.0,28.0,06.08.2024,cleared-MIAG
3,,,,,,,,,,,810300358,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810300358,,,,,In progress


In [232]:
df_group_len_2

Unnamed: 0,MANDT,Document_type,document_type_desc,GJAHR_x,BUKRS,GSBER,PRCTR,store_or_dc,KOSTL,month_in_fin_year,BELNR,XBLNR,AUGBL,AUGDT,ZFBDT,ZBD1T,ZBD2T,NETDT,BUZEI,altkt,hkont,suppl_no,BLDAT,BUDAT,CPUDT,...,Supplier number (Sales Line),Supplier number (MIAG),Supplier name,VAT number,Document number,Invoice number,Document type,Document date,Remittance advice number,Value date,Currency,Gross amount,Description,Contract indicator,Store,Company code,ARKTX,Business Year SFTP,Line Item No. SFTP,Doc no. combined,year,month,day,NET_DUE_DATE,INVOICE_STATUS
55,,,,,,,,,,,810046023,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810046023,,,,,In progress
56,,,,,,,,,,,810046023,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810046023,,,,,Invoice approval completed
58,,,,,,,,,,,810060014,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810060014,,,,,In progress
59,,,,,,,,,,,810060014,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810060014,,,,,Invoice approval completed
96,100.0,WE,Goods receiving (inv,2024.0,3142.0,,31426091.0,6091.0,,8.0,810290849,AIF2024000000533,,,2024-05-17,70.0,,,1.0,2521100000.0,1332110000.0,1000010074.0,17.05.2024,17.05.2024,28.05.2024,...,1000010074.0,3500000073.0,DİVERSEY KİMYA SAN VE TİC A.Ş.,2950016000.0,810290849.0,AIF2024000000533,WE,17.05.2024,3517071238.0,08.08.2024,TRY,358313.28,cleared,MIAG contract,6091.0,3142.0,WE#0810290849#20240517#2024#001,2024.0,1.0,810290849,2024.0,5.0,17.0,26.07.2024,cleared-MIAG
97,,,,,,,,,,,810290849,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810290849,,,,,In progress
98,100.0,WE,Goods receiving (inv,2024.0,3142.0,,31426091.0,6091.0,,9.0,810300358,AIB2024000010311,,,2024-05-28,70.0,,,1.0,2521100000.0,1332110000.0,1000010074.0,28.05.2024,01.06.2024,04.06.2024,...,1000010074.0,3500000073.0,DİVERSEY KİMYA SAN VE TİC A.Ş.,2950016000.0,810300358.0,AIB2024000010311,WE,28.05.2024,3517071238.0,08.08.2024,TRY,508680.12,cleared,MIAG contract,6091.0,3142.0,WE#0810300358#20240528#2024#001,2024.0,1.0,810300358,2024.0,5.0,28.0,06.08.2024,cleared-MIAG
99,,,,,,,,,,,810300358,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810300358,,,,,In progress
111,100.0,WE,Goods receiving (inv,2024.0,3142.0,,31426091.0,6091.0,,11.0,810371920,AIA2024000008852,,,2024-07-31,70.0,,,1.0,2521100000.0,1332110000.0,1000010074.0,31.07.2024,01.08.2024,06.08.2024,...,,,,,,,,,,,,,,,,,WE#0810371920#20240731#2024#001.0,,,810371920,2024.0,7.0,31.0,09.10.2024,Invoice approval completed
112,,,,,,,,,,,810371920,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810371920,,,,,In progress


In [233]:
not_cleared_miag_records

Unnamed: 0,MANDT,Document_type,document_type_desc,GJAHR_x,BUKRS,GSBER,PRCTR,store_or_dc,KOSTL,month_in_fin_year,BELNR,XBLNR,AUGBL,AUGDT,ZFBDT,ZBD1T,ZBD2T,NETDT,BUZEI,altkt,hkont,suppl_no,BLDAT,BUDAT,CPUDT,...,Supplier number (Sales Line),Supplier number (MIAG),Supplier name,VAT number,Document number,Invoice number,Document type,Document date,Remittance advice number,Value date,Currency,Gross amount,Description,Contract indicator,Store,Company code,ARKTX,Business Year SFTP,Line Item No. SFTP,Doc no. combined,year,month,day,NET_DUE_DATE,INVOICE_STATUS
0,,,,,,,,,,,810046023,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810046023,,,,,In progress
1,,,,,,,,,,,810046023,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810046023,,,,,Invoice approval completed
2,,,,,,,,,,,810060014,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810060014,,,,,In progress
3,,,,,,,,,,,810060014,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810060014,,,,,Invoice approval completed
4,100.0,WE,Goods receiving (inv,2024.0,3142.0,,31426091.0,6091.0,,11.0,810371920,AIA2024000008852,,,2024-07-31,70.0,,,1.0,2521100000.0,1332110000.0,1000010074.0,31.07.2024,01.08.2024,06.08.2024,...,,,,,,,,,,,,,,,,,WE#0810371920#20240731#2024#001.0,,,810371920,2024.0,7.0,31.0,09.10.2024,Invoice approval completed
5,,,,,,,,,,,810371920,,,,NaT,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,810371920,,,,,In progress


In [238]:
if(len(gr_invoice_records)==0):
    gr_invoice_records = pd.DataFrame(columns=df_group_len_2.columns)

In [239]:
# Group by 'Doc no. combined'
grouped = gr_invoice_records.groupby('Doc no. combined')

# Initialize lists to store the rows based on INVOICE_STATUS
cleared_records = []
other_records = []

# Iterate through each group
for name, group in grouped:
    # Separate rows based on 'INVOICE_STATUS' value
    cleared_record = group[group['INVOICE_STATUS'] == 'cleared-MIAG']
    other_record = group[group['INVOICE_STATUS'] != 'cleared-MIAG']
    
    # Append to the lists if the record exists
    if not cleared_record.empty:
        cleared_records.append(cleared_record.iloc[0])  # Assuming there's only one 'cleared-MIAG' record per group
    if not other_record.empty:
        other_records.append(other_record.iloc[0])      # Assuming there's only one non-'cleared-MIAG' record per group

# Convert lists to DataFrames if needed
cleared_df = pd.DataFrame(cleared_records)
other_df = pd.DataFrame(other_records)


In [241]:
print("GR Invoice Records : ", len(gr_invoice_records))
print("Cleared df : ", len(cleared_df))
print("Non Cleared df : ", len(other_df))

GR Invoice Records :  0
Cleared df :  0
Non Cleared df :  0


In [243]:
if(len(cleared_df)==0):
    cleared_df = pd.DataFrame(columns=gr_invoice_records.columns)

In [244]:
if(len(other_df)==0):
    other_df = pd.DataFrame(columns=gr_invoice_records.columns)

## Validation starts

In [245]:
cleared_Df_sorted = cleared_df.sort_values('Doc no. combined').reset_index(drop=True)
other_df_sorted = other_df.sort_values('Doc no. combined').reset_index(drop=True)

# Combine the relevant columns for side-by-side display
combined = pd.DataFrame({
    'Doc no. combined': cleared_Df_sorted['Doc no. combined'],  # Assuming Doc no. matches
    'Diff amount': cleared_Df_sorted['Gross amount'] - other_df_sorted['Amount_in_local_currency'],
})

In [246]:
len(combined[abs(combined['Diff amount'])>0.0001])

0

## Validation ends

In [247]:
cleared_df

Unnamed: 0,MANDT,Document_type,document_type_desc,GJAHR_x,BUKRS,GSBER,PRCTR,store_or_dc,KOSTL,month_in_fin_year,BELNR,XBLNR,AUGBL,AUGDT,ZFBDT,ZBD1T,ZBD2T,NETDT,BUZEI,altkt,hkont,suppl_no,BLDAT,BUDAT,CPUDT,...,Supplier number (Sales Line),Supplier number (MIAG),Supplier name,VAT number,Document number,Invoice number,Document type,Document date,Remittance advice number,Value date,Currency,Gross amount,Description,Contract indicator,Store,Company code,ARKTX,Business Year SFTP,Line Item No. SFTP,Doc no. combined,year,month,day,NET_DUE_DATE,INVOICE_STATUS


In [248]:
other_df

Unnamed: 0,MANDT,Document_type,document_type_desc,GJAHR_x,BUKRS,GSBER,PRCTR,store_or_dc,KOSTL,month_in_fin_year,BELNR,XBLNR,AUGBL,AUGDT,ZFBDT,ZBD1T,ZBD2T,NETDT,BUZEI,altkt,hkont,suppl_no,BLDAT,BUDAT,CPUDT,...,Supplier number (Sales Line),Supplier number (MIAG),Supplier name,VAT number,Document number,Invoice number,Document type,Document date,Remittance advice number,Value date,Currency,Gross amount,Description,Contract indicator,Store,Company code,ARKTX,Business Year SFTP,Line Item No. SFTP,Doc no. combined,year,month,day,NET_DUE_DATE,INVOICE_STATUS


In [249]:
# Ensure correct data types in both dataframes for comparison
other_df['BUZEI'] = other_df['BUZEI'].fillna(0).astype('string')
other_df['GJAHR_x'] = other_df['GJAHR_x'].astype('string')
other_df['Document_type'] = other_df['Document_type'].astype(str)
cleared_df['Document type'] = cleared_df['Document type'].astype(str)
cleared_df['Gross amount'] = cleared_df['Gross amount'].astype(float)
other_df['Amount_in_local_currency'] = other_df['Amount_in_local_currency'].astype(float)

# List to store Doc no. combined values that fail validation
failed_doc_nos = []

# Loop through each row in cleared_df and perform validations
for index, row1 in cleared_df.iterrows():
    # Get the corresponding row in other_df based on 'Doc no. combined'
    row2 = other_df[other_df['Doc no. combined'] == row1['Doc no. combined']]

    # Proceed only if there is exactly one matching row in other_df
    if len(row2) == 1:
        row2 = row2.iloc[0]

        # Perform validation checks
        if (
            row1['Line Item No. SFTP'] == row2['BUZEI'] and
            row1['Document type'] == row2['Document_type'] and
            row1['Business Year SFTP'] == row2['GJAHR_x'] and
            row1['Gross amount'] == row2['Amount_in_local_currency']
        ):
            cleared_df.at[index, 'NET_DUE_DATE'] = row2['NET_DUE_DATE']
            
            # Check if 'Store' column in row1 is not empty
            if pd.notnull(row1['Store']) and row1['Store'] != '':
                # Directly remove row2 from other_df
                other_df = other_df.drop(row2.name)
            else:
                # Copy 'store_or_dc' value from row2 to 'Store' column in row1
                cleared_df.at[index, 'Store'] = row2['store_or_dc']
                # Remove row2 from other_df
                other_df = other_df.drop(row2.name)
        else:
            # If validation fails, add the 'Doc no. combined' to the failed list
            failed_doc_nos.append(row1['Doc no. combined'])
    # else:
    #     # If no match or multiple matches, add to failed list
    #     failed_doc_nos.append(row1['Doc no. combined'])

# Output the updated cleared_df, other_df, and the failed list
cleared_df.reset_index(drop=True, inplace=True)
other_df.reset_index(drop=True, inplace=True)

In [250]:
cleared_df['Line Item No. SFTP'].isna().sum()

np.int64(0)

In [251]:
print("Cleared df : ", len(cleared_df))
print("Non Cleared df : ", len(other_df))
print("Failed docs : ", len(failed_doc_nos))

Cleared df :  0
Non Cleared df :  0
Failed docs :  0


In [252]:
failed_doc_nos

[]

In [254]:
import pandas as pd

# Split the DataFrame into two: len_3_cleared_df and len_3_other_df
len_3_cleared_df = df_group_len_3[df_group_len_3['INVOICE_STATUS'] == 'cleared-MIAG'].copy()
len_3_other_df = df_group_len_3[df_group_len_3['INVOICE_STATUS'] != 'cleared-MIAG'].copy()

# Convert data types as specified
len_3_other_df['BUZEI'] = len_3_other_df['BUZEI'].astype('string')
len_3_other_df['GJAHR_x'] = len_3_other_df['GJAHR_x'].astype('string')
len_3_other_df['Document_type'] = len_3_other_df['Document_type'].astype(str)
len_3_cleared_df['Document type'] = len_3_cleared_df['Document type'].astype(str)
len_3_cleared_df['Gross amount'] = len_3_cleared_df['Gross amount'].astype(float)
len_3_other_df['Amount_in_local_currency'] = len_3_other_df['Amount_in_local_currency'].astype(float)

# Iterate over rows in len_3_other_df
rows_to_delete = []
for index_other, row2 in len_3_other_df.iterrows():
    # Find matching row in len_3_cleared_df
    match = len_3_cleared_df[
        (len_3_cleared_df['Line Item No. SFTP'] == row2['BUZEI']) &
        (len_3_cleared_df['Document type'] == row2['Document_type']) &
        (len_3_cleared_df['Business Year SFTP'] == row2['GJAHR_x']) &
        (len_3_cleared_df['Gross amount'] == row2['Amount_in_local_currency']) &
        (len_3_cleared_df['Doc no. combined'] == row2['Doc no. combined'])
    ]
    
    if not match.empty:
        # Take the first matched row (assuming only one match is expected)
        row1 = match.iloc[0]

        len_3_cleared_df.loc[match.index[0], 'NET_DUE_DATE'] = row2['NET_DUE_DATE']
        
        # Check the Store column in row1
        if pd.isna(row1['Store']):
            # Check the store_or_dc column in row2
            if not pd.isna(row2.get('store_or_dc')):
                len_3_cleared_df.loc[match.index[0], 'Store'] = row2['store_or_dc']
        
        # Mark the row2 for deletion
        rows_to_delete.append(index_other)

# Delete rows from len_3_other_df that were processed
len_3_other_df.drop(index=rows_to_delete, inplace=True)

# Resulting DataFrames: len_3_cleared_df and len_3_other_df


In [255]:
print("Length 3 : ", len(df_group_len_3))
print("len_3_cleared_df : ", len(len_3_cleared_df))
print("len_3_other_df : ", len(len_3_other_df))

Length 3 :  0
len_3_cleared_df :  0
len_3_other_df :  0


In [256]:
merged_df = pd.concat([len_3_cleared_df, len_3_other_df, cleared_df, other_df, not_cleared_miag_records, both_cleared_or_one_progress, df_group_len_1, df_all_zeros], ignore_index=True)

# Optionally, reset the index if you want a clean index
merged_df.reset_index(drop=True, inplace=True)

  merged_df = pd.concat([len_3_cleared_df, len_3_other_df, cleared_df, other_df, not_cleared_miag_records, both_cleared_or_one_progress, df_group_len_1, df_all_zeros], ignore_index=True)
  merged_df = pd.concat([len_3_cleared_df, len_3_other_df, cleared_df, other_df, not_cleared_miag_records, both_cleared_or_one_progress, df_group_len_1, df_all_zeros], ignore_index=True)


In [257]:
len(merged_df)

172

In [258]:
merged_df.shape

(172, 77)

## creating the final file

In [259]:
loadfile_df = pd.DataFrame([])

In [260]:
doc_to_type = dict(zip(merged_df['Supplier number (Sales Line)'],
                           merged_df['Supplier number (MIAG)']))
doc_to_type

{nan: nan, '1000010074': '3500000073', '1000010095': '3500000094'}

In [261]:
len(doc_to_type)

3

# COMPANY CODE

In [262]:
company_code = 3142
loadfile_df['COMPANY_CODE'] = company_code

In [263]:
loadfile_df[['COMPANY_CODE']].isna().sum()

COMPANY_CODE    0
dtype: int64

# Supplier number (Sales Line)

In [264]:
merged_df['Supplier number (Sales Line)'].isna().sum()

np.int64(146)

In [265]:
loadfile_df['SUPPLIER_NO'] = merged_df['Supplier number (Sales Line)'].fillna(
    merged_df['suppl_no']
).fillna(
    merged_df['LIFNR']
)

In [266]:
loadfile_df['SUPPLIER_NO'].isna().sum()

np.int64(0)

# MIAG_SUPPLIER_NO

In [267]:
loadfile_df['MIAG_SUPPLIER_NO'] = merged_df['Supplier number (MIAG)']

In [268]:
loadfile_df['MIAG_SUPPLIER_NO'].isna().sum()

np.int64(146)

In [269]:
loadfile_df['MIAG_SUPPLIER_NO'] = loadfile_df['MIAG_SUPPLIER_NO'].fillna(loadfile_df['SUPPLIER_NO'].map(doc_to_type))

In [270]:
loadfile_df['MIAG_SUPPLIER_NO'].isna().sum()


np.int64(0)

# ORDER_NO

In [271]:
loadfile_df['ORDER_NO'] = merged_df['AUFNR']

In [272]:
loadfile_df['ORDER_NO'].isna().sum()

np.int64(65)

# DOC Type

In [273]:
loadfile_df['DOC_TYPE'] = merged_df['Document type'].where(merged_df['Document type'].notna(), merged_df['Document_type'])

In [274]:
loadfile_df['DOC_TYPE'].isna().sum()

np.int64(40)

# Invoice number

In [275]:
loadfile_df['INVOICE_NO'] = merged_df['Invoice number'].where(merged_df['Invoice number'].notna(), merged_df['XBLNR'])

In [276]:
loadfile_df['INVOICE_NO'].isna().sum()

np.int64(40)

In [277]:
loadfile_df['INVOICE_NO'] = merged_df['Invoice number'].fillna(
    merged_df['XBLNR']
).fillna(
    merged_df['RENR']
)

In [278]:
loadfile_df['INVOICE_NO'].isna().sum()

np.int64(0)

# INVOICE_DATE

In [279]:
loadfile_df['INVOICE_DATE']	=  merged_df['Document date'].fillna(merged_df['REDAT'])

# DELIVERY_NOTE_NO

In [280]:
loadfile_df['DELIVERY_NOTE_NO'] = merged_df['LFSNR']

# TOTAL_AMT_DC

In [281]:
loadfile_df['TOTAL_AMT_DC'] = merged_df['Gross amount'].fillna(
    merged_df['GEBRF']
).fillna(
    merged_df['Amount_in_local_currency']
)

In [282]:
loadfile_df['TOTAL_AMT_DC'].isna().sum()

np.int64(0)

# TOTAL_VAT_DC

In [283]:
loadfile_df['TOTAL_VAT_DC']	= merged_df['GSMWF']

# CURRENCY

In [284]:
company_code = 3142
loadfile_df['COMPANY_CODE'] = company_code

In [285]:
loadfile_df['COMPANY_CODE'].isna().sum()

np.int64(0)

In [286]:
country_currency_dict = {3142: 'TRY'}

In [287]:
loadfile_df['CURRENCY'] = loadfile_df['COMPANY_CODE'].map(country_currency_dict)

In [288]:
loadfile_df['CURRENCY'].isna().sum()

np.int64(0)

# OTHER COLUMNS

In [289]:
condition = (loadfile_df['DOC_TYPE'] == 'MV')

loadfile_df['PRE_FINANCE_DATE'] = np.where(condition, merged_df['Value date'], '')

loadfile_df['GOODS_RECEIPT_NO']	= merged_df['WENUM']

loadfile_df['GOODS_RECEIPT_DATE'] = merged_df['WEDAT']

loadfile_df['INVOICE_ENTRY_DATE'] = merged_df['RGDAT'].where(merged_df['RGDAT'].notna(), merged_df['BLDAT'])

loadfile_df['INVOICE_STATUS'] = merged_df['INVOICE_STATUS']

loadfile_df['INVOICE_STATUS_INTERNAL'] = merged_df['ABGST']

loadfile_df['NET_DUE_DATE'] = merged_df['NET_DUE_DATE']

loadfile_df['DEBIT_NOTE_NO'] = merged_df['DEBNOTNO']

loadfile_df['REMITTANCE_ADVICE_NO'] = np.where(
    merged_df['INVOICE_STATUS'] == 'cleared-MIAG', 
    merged_df['Remittance advice number'], 
    '')

loadfile_df['CLEARING_DATE'] =  merged_df['Value date'].where(merged_df['Value date'].notna(), merged_df['AUGDT'])

loadfile_df['DOCUMENT_NO']	= merged_df['BELNR'].where(merged_df['BELNR'].notna(), merged_df['Document number'])

# store_no

In [290]:
loadfile_df['STORE_NO'] =  merged_df['Store'].fillna(merged_df['store_or_dc'])

In [291]:
loadfile_df['STORE_NO'].isna().sum()

np.int64(44)

# ARKTX

In [292]:
loadfile_df['ARKTX'] =  merged_df['ARKTX']

# Remaining columns

In [293]:
current_date = datetime.date.today()
formatted_current_date = current_date.strftime("%d.%m.%Y")
loadfile_df['MATCHING_DATE'] = formatted_current_date
loadfile_df['MATCH_STATUS'] = 'No Matching Requested'
loadfile_df['SYNC_DATE']	= formatted_current_date
loadfile_df['SYNC_STATUS'] = '1'
loadfile_df= loadfile_df.fillna('')

In [294]:
loadfile_df['INVOICE_NO'].replace('nan', '', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  loadfile_df['INVOICE_NO'].replace('nan', '', inplace=True)


In [295]:
loadfile_df

Unnamed: 0,COMPANY_CODE,SUPPLIER_NO,MIAG_SUPPLIER_NO,ORDER_NO,DOC_TYPE,INVOICE_NO,INVOICE_DATE,DELIVERY_NOTE_NO,TOTAL_AMT_DC,TOTAL_VAT_DC,CURRENCY,PRE_FINANCE_DATE,GOODS_RECEIPT_NO,GOODS_RECEIPT_DATE,INVOICE_ENTRY_DATE,INVOICE_STATUS,INVOICE_STATUS_INTERNAL,NET_DUE_DATE,DEBIT_NOTE_NO,REMITTANCE_ADVICE_NO,CLEARING_DATE,DOCUMENT_NO,STORE_NO,ARKTX,MATCHING_DATE,MATCH_STATUS,SYNC_DATE,SYNC_STATUS
0,3142,1000010074,3500000073,999999999,,AIB2024000007238,27.03.2024,MASRAF,10706.64,1784.44,TRY,,0,24.10.2023,29.03.2024,In progress,771,,24148003,,,0810046023,,,19.11.2024,No Matching Requested,19.11.2024,1
1,3142,1000010074,3500000073,110394286,,AID2024000008147,25.10.2024,GAD2024000031043,239558.12,39926.35,TRY,,0,,31.10.2024,Invoice approval completed,208,,,,,0810046023,,,19.11.2024,No Matching Requested,19.11.2024,1
2,3142,1000010074,3500000073,888888888,,AIA2023000009998,07.09.2023,COST_INVOICE,7944.32,588.47,TRY,,0,17.01.2022,07.09.2023,In progress,771,,,,,0810060014,,,19.11.2024,No Matching Requested,19.11.2024,1
3,3142,1000010074,3500000073,110464987,,AIC2024000011044,05.11.2024,GAD2024000032006,62149.20,10358.20,TRY,,0,,13.11.2024,Invoice approval completed,1203,,,,,0810060014,,,19.11.2024,No Matching Requested,19.11.2024,1
4,3142,1000010074,3500000073,45343957,WE,AIA2024000008852,31.07.2024,21319,175418.88,29236.48,TRY,,374170,31.07.2024,05.08.2024,Invoice approval completed,258,09.10.2024,,,,0810371920,6091.0,WE#0810371920#20240731#2024#001.0,19.11.2024,No Matching Requested,19.11.2024,1
5,3142,1000010074,3500000073,888888888,,AIA2023000009997,07.09.2023,COST_INVOICE,11453.30,848.39,TRY,,0,10.02.2022,07.09.2023,In progress,771,,,,,0810371920,,,19.11.2024,No Matching Requested,19.11.2024,1
6,3142,1000010074,3500000073,45333498,WE,AIF2024000000533,17.05.2024,14582,358313.28,59718.88,TRY,,316899,17.05.2024,27.05.2024,cleared-MIAG,258,26.07.2024,,03517071238,08.08.2024,0810290849,6091,WE#0810290849#20240517#2024#001,19.11.2024,No Matching Requested,19.11.2024,1
7,3142,1000010074,3500000073,45296886,,AIB2023000019333,05.09.2023,26474,106864.80,17810.80,TRY,,249942,,06.09.2023,In progress,52,,,,,0810290849,,,19.11.2024,No Matching Requested,19.11.2024,1
8,3142,1000010074,3500000073,45335609,WE,AIB2024000010311,28.05.2024,15110,508680.12,84780.02,TRY,,319548,28.05.2024,03.06.2024,cleared-MIAG,258,06.08.2024,,03517071238,08.08.2024,0810300358,6091,WE#0810300358#20240528#2024#001,19.11.2024,No Matching Requested,19.11.2024,1
9,3142,1000010074,3500000073,999999999,,AIB2024000007239,27.03.2024,MASRAF,825.98,137.66,TRY,,0,26.01.2022,29.03.2024,In progress,771,,24148004,,,0810300358,,,19.11.2024,No Matching Requested,19.11.2024,1


In [296]:
loadfile_df['DOCUMENT_NO'] = loadfile_df['DOCUMENT_NO'].replace('0000000000', '')

In [297]:
len(loadfile_df[loadfile_df['DEBIT_NOTE_NO']!=''])

8

In [298]:
loadfile_df['DEBIT_NOTE_NO'] = loadfile_df['DEBIT_NOTE_NO'].apply(lambda x: str(x).strip() if str(x).strip() else '')


In [299]:
loadfile_df.dtypes

COMPANY_CODE                int64
SUPPLIER_NO                object
MIAG_SUPPLIER_NO           object
ORDER_NO                   object
DOC_TYPE                   object
INVOICE_NO                 object
INVOICE_DATE               object
DELIVERY_NOTE_NO           object
TOTAL_AMT_DC               object
TOTAL_VAT_DC               object
CURRENCY                   object
PRE_FINANCE_DATE           object
GOODS_RECEIPT_NO           object
GOODS_RECEIPT_DATE         object
INVOICE_ENTRY_DATE         object
INVOICE_STATUS             object
INVOICE_STATUS_INTERNAL    object
NET_DUE_DATE               object
DEBIT_NOTE_NO              object
REMITTANCE_ADVICE_NO       object
CLEARING_DATE              object
DOCUMENT_NO                object
STORE_NO                   object
ARKTX                      object
MATCHING_DATE              object
MATCH_STATUS               object
SYNC_DATE                  object
SYNC_STATUS                object
dtype: object

In [300]:
len(loadfile_df[loadfile_df['DEBIT_NOTE_NO']==''])

164

In [301]:
current_datetime = datetime.datetime.now().strftime("%Y%m%d")
        
loadfile_df.to_csv(
            r"C:\Users\sappidi.reddy\Documents\Dummy\load.360.35." + current_datetime + ".001_test_internal.csv", sep=',', index=False)

loadfile_df.to_csv(
            r"C:\Users\sappidi.reddy\Documents\Dummy\load.360.35." + current_datetime + ".001_internal.csv", sep=';', index=False)