### All copied from final_dataprep_v1.ipynb to testrun fully and merge the df on FK and PKs for first usecase

# Bewegungen.csv prep

In [24]:
# consolidated :::
import pandas as pd
from idna.idnadata import joining_types

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "WORNUM", "STATUS", "SRC_ITEM", "SRC_LOT", "SRC_QACODE", "DST_LOT",
    "DST_QACODE", "SRC_WA", "SRC_X", "SRC_Y", "SRC_Z", "DST_WA", "OUTNUM",
    "LISNUM", "SUMLIS", "TRNNUM", "CRTDAT", "TRNDAT", "USERID", #"LOADDAT" not needed (just the date when its imported into DWH)
]

dtypes = {
    "WORNUM": "int32",
    "STATUS": "int32",
    "MOVTYP": "category",
    "MOVKEY": "category",
    "SRC_ITEM": "category",
    "SRC_LOT": "string",
    "SRC_QACODE": "category",
    "DST_LOT": "string",
    "DST_QACODE": "category",
    "SRC_WA": "category",
    "SRC_X": "category",
    "SRC_Y": "category",
    "SRC_Z": "category",
    "DST_WA": "category",
    "CONQTY": "int32",
    "OUTNUM": "int32",
    "RELNUM": "int32",
    "LISNUM": "int32",
    "SUMLIS": "int32",
    "TRNNUM": "int32",
    "USERID": "category",
}

# Load the CSV with optimized settings, and only load necessary cols
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes, # use optimized, manually set data types
    #usecols=columns_to_use,  # Only load required columns
    parse_dates=["CRTDAT", "TRNDAT"],
    low_memory=False
)

# Clean SRC_LOT column (Column "Artikelcharge", this LOT usually is a 3 digit int. An Article can have multiple LOTs. I simplify by removing leading zeros and clean up the column from wrong manual usererrors.
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format
# keep value if: consists of digits, and its length is less than or equal to 3 -- otherwise replace with none

# Clean DST_LOT column
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format
# keep value if: consists of digits, and its length is less than or equal to 3 -- otherwise replace with none

# Clean STATUS column (10= Offen, 50= Bestätigt, 95= Storniert -- for my purposes I'm only interested in "Bestätigt" rows.
# Filter the DataFrame to keep only rows with status == 50 (abgeschlossen)
df = df_bewegungen[df_bewegungen['STATUS'] == 50]

# Remove rows where USER == 'LXONE'
df = df[df['USERID'] != 'LXONE']

df_bewegungen = df_bewegungen.astype({
    "SRC_LOT": "category",
    "DST_LOT": "category",
})

# Drop rows with missing values from the DataFrame - dropna (by default, without parameters) removes entire rows which have a NaN or null value
df_bewegungen.dropna(inplace=True)

# Verify the changes
print(f"Updated DataFrame shape: {df_bewegungen.shape}")
# :::

Updated DataFrame shape: (20094135, 19)


In [25]:
df_bewegungen.head()

Unnamed: 0,WORNUM,STATUS,SRC_ITEM,SRC_LOT,SRC_QACODE,DST_LOT,DST_QACODE,SRC_WA,SRC_X,SRC_Y,SRC_Z,DST_WA,OUTNUM,LISNUM,SUMLIS,TRNNUM,CRTDAT,TRNDAT,USERID
0,289678987,50,36529803,1,H,1,H,EG,1601,2,D15,WA,31313376,289679194,289679311,599324284,2024-04-25 01:04:53,2024-04-25 08:16:39,24769
1,289870561,50,44118619,1,H,1,H,EG,1780,2,E03,WA,31320349,289874022,289874463,599670338,2024-04-25 17:34:53,2024-04-25 19:29:47,178141
2,289833599,50,13091411,1,H,1,H,EG,2215,4,F13,WA,31324968,289835037,289835121,599596179,2024-04-25 15:24:37,2024-04-25 16:42:37,LAESSIG
3,289815259,50,9636829,0,H,0,H,EG,2401,2,C03,WA,31333117,289815703,289815805,599541258,2024-04-25 14:34:37,2024-04-25 14:56:15,GUERBUEZ
4,289839097,50,16359571,1,H,1,H,EG,2004,7,G05,WA,31346782,289840157,289725531,599576127,2024-04-25 15:39:56,2024-04-25 16:03:15,104044


# ? maybe remove _ check for NaN / missing values in DF

In [5]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_bewegungen.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_bewegungen[df_bewegungen.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [WORNUM, STATUS, SRC_ITEM, SRC_LOT, SRC_QACODE, DST_LOT, DST_QACODE, SRC_WA, SRC_X, SRC_Y, SRC_Z, DST_WA, OUTNUM, LISNUM, SUMLIS, TRNNUM, CRTDAT, TRNDAT, USERID, LOADDAT]
Index: []


# WA kopf.csv prep

In [19]:
#Consolidated:::
# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "OUTNUM", "DOCNUM", "STATUS", "PICCOD", "CUSNUM",
    "SHPTYP", "TOUR", "ORDDAT", "DLVDAT", "CRTDAT", "TRNDAT"
]

dtypes = {
    "OUTNUM": "int32",
    "DOCNUM": "category",
    "ORDNUM": "category", # dont think i need this, seems it achieves the same as DOCNUM
    "STATUS": "int32",
    "PICCOD": "category",
    "CUSNUM": "category", # kunde
    "SHPTYP": "category", # versandart
    "TOUR": "category",
}

df_wa_kopf = pd.read_csv(
    '../Data/wa_kopf.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    parse_dates=["ORDDAT", "DLVDAT", "CRTDAT", "TRNDAT"],
    low_memory=False
)
# Filter the DataFrame to keep only rows with status == 90 (abgeschlossene)
df_wa_kopf = df_wa_kopf[df_wa_kopf['STATUS'] == 90]

# Drop rows with missing values from the DataFrame - some early data and regression tests lead to PICCOD, SHPTYP and TOUR being empty (~86 rows)
df_wa_kopf.dropna(inplace=True) # inplace=True modifies DataFrame directly without having to create a new one

# Step 1: Merge main DataFrame with customers
# df_orders = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')

#:::

In [20]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_wa_kopf.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_wa_kopf[df_wa_kopf.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [OUTNUM, DOCNUM, STATUS, PICCOD, CUSNUM, ORDDAT, DLVDAT, SHPTYP, TOUR, CRTDAT, TRNDAT]
Index: []


In [4]:
df_wa_kopf.head()

Unnamed: 0,OUTNUM,DOCNUM,ORDNUM,STATUS,PICCOD,CUSNUM,ORDDAT,DLVDAT,SHPTYP,TOUR,CRTDAT,TRNDAT,USERID,TRNNUM,LOADDAT
0,30723507,33054472,llo229691223,90,BS17,100986,2024-03-20 11:38:02,2024-03-20,2,85,2024-03-20 11:39:08,2024-03-20 18:13:18,LXONE,546327413,2024-03-21 01:54:14.176700
1,30711706,33052260,llo229681872,90,BS17,94536,2024-03-20 05:53:37,2024-03-20,2,85,2024-03-20 05:54:13,2024-03-20 19:58:26,LXONE,546357022,2024-03-21 01:54:14.176700
2,30730649,33056533,llo229700285,90,BS17,90715,2024-03-20 15:22:37,2024-03-20,2,83,2024-03-20 15:22:47,2024-03-20 17:32:19,LXONE,546301836,2024-03-21 01:54:14.176700
3,30712103,33052611,llo229682417,90,BS15-S,165269,2024-03-20 05:55:07,2024-03-20,124,0,2024-03-20 05:56:00,2024-03-20 11:13:51,176551,546066479,2024-03-21 01:54:14.176700
4,30733401,33057203,llo229704047,90,BS17,2542,2024-03-20 16:52:31,2024-03-20,2,94,2024-03-20 16:52:38,2024-03-20 21:05:25,LXONE,546369019,2024-03-21 01:54:14.176700


# WA Positionen prep

In [15]:
### Consolidated

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "OUTNUM", "STATUS", "ITEM", "LOT",
    "ORDQTY", "CONQTY", "CRTDAT", "TRNDAT", "USERID", "TRNNUM"
]

dtypes = {
    #"OUTLIN": "int32", -- not used
    "OUTNUM": "int32",
    "STATUS": "int32",
    "ITEM": "category",
    "LOT": "string", # 3 character string, e.G: 001, 002, 006, 012 etc.
    "ORDQTY": "int32",
    "CONQTY": "float32", # somehow it thinks these values are float data
    "USERID": "category",
    "TRNNUM": "int32"
}

# Load the CSV with optimized settings, and only load necessary cols
df_wa_positionen = pd.read_csv(
    '../Data/wa_positionen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    parse_dates=["CRTDAT", "TRNDAT"],
    low_memory=False
)

# Drop rows with missing values from the DataFrame
df_wa_positionen.dropna(inplace=True)

# Drop rows where CONQTY has a fractional part (i.e., a value with anything after the decimal point)
df_wa_positionen = df_wa_positionen[df_wa_positionen["CONQTY"] % 1 == 0]

# Convert CONQTY column to integer type to reflect that it no longer has fractions
df_wa_positionen["CONQTY"] = df_wa_positionen["CONQTY"].astype(int)

## WRITE TO CSV todo

# :::

In [21]:
df_wa_positionen.head()

Unnamed: 0,OUTNUM,STATUS,ITEM,LOT,ORDQTY,CONQTY,CRTDAT,TRNDAT,USERID,TRNNUM
0,9684339,90,4299149,3,11,7,2021-03-30 18:00:39,2022-03-08 23:15:11,PASAMONTES,536001122
1,10756704,90,28872808,2,83,83,2021-05-28 14:15:29,2022-10-18 17:33:08,168388,807604606
2,10756724,90,19020166,6,40,40,2021-05-28 14:15:35,2022-10-18 17:32:58,168388,807604231
3,11244298,90,35282807,1,3,3,2021-06-25 16:44:21,2022-02-21 15:06:45,MUAMET,518249888
4,11244340,90,35448829,1,4,4,2021-06-25 16:44:27,2022-02-21 15:06:45,MUAMET,518249888


In [12]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_wa_positionen.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_wa_positionen[df_wa_positionen.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 2
           OUTNUM  STATUS      ITEM   LOT  ORDQTY  CONQTY              CRTDAT  \
5628146  21434921      90  32858196  <NA>     200   200.0 2022-11-28 14:24:50   
6574655  22390030      90  33565450  <NA>       1     1.0 2023-01-09 14:29:44   

                     TRNDAT USERID     TRNNUM  
5628146 2022-11-28 14:28:38  LXONE  862445145  
6574655 2023-01-09 14:33:39  LXONE  920984668  


In [13]:
# Filter rows where CONQTY contains an actual float value
float_rows = df_wa_positionen[df_wa_positionen["CONQTY"] % 1 != 0]

# Display the rows with float values in CONQTY
print("Rows with actual float values in the CONQTY column:")
print(float_rows)

Rows with actual float values in the CONQTY column:
           OUTNUM  STATUS      ITEM  LOT  ORDQTY  CONQTY              CRTDAT  \
1986356  17732331      90  30231025  001       1     0.5 2022-05-17 09:44:33   

                     TRNDAT   USERID     TRNNUM  
1986356 2022-06-08 18:04:05  GREPPER  641785865  


In [14]:
count_float_values = len(float_rows)
print(f"Number of rows with float values in CONQTY: {count_float_values}")

Number of rows with float values in CONQTY: 1


# Joins

In [None]:

# = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')


In [26]:
df_full = (
    pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')
    .merge(df_wa_positionen, on='OUTNUM', how='inner')
)

print(df_full.head())


      WORNUM  STATUS_x  SRC_ITEM SRC_LOT SRC_QACODE DST_LOT DST_QACODE SRC_WA  \
0  289678987        50  36529803       1          H       1          H     EG   
1  289870561        50  44118619       1          H       1          H     EG   
2  289833599        50  13091411       1          H       1          H     EG   
3  289815259        50   9636829       0          H       0          H     EG   
4  289839097        50  16359571       1          H       1          H     EG   

  SRC_X SRC_Y  ...            TRNDAT_y STATUS      ITEM  LOT  ORDQTY  CONQTY  \
0  1601    02  ... 2024-04-25 19:31:31     90  36529803  001       9       9   
1  1780    02  ... 2024-04-25 20:10:13     90  44118619  001       2       2   
2  2215    04  ... 2024-04-25 18:40:10     90  13091411  001       1       1   
3  2401    02  ... 2024-04-25 19:25:39     90   9636829  000       1       1   
4  2004    07  ... 2024-04-25 18:39:20     90  16359571  001       1       1   

               CRTDAT           

In [27]:
df_full.head()
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19628893 entries, 0 to 19628892
Data columns (total 38 columns):
 #   Column      Dtype         
---  ------      -----         
 0   WORNUM      int32         
 1   STATUS_x    int32         
 2   SRC_ITEM    category      
 3   SRC_LOT     category      
 4   SRC_QACODE  category      
 5   DST_LOT     category      
 6   DST_QACODE  category      
 7   SRC_WA      category      
 8   SRC_X       category      
 9   SRC_Y       category      
 10  SRC_Z       category      
 11  DST_WA      category      
 12  OUTNUM      int32         
 13  LISNUM      int32         
 14  SUMLIS      int32         
 15  TRNNUM_x    int32         
 16  CRTDAT_x    datetime64[ns]
 17  TRNDAT_x    datetime64[ns]
 18  USERID_x    category      
 19  DOCNUM      category      
 20  STATUS_y    int32         
 21  PICCOD      category      
 22  CUSNUM      category      
 23  ORDDAT      datetime64[ns]
 24  DLVDAT      datetime64[ns]
 25  SHPTYP      cate

In [14]:
num_rows = len(df_full)
print(f"Number of rows: {num_rows}")

Number of rows: 19628991


In [29]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_full.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_full[df_full.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [WORNUM, STATUS_x, SRC_ITEM, SRC_LOT, SRC_QACODE, DST_LOT, DST_QACODE, SRC_WA, SRC_X, SRC_Y, SRC_Z, DST_WA, OUTNUM, LISNUM, SUMLIS, TRNNUM_x, CRTDAT_x, TRNDAT_x, USERID_x, DOCNUM, STATUS_y, PICCOD, CUSNUM, ORDDAT, DLVDAT, SHPTYP, TOUR, CRTDAT_y, TRNDAT_y, STATUS, ITEM, LOT, ORDQTY, CONQTY, CRTDAT, TRNDAT, USERID_y, TRNNUM_y]
Index: []

[0 rows x 38 columns]


Export Dataframe to Parquet file

In [28]:
import pyarrow

# Save the fully joined DataFrame to a Parquet file for better performance
output_file = 'joined_data_v2.parquet'
df_full.to_parquet(output_file, index=False)

print(f"Fully joined DataFrame saved to {output_file}")

Fully joined DataFrame saved to joined_data_v2.parquet


In [None]:
# reading parquet file later

df_loaded = pd.read_parquet('joined_data.parquet')
print(df_loaded.head())


##TEMP
