### All copied from final_dataprep_v1.ipynb to testrun fully and merge the df on FK and PKs for first usecase

# Bewegungen.csv prep

In [80]:
# consolidated :::
import pandas as pd
from idna.idnadata import joining_types

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "STATUS", "SRC_ITEM", "SRC_LOT", "SRC_QACODE",
    "DST_QACODE", "SRC_WA", "SRC_X", "SRC_Y", "SRC_Z", "OUTNUM",
    "LISNUM", "SUMLIS", "TRNNUM", "TRNDAT", "USERID", #  "WORNUM", "DST_LOT", "DST_WA", "LOADDAT", "CRTDAT" not needed (just the date when its imported into DWH)
]

dtypes = {
#    "WORNUM": "int32",
    "STATUS": "int32",
#    "MOVTYP": "category",
#    "MOVKEY": "category",
    "SRC_ITEM": "category",
    "SRC_LOT": "string",
    "SRC_QACODE": "category",
    #"DST_LOT": "string",
    "DST_QACODE": "category",
    "SRC_WA": "category",
    "SRC_X": "category",
    "SRC_Y": "category",
    "SRC_Z": "category",
#    "DST_WA": "category",
    "CONQTY": "int32",
    "OUTNUM": "int32", # KEY
#    "RELNUM": "int32",
    "LISNUM": "int32",
    "SUMLIS": "int32",
    "TRNNUM": "int32",
    "USERID": "category",
}

# Load the CSV with optimized settings, and only load necessary cols
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes, # use optimized, manually set data types
    #usecols=columns_to_use,  # Only load required columns
    parse_dates=["TRNDAT"],
    low_memory=False
)

# Clean SRC_LOT column (Column "Artikelcharge", this LOT usually is a 3 digit int. An Article can have multiple LOTs. I simplify by removing leading zeros and clean up the column from wrong manual usererrors.
#df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
#df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
#df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format
# keep value if: consists of digits, and its length is less than or equal to 3 -- otherwise replace with none

# Clean DST_LOT column
#df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
#df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
#df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format
# keep value if: consists of digits, and its length is less than or equal to 3 -- otherwise replace with none

####
# Data Cleaning
####

# Temporarily convert 'SRC_LOT' to string type for cleaning
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].astype(str)  # Convert Categorical to string
# Clean and validate the SRC_LOT column
def clean_lot(value):
    value = value.lstrip('0') if value != '000' else value  # Remove leading zeros unless '000'
    if value.isdigit() and 1 <= len(value) <= 3:           # Validate length (1 to 3 digits)
        return value.zfill(3)                              # Pad with leading zeros to ensure 3 digits
    return None                                            # Remove invalid entries

df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(clean_lot)
# Convert back to Categorical if needed
df_bewegungen['SRC_LOT'] = pd.Categorical(df_bewegungen['SRC_LOT'])

# Clean STATUS column (10= Offen, 50= Bestätigt, 95= Storniert -- for my purposes I'm only interested in "Bestätigt" rows.
# Filter the DataFrame to keep only rows with status == 50 (abgeschlossen)
df_bewegungen = df_bewegungen[df_bewegungen['STATUS'] == 50]

# Filter the DataFrame in place to include only rows where 'SRC_WA' contains "WA", excluding all 'WE', 'WA', 'AU' and 'UM' areas
df_bewegungen = df_bewegungen[df_bewegungen['SRC_WA'] == 'EG']

# Remove rows where USER == 'LXONE'
df_bewegungen = df_bewegungen[df_bewegungen['USERID'] != 'LXONE']

# Only keep rows with QACODE == 'H'
df_bewegungen = df_bewegungen[df_bewegungen['SRC_QACODE'] == 'H']

# Remove all additional Crossdocking movements, easiest to pinpoint via SRC_LOT = 000
df_bewegungen = df_bewegungen[df_bewegungen['SRC_LOT'] != '000']


df_bewegungen = df_bewegungen.astype({
    "SRC_LOT": "category",
#    "DST_LOT": "category",
})

# Drop rows with missing values from the DataFrame - dropna (by default, without parameters) removes entire rows which have a NaN or null value
df_bewegungen.dropna(inplace=True)

# Drop both 'SRC_WA' and 'DST_WA' columns from the DataFrame
df_bewegungen.drop(columns=['STATUS', 'SRC_WA', 'SRC_QACODE'], inplace=True)

# Verify the changes
print(f"Updated DataFrame shape: {df_bewegungen.shape}")
# :::

Updated DataFrame shape: (17914909, 12)


In [79]:
####
#### Cleaning - unique entries in SRC_WA
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_bewegungen['SRC_LOT'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_bewegungen['SRC_LOT'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'DST_WA' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)


There are 72 unique entries in the 'DST_WA' column.

Occurrences of each unique entry:
001    15984112
000     1576942
002      749870
003      362855
004      226203
         ...   
045          57
077          50
035          38
051          13
223           1
Name: SRC_LOT, Length: 72, dtype: int64


In [75]:
####
#### Cleaning - unique entries in STATUS
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_bewegungen['SRC_QACODE'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_bewegungen['SRC_QACODE'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'SRC_QACODE' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

KeyError: 'SRC_QACODE'

In [76]:
####
#### Cleaning - unique entries in SRC_QACODE
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_bewegungen['DST_LOT'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_bewegungen['DST_LOT'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'SRC_QACODE' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

KeyError: 'DST_LOT'

In [81]:
df_bewegungen.head()

Unnamed: 0,SRC_ITEM,SRC_LOT,DST_QACODE,SRC_X,SRC_Y,SRC_Z,OUTNUM,LISNUM,SUMLIS,TRNNUM,TRNDAT,USERID
0,36529803,1,H,1601,2,D15,31313376,289679194,289679311,599324284,2024-04-25 08:16:39,24769
1,44118619,1,H,1780,2,E03,31320349,289874022,289874463,599670338,2024-04-25 19:29:47,178141
2,13091411,1,H,2215,4,F13,31324968,289835037,289835121,599596179,2024-04-25 16:42:37,LAESSIG
4,16359571,1,H,2004,7,G05,31346782,289840157,289725531,599576127,2024-04-25 16:03:15,104044
5,38789620,1,H,112,8,BA05,31344732,289808077,289726826,599529006,2024-04-25 14:44:06,170056


# ? maybe remove _ check for NaN / missing values in DF

In [5]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_bewegungen.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_bewegungen[df_bewegungen.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [WORNUM, STATUS, SRC_ITEM, SRC_LOT, SRC_QACODE, DST_LOT, DST_QACODE, SRC_WA, SRC_X, SRC_Y, SRC_Z, DST_WA, OUTNUM, LISNUM, SUMLIS, TRNNUM, CRTDAT, TRNDAT, USERID, LOADDAT]
Index: []


# WA kopf.csv prep

In [35]:
#Consolidated:::
# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "OUTNUM", "DOCNUM", "STATUS", "PICCOD", "CUSNUM",
    "SHPTYP", "TOUR", # "TRNDAT" "CRTDAT", "ORDDAT", "DLVDAT", not needed rn
]

dtypes = {
    "OUTNUM": "int32", # KEY
    "DOCNUM": "category", # Warenausgangsnummer
    # "ORDNUM": "category", # dont think i need this, seems it achieves the same as DOCNUM
    "STATUS": "int32",
    "PICCOD": "category",
    "CUSNUM": "category", # kunde
    "SHPTYP": "category", # versandart
    "TOUR": "category",
}

df_wa_kopf = pd.read_csv(
    '../Data/wa_kopf.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    # parse_dates=["TRNDAT"], # "ORDDAT", "DLVDAT", "CRTDAT",  not needed rn
    low_memory=False
)
# Filter the DataFrame to keep only rows with status == 90 (abgeschlossene)
df_wa_kopf = df_wa_kopf[df_wa_kopf['STATUS'] == 90]

# Drop rows with missing values from the DataFrame - some early data and regression tests lead to PICCOD, SHPTYP and TOUR being empty (~86 rows)
df_wa_kopf.dropna(inplace=True) # inplace=True modifies DataFrame directly without having to create a new one

# Step 1: Merge main DataFrame with customers
# df_orders = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')

#:::

In [20]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_wa_kopf.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_wa_kopf[df_wa_kopf.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [OUTNUM, DOCNUM, STATUS, PICCOD, CUSNUM, ORDDAT, DLVDAT, SHPTYP, TOUR, CRTDAT, TRNDAT]
Index: []


In [83]:
####
#### Cleaning - unique entries in SRC_WA
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_wa_kopf['OUTNUM'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_wa_kopf['OUTNUM'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'OUTNUM' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

There are 19314904 unique entries in the 'OUTNUM' column.

Occurrences of each unique entry:
30723507    1
31429611    1
31418553    1
31422515    1
31418440    1
           ..
22136170    1
22136169    1
22136168    1
22136167    1
35224902    1
Name: OUTNUM, Length: 19314904, dtype: int64


In [85]:
####
#### Cleaning - unique entries in SRC_WA
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_wa_kopf['DOCNUM'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_wa_kopf['DOCNUM'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'ORDNUM' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

There are 5965848 unique entries in the 'ORDNUM' column.

Occurrences of each unique entry:
28377289    5002
28109999    3890
28055543    3847
28168002    3829
28062966    3718
            ... 
28592674       0
28592676       0
34653261       0
34653258       0
34656371       0
Name: DOCNUM, Length: 5970945, dtype: int64


In [82]:
df_wa_kopf.head()

Unnamed: 0,OUTNUM,DOCNUM,STATUS,PICCOD,CUSNUM,SHPTYP,TOUR
0,30723507,33054472,90,BS17,100986,2,85
1,30711706,33052260,90,BS17,94536,2,85
2,30730649,33056533,90,BS17,90715,2,83
3,30712103,33052611,90,BS15-S,165269,124,0
4,30733401,33057203,90,BS17,2542,2,94


    # WA Positionen prep

In [37]:
### Consolidated

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "OUTNUM", "STATUS", "ITEM", "LOT",
    "ORDQTY", "CONQTY", "USERID", "TRNNUM" #  "TRNDAT", "CRTDAT", not needed rn
]

dtypes = {
    #"OUTLIN": "int32", -- not used, useless column
    "OUTNUM": "int32",
    "STATUS": "int32",
    "ITEM": "category",
    "LOT": "string", # 3 character string, e.G: 001, 002, 006, 012 etc.
    "ORDQTY": "int32",
    "CONQTY": "float32", # somehow it thinks these values are float data
    "USERID": "category",
    "TRNNUM": "int32"
}

# Load the CSV with optimized settings, and only load necessary cols
df_wa_positionen = pd.read_csv(
    '../Data/wa_positionen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    # parse_dates=["TRNDAT"], # "CRTDAT", not needed rn
    low_memory=False
)

# Drop rows with missing values from the DataFrame
df_wa_positionen.dropna(inplace=True)

# Drop rows where CONQTY has a fractional part (i.e., a value with anything after the decimal point)
df_wa_positionen = df_wa_positionen[df_wa_positionen["CONQTY"] % 1 == 0]

# Convert CONQTY column to integer type to reflect that it no longer has fractions
df_wa_positionen["CONQTY"] = df_wa_positionen["CONQTY"].astype(int)

## WRITE TO CSV todo

# :::

In [42]:
df_wa_positionen.head()

Unnamed: 0,OUTNUM,STATUS,ITEM,LOT,ORDQTY,CONQTY,USERID,TRNNUM
0,9684339,90,4299149,3,11,7,PASAMONTES,536001122
1,10756704,90,28872808,2,83,83,168388,807604606
2,10756724,90,19020166,6,40,40,168388,807604231
3,11244298,90,35282807,1,3,3,MUAMET,518249888
4,11244340,90,35448829,1,4,4,MUAMET,518249888


In [90]:
####
#### Cleaning - unique entries in SRC_QACODE
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_wa_positionen['LOT'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_wa_positionen['LOT'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'SRC_QACODE' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

There are 80 unique entries in the 'SRC_QACODE' column.

Occurrences of each unique entry:
001      15432416
000       2097906
002        717644
003        346643
004        215649
           ...   
0016            5
1001            2
223             1
.001            1
de001           1
Name: LOT, Length: 80, dtype: Int64


In [39]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_wa_positionen.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_wa_positionen[df_wa_positionen.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [OUTNUM, STATUS, ITEM, LOT, ORDQTY, CONQTY, USERID, TRNNUM]
Index: []


In [43]:
# Filter rows where CONQTY contains an actual float value
float_rows = df_wa_positionen[df_wa_positionen["CONQTY"] % 1 != 0]

# Display the rows with float values in CONQTY
print("Rows with actual float values in the CONQTY column:")
print(float_rows)

Rows with actual float values in the CONQTY column:
Empty DataFrame
Columns: [OUTNUM, STATUS, ITEM, LOT, ORDQTY, CONQTY, USERID, TRNNUM]
Index: []


In [44]:
count_float_values = len(float_rows)
print(f"Number of rows with float values in CONQTY: {count_float_values}")

Number of rows with float values in CONQTY: 0


# Joins

In [None]:

# = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')


In [92]:
df_full = (
    pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')
    .merge(df_wa_positionen, on='OUTNUM', how='inner')
)

print(df_full.head())


   SRC_ITEM SRC_LOT DST_QACODE SRC_X SRC_Y SRC_Z    OUTNUM     LISNUM  \
0  36529803     001          H  1601    02   D15  31313376  289679194   
1  44118619     001          H  1780    02   E03  31320349  289874022   
2  13091411     001          H  2215    04   F13  31324968  289835037   
3  16359571     001          H  2004    07   G05  31346782  289840157   
4  38789620     001          H  0112    08  BA05  31344732  289808077   

      SUMLIS   TRNNUM_x  ...  CUSNUM SHPTYP TOUR  STATUS_y      ITEM  LOT  \
0  289679311  599324284  ...   83055     02   93        90  36529803  001   
1  289874463  599670338  ...  164073     02   95        90  44118619  001   
2  289835121  599596179  ...   34858     07   52        90  13091411  001   
3  289725531  599576127  ...   30804     02   96        90  16359571  001   
4  289726826  599529006  ...   13105     07   59        90  38789620  001   

  ORDQTY CONQTY  USERID_y   TRNNUM_y  
0      9      9     LXONE  599670642  
1      2      2     

In [87]:
df_full.head()
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19628916 entries, 0 to 19628915
Data columns (total 31 columns):
 #   Column      Dtype         
---  ------      -----         
 0   WORNUM      int32         
 1   STATUS_x    int32         
 2   SRC_ITEM    category      
 3   SRC_LOT     category      
 4   SRC_QACODE  category      
 5   DST_LOT     category      
 6   DST_QACODE  category      
 7   SRC_WA      category      
 8   SRC_X       category      
 9   SRC_Y       category      
 10  SRC_Z       category      
 11  DST_WA      category      
 12  OUTNUM      int32         
 13  LISNUM      int32         
 14  SUMLIS      int32         
 15  TRNNUM_x    int32         
 16  TRNDAT      datetime64[ns]
 17  USERID_x    category      
 18  DOCNUM      category      
 19  STATUS_y    int32         
 20  PICCOD      category      
 21  CUSNUM      category      
 22  SHPTYP      category      
 23  TOUR        category      
 24  STATUS      int32         
 25  ITEM        cate

In [93]:
num_rows = len(df_full)
print(f"Number of rows: {num_rows}")

Number of rows: 17505630


In [29]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_full.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_full[df_full.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [WORNUM, STATUS_x, SRC_ITEM, SRC_LOT, SRC_QACODE, DST_LOT, DST_QACODE, SRC_WA, SRC_X, SRC_Y, SRC_Z, DST_WA, OUTNUM, LISNUM, SUMLIS, TRNNUM_x, CRTDAT_x, TRNDAT_x, USERID_x, DOCNUM, STATUS_y, PICCOD, CUSNUM, ORDDAT, DLVDAT, SHPTYP, TOUR, CRTDAT_y, TRNDAT_y, STATUS, ITEM, LOT, ORDQTY, CONQTY, CRTDAT, TRNDAT, USERID_y, TRNNUM_y]
Index: []

[0 rows x 38 columns]


Export Dataframe to Parquet file

In [28]:
import pyarrow

# Save the fully joined DataFrame to a Parquet file for better performance
output_file = 'joined_data_v2.parquet'
df_full.to_parquet(output_file, index=False)

print(f"Fully joined DataFrame saved to {output_file}")

Fully joined DataFrame saved to joined_data_v2.parquet


In [None]:
# reading parquet file later

df_loaded = pd.read_parquet('joined_data.parquet')
print(df_loaded.head())


##TEMP


In [89]:
df_full.head()

Unnamed: 0,WORNUM,STATUS_x,SRC_ITEM,SRC_LOT,SRC_QACODE,DST_LOT,DST_QACODE,SRC_WA,SRC_X,SRC_Y,...,CUSNUM,SHPTYP,TOUR,STATUS,ITEM,LOT,ORDQTY,CONQTY,USERID_y,TRNNUM_y
0,289678987,50,36529803,1,H,1,H,EG,1601,2,...,83055,2,93,90,36529803,1,9,9,LXONE,599670642
1,289870561,50,44118619,1,H,1,H,EG,1780,2,...,164073,2,95,90,44118619,1,2,2,LXONE,599678660
2,289833599,50,13091411,1,H,1,H,EG,2215,4,...,34858,7,52,90,13091411,1,1,1,LXONE,599659813
3,289815259,50,9636829,0,H,0,H,EG,2401,2,...,94180,7,52,90,9636829,0,1,1,LXONE,599669523
4,289839097,50,16359571,1,H,1,H,EG,2004,7,...,30804,2,96,90,16359571,1,1,1,LXONE,599659634


## Machine Learning


In [46]:
# Encoding
columns_and_types = df_full.dtypes.reset_index()
columns_and_types.columns = ['Column', 'Data Type']

# Display the table
print(columns_and_types)

        Column       Data Type
0       WORNUM           int32
1     STATUS_x           int32
2     SRC_ITEM        category
3      SRC_LOT        category
4   SRC_QACODE        category
5      DST_LOT        category
6   DST_QACODE        category
7       SRC_WA        category
8        SRC_X        category
9        SRC_Y        category
10       SRC_Z        category
11      DST_WA        category
12      OUTNUM           int32
13      LISNUM           int32
14      SUMLIS           int32
15    TRNNUM_x           int32
16      TRNDAT  datetime64[ns]
17    USERID_x        category
18      DOCNUM        category
19    STATUS_y           int32
20      PICCOD        category
21      CUSNUM        category
22      SHPTYP        category
23        TOUR        category
24      STATUS           int32
25        ITEM        category
26         LOT          string
27      ORDQTY           int32
28      CONQTY           int32
29    USERID_y        category
30    TRNNUM_y           int32
