### All copied from final_dataprep_v1.ipynb to testrun fully and merge the df on FK and PKs for first usecase

# Bewegungen.csv prep

In [1]:
# consolidated :::
import pandas as pd
from idna.idnadata import joining_types

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "STATUS", "SRC_ITEM", "SRC_LOT", "SRC_QACODE",
    "DST_QACODE", "SRC_WA", "SRC_X", "SRC_Y", "SRC_Z", "CONQTY", "OUTNUM",
    "LISNUM", "SUMLIS", "TRNDAT", "USERID", # , "TRNNUM", "WORNUM", "DST_LOT", "DST_WA", "LOADDAT", "CRTDAT" not needed (just the date when its imported into DWH)
]

dtypes = {
#    "WORNUM": "int32",
    "STATUS": "int32",
#    "MOVTYP": "category",
#    "MOVKEY": "category",
    "SRC_ITEM": "category",
    "SRC_LOT": "string",
    "SRC_QACODE": "category",
    #"DST_LOT": "string",
    "DST_QACODE": "category",
    "SRC_WA": "category",
    "SRC_X": "category",
    "SRC_Y": "category",
    "SRC_Z": "category",
#    "DST_WA": "category",
    "CONQTY": "int32",
    "OUTNUM": "int32", # KEY
#    "RELNUM": "int32",
    "LISNUM": "int32",
    "SUMLIS": "int32",
#    "TRNNUM": "int32",
    "USERID": "category",
}

# Load the CSV with optimized settings, and only load necessary cols
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes, # use optimized, manually set data types
    #usecols=columns_to_use,  # Only load required columns
    parse_dates=["TRNDAT"],
    low_memory=False
)

# Clean SRC_LOT column (Column "Artikelcharge", this LOT usually is a 3 digit int. An Article can have multiple LOTs. I simplify by removing leading zeros and clean up the column from wrong manual usererrors.
#df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
#df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
#df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format
# keep value if: consists of digits, and its length is less than or equal to 3 -- otherwise replace with none

# Clean DST_LOT column
#df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
#df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
#df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format
# keep value if: consists of digits, and its length is less than or equal to 3 -- otherwise replace with none

####
# Data Cleaning
####

# Temporarily convert 'SRC_LOT' to string type for cleaning
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].astype(str)  # Convert Categorical to string

# Clean and validate the SRC_LOT column (Column "Artikelcharge", this LOT usually is a 3 digit int. An Article can have multiple LOTs. I simplify by removing leading zeros and clean up the column from wrong manual usererrors.)
def clean_lot(value):
    value = value.lstrip('0') if value != '000' else value  # Remove leading zeros unless '000'
    if value.isdigit() and 1 <= len(value) <= 3:           # Validate length (1 to 3 digits)
        return value.zfill(3)                              # Pad with leading zeros to ensure 3 digits
    return None                                            # Remove invalid entries

df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(clean_lot)
# Convert back to Categorical if needed
df_bewegungen['SRC_LOT'] = pd.Categorical(df_bewegungen['SRC_LOT'])

# Clean STATUS column (10= Offen, 50= Bestätigt, 95= Storniert -- for my purposes I'm only interested in "Bestätigt" rows.
# Filter the DataFrame to keep only rows with status == 50 (abgeschlossen)
df_bewegungen = df_bewegungen[df_bewegungen['STATUS'] == 50]

# Filter the DataFrame in place to include only rows where 'SRC_WA' contains "WA", excluding all 'WE', 'WA', 'AU' and 'UM' areas
df_bewegungen = df_bewegungen[df_bewegungen['SRC_WA'] == 'EG']

# Remove rows where USER == 'LXONE'
df_bewegungen = df_bewegungen[df_bewegungen['USERID'] != 'LXONE']

# Only keep rows with QACODE == 'H'
df_bewegungen = df_bewegungen[df_bewegungen['SRC_QACODE'] == 'H']

# Remove all additional Crossdocking movements, easiest to pinpoint via SRC_LOT = 000
df_bewegungen = df_bewegungen[df_bewegungen['SRC_LOT'] != '000']


# Combine SRC_X, SRC_Y, and SRC_Z into a new column STOR_LOC
df_bewegungen['STOR_LOC'] = df_bewegungen['SRC_X'].astype(str) + \
                            df_bewegungen['SRC_Y'].astype(str) + \
                            df_bewegungen['SRC_Z'].astype(str)

# Change datatypes as needed
df_bewegungen = df_bewegungen.astype({
    "SRC_LOT": "category",
#    "DST_LOT": "category",
    "STOR_LOC": "category",
})

# Drop the original SRC_X, SRC_Y, and SRC_Z columns
df_bewegungen.drop(['SRC_X', 'SRC_Y', 'SRC_Z'], axis=1, inplace=True)

# Drop rows with missing values from the DataFrame - dropna (by default, without parameters) removes entire rows which have a NaN or null value
df_bewegungen.dropna(inplace=True)

# Drop both 'SRC_WA' and 'DST_WA' columns from the DataFrame now that we filtered the data
df_bewegungen.drop(columns=['STATUS', 'SRC_WA', 'SRC_QACODE', 'DST_QACODE'], inplace=True)

# Remove unused categories (i.e. Artifacts) across all categorical columns (after all cleaning steps there were pandas retained all defined categories, even though they no longer existed in the data. Not sure if this would've led to problems down the road during embedding/labeling, so I removed them to avoid eventual issues)
for col in df_bewegungen.select_dtypes(include='category').columns:
    df_bewegungen[col] = df_bewegungen[col].cat.remove_unused_categories()

# Verify the changes
print(f"Updated DataFrame shape: {df_bewegungen.shape}")
# :::

KeyboardInterrupt: 

In [24]:
df_bewegungen.head()

Unnamed: 0,SRC_ITEM,SRC_LOT,DST_QACODE,OUTNUM,LISNUM,SUMLIS,TRNNUM,TRNDAT,USERID,STOR_LOC
0,36529803,1,H,31313376,289679194,289679311,599324284,2024-04-25 08:16:39,24769,160102D15
1,44118619,1,H,31320349,289874022,289874463,599670338,2024-04-25 19:29:47,178141,178002E03
2,13091411,1,H,31324968,289835037,289835121,599596179,2024-04-25 16:42:37,LAESSIG,221504F13
4,16359571,1,H,31346782,289840157,289725531,599576127,2024-04-25 16:03:15,104044,200407G05
5,38789620,1,H,31344732,289808077,289726826,599529006,2024-04-25 14:44:06,170056,011208BA05


In [2]:

# Find all categorical columns with unused categories
for col in df_bewegungen.select_dtypes(include='category').columns:
    unused_categories = set(df_bewegungen[col].cat.categories) - set(df_bewegungen[col].dropna().unique())
    if unused_categories:
        print(f"Column '{col}' has unused categories: {unused_categories}")
    else:
        print(f"Column '{col}' has no unused categories.")

Column 'SRC_ITEM' has no unused categories.
Column 'SRC_LOT' has no unused categories.
Column 'DST_QACODE' has no unused categories.
Column 'SRC_X' has no unused categories.
Column 'SRC_Y' has no unused categories.
Column 'SRC_Z' has no unused categories.
Column 'USERID' has no unused categories.


In [11]:
####
#### Cleaning - unique entries in SRC_WA
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_bewegungen['SRC_LOT'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_bewegungen['SRC_LOT'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'DST_WA' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)


There are 71 unique entries in the 'DST_WA' column.

Occurrences of each unique entry:
001    15984112
002      749870
003      362855
004      226203
005      138548
         ...   
045          57
077          50
035          38
051          13
223           1
Name: SRC_LOT, Length: 71, dtype: int64


In [3]:
print("000" in df_bewegungen['SRC_LOT'].cat.categories)

True


In [75]:
####
#### Cleaning - unique entries in STATUS
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_bewegungen['SRC_QACODE'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_bewegungen['SRC_QACODE'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'SRC_QACODE' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

KeyError: 'SRC_QACODE'

In [76]:
####
#### Cleaning - unique entries in SRC_QACODE
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_bewegungen['DST_LOT'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_bewegungen['DST_LOT'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'SRC_QACODE' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

KeyError: 'DST_LOT'

In [18]:
df_bewegungen.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17914909 entries, 0 to 20094144
Data columns (total 12 columns):
 #   Column      Dtype         
---  ------      -----         
 0   SRC_ITEM    category      
 1   SRC_LOT     category      
 2   DST_QACODE  category      
 3   SRC_X       category      
 4   SRC_Y       category      
 5   SRC_Z       category      
 6   OUTNUM      int32         
 7   LISNUM      int32         
 8   SUMLIS      int32         
 9   TRNNUM      int32         
 10  TRNDAT      datetime64[ns]
 11  USERID      category      
dtypes: category(7), datetime64[ns](1), int32(4)
memory usage: 772.7 MB


In [3]:
df_bewegungen.head()

Unnamed: 0,SRC_ITEM,SRC_LOT,DST_QACODE,SRC_X,SRC_Y,SRC_Z,OUTNUM,LISNUM,SUMLIS,TRNNUM,TRNDAT,USERID
0,36529803,1,H,1601,2,D15,31313376,289679194,289679311,599324284,2024-04-25 08:16:39,24769
1,44118619,1,H,1780,2,E03,31320349,289874022,289874463,599670338,2024-04-25 19:29:47,178141
2,13091411,1,H,2215,4,F13,31324968,289835037,289835121,599596179,2024-04-25 16:42:37,LAESSIG
4,16359571,1,H,2004,7,G05,31346782,289840157,289725531,599576127,2024-04-25 16:03:15,104044
5,38789620,1,H,112,8,BA05,31344732,289808077,289726826,599529006,2024-04-25 14:44:06,170056


In [4]:
# Check if the OUTNUM column in df_bewegungen has unique values
is_unique = not df_bewegungen['OUTNUM'].duplicated().any()

# Print result
if is_unique:
    print("The OUTNUM column has only unique values.")
else:
    print("The OUTNUM column contains duplicate values.")

The OUTNUM column contains duplicate values.


In [5]:
# Count duplicate values in the OUTNUM column
num_duplicates = df_bewegungen['OUTNUM'].duplicated().sum()

# Print the result
print(f"Number of duplicate values in the OUTNUM column: {num_duplicates}")


Number of duplicate values in the OUTNUM column: 315324


In [6]:

# Filter rows with duplicate values in the OUTNUM column
duplicate_rows = df_bewegungen[df_bewegungen['OUTNUM'].duplicated(keep=False)]

# Retrieve the first 10 rows with duplicates
top_10_duplicates = duplicate_rows.head(10)

# Print the first 10 duplicate rows
print("First 10 rows with duplicate values in the OUTNUM column:")
print(top_10_duplicates)

First 10 rows with duplicate values in the OUTNUM column:
    SRC_ITEM SRC_LOT DST_QACODE SRC_X SRC_Y SRC_Z    OUTNUM     LISNUM  \
26  44492423     001          H  0215    01   B15  31343207  289789855   
34  38577015     001          H  0915    04   C06  31269284  289698865   
44  38577015     001          H  1418    10   C10  31269284  289698921   
49  38577090     001          H  1321    13   B17  31269302  289698898   
50  44123844     001          H  1502    06   D01  31269381  289698927   
69  38577029     001          H  1920    13   D02  31269287  289698945   
77  42421817     001          H  0910    02   C05  31269340  289698863   
83  38577099     001          H  0922    11   B13  31269303  289698869   
94  37726813     001          H  1111    11   C07  31269249  289698884   
96  42421430     001          H  1008    15   C03  31269323  289698876   

       SUMLIS     TRNNUM              TRNDAT  USERID  
26  289789898  599583371 2024-04-25 16:24:13  152420  
34  289698987  59

# ? maybe remove _ check for NaN / missing values in DF

In [14]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_bewegungen.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_bewegungen[df_bewegungen.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [SRC_ITEM, SRC_LOT, DST_QACODE, SRC_X, SRC_Y, SRC_Z, OUTNUM, LISNUM, SUMLIS, TRNNUM, TRNDAT, USERID]
Index: []


# WA kopf.csv prep

In [9]:
#Consolidated:::
# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "OUTNUM", "DOCNUM", "STATUS", "PICCOD", "CUSNUM",
    "SHPTYP", "TOUR", # "TRNDAT" "CRTDAT", "ORDDAT", "DLVDAT", not needed rn
]

dtypes = {
    "OUTNUM": "int32", # KEY
    "DOCNUM": "category", # Warenausgangsnummer
    # "ORDNUM": "category", # dont think i need this, seems it achieves the same as DOCNUM
    "STATUS": "int32",
    "PICCOD": "category",
    "CUSNUM": "category", # kunde
    "SHPTYP": "category", # versandart
    "TOUR": "category",
}

df_wa_kopf = pd.read_csv(
    '../Data/wa_kopf.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    # parse_dates=["TRNDAT"], # "ORDDAT", "DLVDAT", "CRTDAT",  not needed rn
    low_memory=False
)
# Filter the DataFrame to keep only rows with status == 90 (abgeschlossene)
df_wa_kopf = df_wa_kopf[df_wa_kopf['STATUS'] == 90]

# Drop rows with missing values from the DataFrame - some early data and regression tests lead to PICCOD, SHPTYP and TOUR being empty (~86 rows)
df_wa_kopf.dropna(inplace=True) # inplace=True modifies DataFrame directly without having to create a new one

# Drop both 'SRC_WA' and 'DST_WA' columns from the DataFrame now that we filtered the data
df_wa_kopf.drop(columns=['STATUS'], inplace=True)

# Step 1: Merge main DataFrame with customers
# df_orders = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')

#:::

In [32]:
# Extract unique values from both columns
unique_col_table1 = df_bewegungen['OUTNUM'].unique()
unique_col_table2 = df_wa_kopf['OUTNUM'].unique()

# Compare unique sets
print(f"Number of unique values in table1: {len(unique_col_table1)}")
print(f"Number of unique values in table2: {len(unique_col_table2)}")

Number of unique values in table1: 17599585
Number of unique values in table2: 19314904


In [33]:
# Count duplicate values in the OUTNUM column
num_duplicates = df_wa_kopf['OUTNUM'].duplicated().sum()

# Print the result
print(f"Number of duplicate values in the OUTNUM column: {num_duplicates}")
# Filter rows with duplicate values in the OUTNUM column
duplicate_rows = df_wa_kopf[df_wa_kopf['OUTNUM'].duplicated(keep=False)]

# Retrieve the first 10 rows with duplicates
top_10_duplicates = duplicate_rows.head(10)

# Print the first 10 duplicate rows
print("First 10 rows with duplicate values in the OUTNUM column:")
print(top_10_duplicates)

Number of duplicate values in the OUTNUM column: 0
First 10 rows with duplicate values in the OUTNUM column:
Empty DataFrame
Columns: [OUTNUM, DOCNUM, STATUS, PICCOD, CUSNUM, SHPTYP, TOUR]
Index: []


In [15]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_wa_kopf.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_wa_kopf[df_wa_kopf.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [OUTNUM, DOCNUM, STATUS, PICCOD, CUSNUM, SHPTYP, TOUR]
Index: []


In [83]:
####
#### Cleaning - unique entries in SRC_WA
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_wa_kopf['OUTNUM'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_wa_kopf['OUTNUM'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'OUTNUM' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

There are 19314904 unique entries in the 'OUTNUM' column.

Occurrences of each unique entry:
30723507    1
31429611    1
31418553    1
31422515    1
31418440    1
           ..
22136170    1
22136169    1
22136168    1
22136167    1
35224902    1
Name: OUTNUM, Length: 19314904, dtype: int64


In [85]:
####
#### Cleaning - unique entries in SRC_WA
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_wa_kopf['DOCNUM'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_wa_kopf['DOCNUM'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'ORDNUM' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

There are 5965848 unique entries in the 'ORDNUM' column.

Occurrences of each unique entry:
28377289    5002
28109999    3890
28055543    3847
28168002    3829
28062966    3718
            ... 
28592674       0
28592676       0
34653261       0
34653258       0
34656371       0
Name: DOCNUM, Length: 5970945, dtype: int64


In [28]:
df_wa_kopf.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19314904 entries, 0 to 19377912
Data columns (total 7 columns):
 #   Column  Dtype   
---  ------  -----   
 0   OUTNUM  int32   
 1   DOCNUM  category
 2   STATUS  int32   
 3   PICCOD  category
 4   CUSNUM  category
 5   SHPTYP  category
 6   TOUR    category
dtypes: category(5), int32(2)
memory usage: 654.8 MB


In [26]:
df_wa_kopf.head()

Unnamed: 0,OUTNUM,DOCNUM,STATUS,PICCOD,CUSNUM,SHPTYP,TOUR
0,30723507,33054472,90,BS17,100986,2,85
1,30711706,33052260,90,BS17,94536,2,85
2,30730649,33056533,90,BS17,90715,2,83
3,30712103,33052611,90,BS15-S,165269,124,0
4,30733401,33057203,90,BS17,2542,2,94


    # WA Positionen prep

In [11]:
### Consolidated

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "OUTNUM", "STATUS", "ITEM", # "LOT",
    "ORDQTY", "CONQTY", "USERID", "TRNNUM" #  "TRNDAT", "CRTDAT", not needed rn
]

dtypes = {
    #"OUTLIN": "int32", -- not used, useless column
    "OUTNUM": "int32",
    "STATUS": "int32",
    "ITEM": "category",
#    "LOT": "string", # 3 character string, e.G: 001, 002, 006, 012 etc.
    "ORDQTY": "int32",
#    "RELQTY": "int32" # freigegebene Menge
#    "FNDQTY": "int32" # reservierte Menge
#    "SHPQTY": "int32" # versendete Menge
    "CONQTY": "float32", # setting this to float for initial load, going to clean the dataframe and change it to int later
    "USERID": "category",
    "TRNNUM": "int32"
}

# Load the CSV with optimized settings, and only load necessary cols
df_wa_positionen = pd.read_csv(
    '../Data/wa_positionen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    # parse_dates=["TRNDAT","CRTDAT"] # , not needed rn
    low_memory=False
)

# Drop rows with missing values from the DataFrame
df_wa_positionen.dropna(inplace=True)

# Drop rows where CONQTY has a fractional part (i.e., a value with anything after the decimal point)
df_wa_positionen = df_wa_positionen[df_wa_positionen["CONQTY"] % 1 == 0]

# Convert CONQTY column to integer type to reflect that it no longer has fractions
df_wa_positionen["CONQTY"] = df_wa_positionen["CONQTY"].astype(int)

## WRITE TO CSV todo

# :::

In [12]:
# Count duplicate values in the OUTNUM column
num_duplicates = df_wa_positionen['OUTNUM'].duplicated().sum()

# Print the result
print(f"Number of duplicate values in the OUTNUM column: {num_duplicates}")
# Filter rows with duplicate values in the OUTNUM column
duplicate_rows = df_wa_positionen[df_wa_positionen['OUTNUM'].duplicated(keep=False)]

# Retrieve the first 10 rows with duplicates
top_10_duplicates = duplicate_rows.head(10)

# Print the first 10 duplicate rows
print("First 10 rows with duplicate values in the OUTNUM column:")
print(top_10_duplicates)

Number of duplicate values in the OUTNUM column: 1333
First 10 rows with duplicate values in the OUTNUM column:
          OUTNUM  STATUS      ITEM  ORDQTY  CONQTY  USERID     TRNNUM
51897   15783851      90   4574463       7       7   LXONE  497154217
51898   15783851      90   4574463       3       3   LXONE  497154217
62476   15794462      90  32858191      55      55  RIEDER  498057928
62477   15794462      90  32858191      54      54  RIEDER  498057928
67517   15799594      90  35170676      31      31  RIEDER  502119803
67518   15799594      90  35170676     269     269  RIEDER  502296482
74785   15806895      90  37085455       3       3   LXONE  498338562
74786   15806895      90  37085455       2       2   LXONE  498338562
127804  15860126      90   5173272       1       1  170719  503609904
127805  15860126      90   5173272       1       1  170719  503609904


In [13]:
# Filter rows with duplicate values in the OUTNUM column
duplicate_rows = df_wa_positionen[df_wa_positionen['OUTNUM'].duplicated(keep=False)]

# Count the total number of rows that have duplicates
num_duplicate_rows = len(duplicate_rows)

# Print the result
print(f"Total number of rows with duplicate values in the OUTNUM column: {num_duplicate_rows}")

Total number of rows with duplicate values in the OUTNUM column: 2656


In [27]:
df_wa_positionen.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19375221 entries, 0 to 19375223
Data columns (total 8 columns):
 #   Column  Dtype   
---  ------  -----   
 0   OUTNUM  int32   
 1   STATUS  int32   
 2   ITEM    category
 3   LOT     string  
 4   ORDQTY  int32   
 5   CONQTY  int32   
 6   USERID  category
 7   TRNNUM  int32   
dtypes: category(2), int32(5), string(1)
memory usage: 815.2 MB


In [27]:
df_wa_positionen.head()

Unnamed: 0,OUTNUM,STATUS,ITEM,ORDQTY,CONQTY,USERID,TRNNUM
0,9684339,90,4299149,11,7,PASAMONTES,536001122
1,10756704,90,28872808,83,83,168388,807604606
2,10756724,90,19020166,40,40,168388,807604231
3,11244298,90,35282807,3,3,MUAMET,518249888
4,11244340,90,35448829,4,4,MUAMET,518249888


In [17]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_wa_positionen.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_wa_positionen[df_wa_positionen.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [OUTNUM, STATUS, ITEM, LOT, ORDQTY, CONQTY, USERID, TRNNUM]
Index: []


In [18]:
####
#### Cleaning - unique entries in SRC_QACODE
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_wa_positionen['LOT'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_wa_positionen['LOT'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'SRC_QACODE' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

There are 80 unique entries in the 'SRC_QACODE' column.

Occurrences of each unique entry:
001      15432416
000       2097906
002        717644
003        346643
004        215649
           ...   
0016            5
1001            2
223             1
.001            1
de001           1
Name: LOT, Length: 80, dtype: Int64


In [39]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_wa_positionen.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_wa_positionen[df_wa_positionen.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [OUTNUM, STATUS, ITEM, LOT, ORDQTY, CONQTY, USERID, TRNNUM]
Index: []


In [43]:
# Filter rows where CONQTY contains an actual float value
float_rows = df_wa_positionen[df_wa_positionen["CONQTY"] % 1 != 0]

# Display the rows with float values in CONQTY
print("Rows with actual float values in the CONQTY column:")
print(float_rows)

Rows with actual float values in the CONQTY column:
Empty DataFrame
Columns: [OUTNUM, STATUS, ITEM, LOT, ORDQTY, CONQTY, USERID, TRNNUM]
Index: []


In [44]:
count_float_values = len(float_rows)
print(f"Number of rows with float values in CONQTY: {count_float_values}")

Number of rows with float values in CONQTY: 0


# Joins

In [31]:
# Group and count OUTNUM in the original tables
bewegungen_counts = df_bewegungen.groupby('OUTNUM').size()
kopf_counts = df_wa_kopf.groupby('OUTNUM').size()
positionen_counts = df_wa_positionen.groupby('OUTNUM').size()

# Combine the counts into a comparison DataFrame
summary_df = pd.DataFrame({
    'Bewegungen': bewegungen_counts,
    'Kopf': kopf_counts,
    'Positionen': positionen_counts
}).fillna(0)  # Fill missing values with 0

# Add a total column
summary_df['Total Matches'] = summary_df.sum(axis=1)

# Filter keys with one-to-many relationships
one_to_many_keys = summary_df[(summary_df['Kopf'] > 1) | (summary_df['Positionen'] > 1)]
print(one_to_many_keys)

          Bewegungen  Kopf  Positionen  Total Matches
OUTNUM                                               
15783851         3.0   1.0         2.0            6.0
15794462         6.0   1.0         2.0            9.0
15799594        11.0   1.0         2.0           14.0
15806895         3.0   1.0         2.0            6.0
15860126         2.0   1.0         2.0            5.0
...              ...   ...         ...            ...
35093408         2.0   1.0         2.0            5.0
35098455         2.0   1.0         2.0            5.0
35110153         2.0   1.0         2.0            5.0
35169793         2.0   1.0         2.0            5.0
35215210         3.0   1.0         2.0            6.0

[1323 rows x 4 columns]


In [32]:
# Group by OUTNUM in the final DataFrame and count rows
final_counts = df_full.groupby('OUTNUM').size()

# Filter for keys with more than one match in df_full
one_to_many_final = final_counts[final_counts > 1]

# Display the results
print(f"Keys with one-to-many relationships after the join:")
print(one_to_many_final)

Keys with one-to-many relationships after the join:
OUTNUM
15427705    2
15427706    2
15427707    2
15427708    2
15427709    2
           ..
35244232    2
35244237    5
35244246    2
35245563    2
35246048    2
Length: 205664, dtype: int64


In [None]:

# = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')


In [14]:
df_full = (
    pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')
    .merge(df_wa_positionen, on='OUTNUM', how='inner')
)

print(df_full.head())


   SRC_ITEM SRC_LOT DST_QACODE SRC_X SRC_Y SRC_Z    OUTNUM     LISNUM  \
0  36529803     001          H  1601    02   D15  31313376  289679194   
1  44118619     001          H  1780    02   E03  31320349  289874022   
2  13091411     001          H  2215    04   F13  31324968  289835037   
3  16359571     001          H  2004    07   G05  31346782  289840157   
4  38789620     001          H  0112    08  BA05  31344732  289808077   

      SUMLIS   TRNNUM_x  ...      PICCOD  CUSNUM SHPTYP  TOUR STATUS_y  \
0  289679311  599324284  ...     BS17-Do   83055     02    93       90   
1  289874463  599670338  ...        BS17  164073     02    95       90   
2  289835121  599596179  ...  BS15-Di/Do   34858     07    52       90   
3  289725531  599576127  ...        BS17   30804     02    96       90   
4  289726826  599529006  ...        BS17   13105     07    59       90   

       ITEM ORDQTY CONQTY  USERID_y   TRNNUM_y  
0  36529803      9      9     LXONE  599670642  
1  44118619      2

In [30]:
# Count occurrences of OUTNUM in the joined DataFrame
outnum_counts = df_full['OUTNUM'].value_counts()

# Display the distribution
print(outnum_counts)

# Check for entries with more than one match
one_to_many = outnum_counts[outnum_counts > 1]
print(f"Number of one-to-many relationships: {len(one_to_many)}")
print(one_to_many)

17514071    50
26680720    46
22799765    40
34917870    40
19772684    39
            ..
21956928     1
21940998     1
21941002     1
21944954     1
35092650     1
Name: OUTNUM, Length: 17194390, dtype: int64
Number of one-to-many relationships: 205664
17514071    50
26680720    46
22799765    40
34917870    40
19772684    39
            ..
24868779     2
32000063     2
28362691     2
17381903     2
24604344     2
Name: OUTNUM, Length: 205664, dtype: int64


In [21]:
# Find duplicates in the entire DataFrame
duplicates = df_full[df_full.duplicated()]

# Print duplicates (if any exist)
print(duplicates)

# Count the number of duplicate rows
print(f"Number of duplicate rows: {len(duplicates)}")

          SRC_ITEM SRC_LOT DST_QACODE SRC_X SRC_Y SRC_Z    OUTNUM     LISNUM  \
83895     25406197     003          H  0924    08   A02  31339265  289880687   
83897     25406197     004          H  0919    05   F02  31339265  289880687   
89052     35536664     002          H  1516    04   F04  30314877  280195471   
89054     35536664     001          H  1511    03   E02  30314877  280195468   
89056     35536664     002          H  1780    03   D02  30314877  280195481   
...            ...     ...        ...   ...   ...   ...       ...        ...   
17341329  38369989     002          H  1204    02   E04  35110153  327008835   
17341331  38369989     003          H  0703    07   G04  35110153  327008385   
17367746  45670606     004          H  0208    03   B02  34786997  323805694   
17493352  30885830     005          H  1007    08   C01  35098455  326878694   
17493354  30885830     004          H  0608    07   D01  35098455  326878589   

             SUMLIS   TRNNUM_x  ...    

In [26]:
# Find duplicates based on specific columns, e.g., OUTNUM
duplicates = df_full[df_full.duplicated(subset=['OUTNUM'])]

# Print the duplicate rows
print(duplicates)

# Count duplicate rows based on OUTNUM
print(f"Number of duplicate rows based on OUTNUM: {len(duplicates)}")

          SRC_ITEM SRC_LOT DST_QACODE SRC_X SRC_Y SRC_Z    OUTNUM     LISNUM  \
24        44492423     001          H  0212    08   D03  31343207  289724989   
25        44492423     001          H  1780    05   C01  31343207  289789886   
32        38577015     001          H  1418    10   C10  31269284  289698921   
33        38577015     001          H  1207    10   H03  31269284  289698887   
34        38577015     001          H  0603    13   B15  31269284  289698844   
...            ...     ...        ...   ...   ...   ...       ...        ...   
17498788   1019832     006          H  1916    08   D01  35079291  326705790   
17499754  33957262     001          H  2002    02   B04  35091016  326789913   
17500949  46576644     001          H  2605    98   A01  35087506  326754964   
17502099  36538297     001          H  1207    02   C03  35096533  326847082   
17504440  45496856     001          H  2245    04   B01  35094269  326825209   

             SUMLIS   TRNNUM_x  ...  CU

In [24]:
# Check if all rows are unique
is_unique = not df_full.duplicated().any()

# Print result
if is_unique:
    print("All rows in df_full are unique.")
else:
    print("There are duplicate rows in df_full.")

There are duplicate rows in df_full.


In [25]:
# Count the number of duplicate rows in the DataFrame
num_duplicates = df_full.duplicated().sum()

# Print the result
print(f"Number of duplicate rows in df_full: {num_duplicates}")

Number of duplicate rows in df_full: 1325


In [17]:
df_full.head()
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17505630 entries, 0 to 17505629
Data columns (total 24 columns):
 #   Column      Dtype         
---  ------      -----         
 0   SRC_ITEM    category      
 1   SRC_LOT     category      
 2   DST_QACODE  category      
 3   SRC_X       category      
 4   SRC_Y       category      
 5   SRC_Z       category      
 6   OUTNUM      int32         
 7   LISNUM      int32         
 8   SUMLIS      int32         
 9   TRNNUM_x    int32         
 10  TRNDAT      datetime64[ns]
 11  USERID_x    category      
 12  DOCNUM      category      
 13  STATUS_x    int32         
 14  PICCOD      category      
 15  CUSNUM      category      
 16  SHPTYP      category      
 17  TOUR        category      
 18  STATUS_y    int32         
 19  ITEM        category      
 20  ORDQTY      int32         
 21  CONQTY      int32         
 22  USERID_y    category      
 23  TRNNUM_y    int32         
dtypes: category(14), datetime64[ns](1), int32(9)
mem

In [16]:
####
#### Cleaning - unique entries in SRC_QACODE
####
# Count the number of unique entries in the 'SRC_WA' column
unique_entries_count = df_full['LOT'].nunique()

# Count the occurrences of each unique entry in the 'SRC_WA' column
unique_entries_occurrences = df_full['LOT'].value_counts()

# Print the results
print(f"There are {unique_entries_count} unique entries in the 'SRC_QACODE' column.")
print("\nOccurrences of each unique entry:")
print(unique_entries_occurrences)

KeyError: 'LOT'

In [34]:
num_rows = len(df_full)
print(f"Number of rows: {num_rows}")

Number of rows: 17505630


In [29]:
# check for null values
# Total count of missing values in the DataFrame
total_missing = df_full.isnull().sum().sum()
print(f"Total missing values in the DataFrame: {total_missing}")

# Filter DataFrame to show only rows with at least one missing value
rows_with_missing_values = df_full[df_full.isnull().any(axis=1)]

# Display the rows with missing values
print(rows_with_missing_values)

Total missing values in the DataFrame: 0
Empty DataFrame
Columns: [WORNUM, STATUS_x, SRC_ITEM, SRC_LOT, SRC_QACODE, DST_LOT, DST_QACODE, SRC_WA, SRC_X, SRC_Y, SRC_Z, DST_WA, OUTNUM, LISNUM, SUMLIS, TRNNUM_x, CRTDAT_x, TRNDAT_x, USERID_x, DOCNUM, STATUS_y, PICCOD, CUSNUM, ORDDAT, DLVDAT, SHPTYP, TOUR, CRTDAT_y, TRNDAT_y, STATUS, ITEM, LOT, ORDQTY, CONQTY, CRTDAT, TRNDAT, USERID_y, TRNNUM_y]
Index: []

[0 rows x 38 columns]


Export Dataframe to Parquet file

In [20]:
import pyarrow

# Save the fully joined DataFrame to a Parquet file for better performance
output_file = 'joined_data_v3.parquet'
df_full.to_parquet(output_file, index=False)

print(f"Fully joined DataFrame saved to {output_file}")

Fully joined DataFrame saved to joined_data_v3.parquet


In [None]:
# reading parquet file later

df_loaded = pd.read_parquet('joined_data.parquet')
print(df_loaded.head())


##TEMP


In [89]:
df_full.head()

Unnamed: 0,WORNUM,STATUS_x,SRC_ITEM,SRC_LOT,SRC_QACODE,DST_LOT,DST_QACODE,SRC_WA,SRC_X,SRC_Y,...,CUSNUM,SHPTYP,TOUR,STATUS,ITEM,LOT,ORDQTY,CONQTY,USERID_y,TRNNUM_y
0,289678987,50,36529803,1,H,1,H,EG,1601,2,...,83055,2,93,90,36529803,1,9,9,LXONE,599670642
1,289870561,50,44118619,1,H,1,H,EG,1780,2,...,164073,2,95,90,44118619,1,2,2,LXONE,599678660
2,289833599,50,13091411,1,H,1,H,EG,2215,4,...,34858,7,52,90,13091411,1,1,1,LXONE,599659813
3,289815259,50,9636829,0,H,0,H,EG,2401,2,...,94180,7,52,90,9636829,0,1,1,LXONE,599669523
4,289839097,50,16359571,1,H,1,H,EG,2004,7,...,30804,2,96,90,16359571,1,1,1,LXONE,599659634


In [15]:
# Export df_full to a CSV file
output_file = 'df_full_output.csv'  # Specify the file name
df_full.to_csv(output_file, index=False)

# Print confirmation
print(f"DataFrame df_full has been successfully exported to {output_file}")

DataFrame df_full has been successfully exported to df_full_output.csv


## Machine Learning


In [19]:
# Encoding
columns_and_types = df_full.dtypes.reset_index()
columns_and_types.columns = ['Column', 'Data Type']

# Display the table
print(columns_and_types)

        Column       Data Type
0     SRC_ITEM        category
1      SRC_LOT        category
2   DST_QACODE        category
3        SRC_X        category
4        SRC_Y        category
5        SRC_Z        category
6       OUTNUM           int32
7       LISNUM           int32
8       SUMLIS           int32
9     TRNNUM_x           int32
10      TRNDAT  datetime64[ns]
11    USERID_x        category
12      DOCNUM        category
13    STATUS_x           int32
14      PICCOD        category
15      CUSNUM        category
16      SHPTYP        category
17        TOUR        category
18    STATUS_y           int32
19        ITEM        category
20      ORDQTY           int32
21      CONQTY           int32
22    USERID_y        category
23    TRNNUM_y           int32
