take a small sample of data to get quick overview over the data and its structure

In [1]:
import pandas as pd

df_sample = pd.read_csv('../Data/bewegungen.csv', nrows=100)
print(df_sample.head())
print(df_sample.info())


      MOVNUM     WORNUM  STATUS MOVTYP MOVKEY  SRC_ITEM  SRC_LOT SRC_QACODE  \
0  289678988  289678987      50    REL    PIC  36529803        1          H   
1  289870562  289870561      50    REL    PIC  44118619        1          H   
2  289833600  289833599      50    REL    PIC  13091411        1          H   
3  289815260  289815259      50    REL    PIC   9636829        0          H   
4  289839098  289839097      50    REL    PIC  16359571        1          H   

   DST_LOT DST_QACODE  ...    RELNUM     LISNUM     SUMLIS     TRNNUM  \
0        1          H  ...  41484834  289679194  289679311  599324284   
1        1          H  ...  41494178  289874022  289874463  599670338   
2        1          H  ...  41492163  289835037  289835121  599596179   
3        0          H  ...  41491410  289815703  289815805  599541258   
4        1          H  ...  41492589  289840157  289725531  599576127   

                CRTDAT               TRNDAT    USERID   VOLREQ   VOLPIC  \
0  2024-04-

Due to error while loading csv into parquet - data has to be cleaned, following im checking which columns have mixed types

In [14]:
# Code to check which columns have mixed types due to error while loading csv
import pandas as pd

# Load a small chunk of the file or the whole file
df = pd.read_csv('../Data/bewegungen.csv')

# Specify the columns expected to be numeric
numeric_columns = ["SRC_X", "SRC_Y", "DST_X", "DST_Y", "REQQTY", "CONQTY"]

# Iterate through each numeric column and find non-numeric values
for column in numeric_columns:
    non_numeric = df[~df[column].apply(lambda x: str(x).replace('.', '', 1).isdigit())]
    if not non_numeric.empty:
        print(f"Non-numeric values found in column '{column}':")
        print(non_numeric[[column]].head())  # Show examples of non-numeric rows


  exec(code_obj, self.user_global_ns, self.user_ns)


MemoryError: Unable to allocate 2.25 GiB for an array with shape (15, 20094145) and data type object

### **Manually set dtypes, because the pandas inferred ones were not correct or unoptimized for performance.**

In [None]:
# Cleanup bewegungen.csv -> adjust dtype and load to parquet for more efficient access
import pandas as pd
import pyarrow

# Adjusted dtype mapping
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    dtype={
        "MOVNUM": "int64",
        "WORNUM": "int64",
        "STATUS": "category",
        "MOVTYP": "category",
        "MOVKEY": "category",
        "SRC_ITEM": "category", #category ideal - unique 900k articles, repeated articles IDs, column might be used for grouping, filtering and comparisons later on
        "SRC_LOT": "category",
        "SRC_QACODE": "category",
        "DST_LOT": "category",
        "DST_QACODE": "category",
        "SRC_WA": "category",
        "DST_WA": "category",
        "SRC_X": "object",
        "SRC_Y": "object",
        "SRC_Z": "category",
        "DST_X": "object", #always the same destination for X,Y and Z -> Warenausgang
        "DST_Y": "object",
        "DST_Z": "category",
        "REQQTY": "int64",
        "CONQTY": "int64",
        "INCNUM": "int64",
        "LISNUM": "int64",
        "USERID": "category",
    },
    parse_dates=["CRTDAT", "TRNDAT"],
    chunksize=100000  # Process large files in chunks
)

# Process chunks
for chunk in df_bewegungen:
    print(chunk.info())

# Save to Parquet
df.to_parquet('../Data/processed_bewegungen.parquet', index=False)


In [None]:
df.head()

Its beginning to show that I have to clean the table and take a closer look at all columns.
First step: Collect all unique values in SRC_LOT column

In [3]:
import pandas as pd

# Read the file in chunks
chunk_iter = pd.read_csv(
    '../Data/bewegungen.csv',
    dtype={
        "SRC_LOT": "category",  # Ensures memory efficiency for repetitive values
    },
    usecols=["SRC_LOT"],  # Only load the SRC_LOT column
    chunksize=100000      # Process in chunks
)

# Collect unique values from all chunks
unique_values = set()

for chunk in chunk_iter:
    unique_values.update(chunk['SRC_LOT'].unique())

# Convert the set to a sorted list and print the values
unique_values = sorted(unique_values)
print("Unique values in SRC_LOT column:")
print(unique_values)


Unique values in SRC_LOT column:
['.001', '000', '001', '0016', '002', '003', '004', '005', '006', '007', '008', '009', '01', '010', '011', '012', '013', '014', '015', '016', '017', '018', '019', '02', '020', '021', '022', '023', '024', '025', '026', '027', '028', '029', '030', '031', '032', '033', '034', '035', '036', '037', '038', '039', '040', '041', '042', '043', '044', '045', '046', '047', '048', '049', '050', '051', '052', '053', '054', '055', '056', '057', '058', '059', '060', '061', '062', '073', '074', '075', '077', '088', '089', '090', '091', '1', '1001', '223', 'de001', 'o001']


In [12]:
unique_values_count = df['SRC_LOT'].nunique()
print(f"Number of unique values in SRC_LOT: {unique_values_count}")
unique_values = df['SRC_LOT'].unique()
print(f"Unique values in SRC_LOT: {unique_values}")


Number of unique values in SRC_LOT: 71
Unique values in SRC_LOT: ['1' '11' '4' '3' '2' '7' '6' '10' '5' '19' '33' '9' '16' '20' '17' '90'
 '24' '32' '13' '8' '12' '23' '14' '21' '31' '43' '15' '55' '18' '53' '26'
 '89' '30' '28' '49' '73' '25' '39' '54' '52' '44' '60' '38' '34' '22'
 '37' '29' '36' '88' '59' '27' '42' '61' '47' '40' '75' '58' '41' '77'
 '57' '56' '45' '50' '48' '74' '46' '223' '91' '62' '35' '51']


Above output shows the issue that there is a mix of data types. Before I decide what to do with the outliers, I need to find out how these values are spread out.

In [11]:
# Define a function to determine the type of each entry
def get_dtype(value):
    return type(value).__name__

# Apply the function to the SRC_LOT column and count occurrences of each type
type_counts = df['SRC_LOT'].apply(get_dtype).value_counts()

# Print the results
print("Data types in SRC_LOT column:")
print(type_counts)


Data types in SRC_LOT column:
str    17931231
Name: SRC_LOT, dtype: int64


I can see most Data types are int and str, I wonder what the floats are - so I output them to a .csv file for me to see

In [10]:
import os

# Define the folder path
output_folder = r"H:\Projects\PyCharmProjects\HS2024_MachineLearning\Data\temp"
output_file = os.path.join(output_folder, "float_values_in_src_lot_v2.csv")

# Function to identify floats in the SRC_LOT column
def is_float(value):
    return isinstance(value, float)

# Filter rows where SRC_LOT contains floats
float_values = df[df['SRC_LOT'].apply(is_float)]

# Select only the columns you need (SRC_LOT and SRC_ITEM)
float_values_filtered = float_values[['SRC_LOT', 'SRC_ITEM', 'CRTDAT']]

# Save the results to the specified folder
float_values_filtered.to_csv(output_file, index=False)

print(f"CSV file with float values in SRC_LOT saved at: {output_file}")


CSV file with float values in SRC_LOT saved at: H:\Projects\PyCharmProjects\HS2024_MachineLearning\Data\temp\float_values_in_src_lot_v2.csv


The Data all has the same CRTDate from way back in 2022 traced back to the same day. I will remove these float values as i cannot validate their correctness.
-At the same time, I'm going to convert all values to strings for consitency before cleaning those up aswell
-The SRC_LOT 000/0 is relevant and should should also stay in the system

In [13]:
# Remove all float values from the column
df = df[df['SRC_LOT'].apply(lambda x: not isinstance(x, float))]

# Convert all entries in the column to strings
df['SRC_LOT'] = df['SRC_LOT'].astype(str)

# Strip leading zeros but ensure "000" becomes "0"
df['SRC_LOT'] = df['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))

# Validate format (numeric only and max 3 characters)
df['SRC_LOT'] = df['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)

# Filter out rows where SRC_LOT is None (invalid entries)
df = df[df['SRC_LOT'].notna()]

# Reset index after filtering
df.reset_index(drop=True, inplace=True)

# Print cleaned column
unique_values_count = df['SRC_LOT'].nunique()
print(f"Number of unique values in cleaned SRC_LOT: {unique_values_count}")
unique_values = df['SRC_LOT'].unique()
print(f"Cleaned unique values in SRC_LOT: {unique_values}")

Number of unique values in cleaned SRC_LOT: 71
Cleaned unique values in SRC_LOT: ['1' '11' '4' '3' '2' '7' '6' '10' '5' '19' '33' '9' '16' '20' '17' '90'
 '24' '32' '13' '8' '12' '23' '14' '21' '31' '43' '15' '55' '18' '53' '26'
 '89' '30' '28' '49' '73' '25' '39' '54' '52' '44' '60' '38' '34' '22'
 '37' '29' '36' '88' '59' '27' '42' '61' '47' '40' '75' '58' '41' '77'
 '57' '56' '45' '50' '48' '74' '46' '223' '91' '62' '35' '51']


In [None]:
#check if null values are present
round((df.isnull().sum()/df.shape[0])*100,2)

In [None]:
#count unique entries in each row of the table
unique_counts = df.nunique()
print(unique_counts)

### Complete Mapping of all Tables of Data that I will be needing (for now).

In [None]:
import pandas as pd

# Read Bewegungen 1.1 (WMDT) - only PIC movements, otherwise the file would be too big (50gb+)
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    dtype={
        "MOVNUM": "int64",
        "WORNUM": "int64",
        "STATUS": "category",
        "MOVTYP": "category",
        "MOVKEY": "category",
        "SRC_ITEM": "category", #category ideal - unique 900k articles, repeated articles IDs, column might be used for grouping, filtering and comparisons later on
        "SRC_LOT": "category",
        "SRC_QACODE": "category",
        "DST_LOT": "category",
        "DST_QACODE": "category",
        "SRC_WA": "category",
        "DST_WA": "category",
        "SRC_X": "object",
        "SRC_Y": "object",
        "SRC_Z": "category",
        "DST_X": "object", #always the same destination for X,Y and Z -> Warenausgang
        "DST_Y": "object",
        "DST_Z": "category",
        "REQQTY": "int64",
        "CONQTY": "int64",
        "INCNUM": "int64",
        "LISNUM": "int64",
        "USERID": "category",
    },
    parse_dates=["CRTDAT", "TRNDAT"]
)

# Read Warenausgang Kopf 7a
df_wa_kopf = pd.read_csv(
    '../Data/wa_kopf.csv',
    dtype={
        "OUTNUM": "int64",
        "DOCNUM": "int64",
        "ORDNUM": "int64",
        "STATUS": "category",
        "CUSNUM": "int64",
        "SHPTYP": "category",
        "TOUR": "int64",
    },
    parse_dates=["ORDDAT", "DLVDAT", "CRTDAT", "TRNDAT"]
)

# Read Warenausgang Positionen 7b
df_wa_positionen = pd.read_csv(
    '../Data/wa_positionen.csv',
    dtype={
        "OUTNUM": "int64",
        "OUTLIN": "int64",
        "STATUS": "category",
        "ITEM": "int64",
        "LOT": "category",
        "QACODE": "category",
        "ORDQTY": "int64",
        "RELQTY": "int64",
        "FNDQTY": "int64",
        "CONQTY": "int64",
        "SHPQTY": "int64",
        "USERID": "category",
    },
    parse_dates=["CRTDAT", "TRNDAT"]
)

# Read Listen 8
df_listen = pd.read_csv(
    '../Data/listen.csv',
    dtype={
        "LISNUM": "int64",
        "SUMLIS": "int64",
        "STATUS": "category",
        "PRIO": "int64",
        "PZ": "category",
        "RELNUM": "int64",
        "CUSNUM": "int64",
        "DSPADR": "category",
    },
    parse_dates=["CRTDAT", "TRNDAT"]
)

# Read Artikelbestand 5
df_artikelbestand = pd.read_csv(
    '../Data/artikelbestand.csv',
    dtype={
        "OBJNUM": "int64",
        "STATUS": "category",
        "LOCNUM": "int64",
        "PICLCK": "bool",
        "STTLCK": "bool",
        "ITEM": "int64",
        "LOT": "category",
        "INQTY": "int64",
        "OUTQTY": "int64",
        "AVLQTY": "int64",
        "CONQTY": "int64",
    },
    parse_dates=["CRTDAT", "TRNDAT"]
)

# Read Lagerplatz 3
df_lagerplatz = pd.read_csv(
    '../Data/lagerplatz.csv',
    dtype={
        "OBJNUM": "int64",
        "WH": "category",
        "WA": "category",
        "X": "int64",
        "Y": "int64",
        "Z": "object",
        "STATUS": "category",
        "PUTPRI": "int64",
        "PICPRI": "int64",
    },
    parse_dates=["CRTDAT", "TRNDAT"]
)


In [None]:
# Display info for Bewegungen 1.1 (WMDT)
print("Bewegungen 1.1 Info:")
print(df_bewegungen.info())

# Display info for Warenausgang Kopf 7a
print("\nWarenausgang Kopf Info:")
print(df_wa_kopf.info())

# Display info for Warenausgang Positionen 7b
print("\nWarenausgang Positionen Info:")
print(df_wa_positionen.info())

# Display info for Listen 8
print("\nListen Info:")
print(df_listen.info())

# Display info for Artikelbestand 5
print("\nArtikelbestand Info:")
print(df_artikelbestand.info())

# Display info for Lagerplatz 3
print("\nLagerplatz Info:")
print(df_lagerplatz.info())
