In [None]:
# consolidated :::
import pandas as pd
# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "WORNUM", "STATUS", "SRC_ITEM", "SRC_LOT", "SRC_QACODE", "DST_LOT",
    "DST_QACODE", "SRC_WA", "SRC_X", "SRC_Y", "SRC_Z", "DST_WA",
    "LISNUM", "SUMLIS", "TRNNUM", "CRTDAT", "TRNDAT", "USERID", "LOADDAT"
]

dtypes = {
    "WORNUM": "int32",
    "STATUS": "int32",
    "MOVTYP": "category",
    "MOVKEY": "category",
    "SRC_ITEM": "category",
    "SRC_LOT": "string",
    "SRC_QACODE": "category",
    "DST_LOT": "string",
    "DST_QACODE": "category",
    "SRC_WA": "category",
    "SRC_X": "category",
    "SRC_Y": "category",
    "SRC_Z": "category",
    "DST_WA": "category",
    "CONQTY": "int32",
    "OUTNUM": "int32",
    "RELNUM": "int32",
    "LISNUM": "int32",
    "SUMLIS": "int32",
    "TRNNUM": "int32",
    "USERID": "category",
}

# Load the CSV with optimized settings, and only load necessary cols
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    parse_dates=["CRTDAT", "TRNDAT", "LOADDAT"],
    low_memory=False
)

# Clean SRC_LOT column (Column "Artikelcharge", this LOT usually is a 3 digit int. An Article can have multiple LOTs. I simplify by removing leading zeros and clean up the column from wrong manual usererrors.
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean DST_LOT column
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean STATUS column (10= Offen, 50= Bestätigt, 95= Storniert -- for my purposes I'm only interested in "Bestätigt" rows.
# Filter the DataFrame to keep only rows with status == 50 (abgeschlossen)
df = df_bewegungen[df_bewegungen['STATUS'] == 50]

# Remove rows where USER == 'LXONE'
df = df[df['USERID'] != 'LXONE']

df_bewegungen = df_bewegungen.astype({
    "SRC_LOT": "category",
    "DST_LOT": "category",
})
# :::

In [1]:
import pandas as pd
# Define columns to load and optimize data types for memory efficiency
#columns_to_use = ['SRC_Z']  # Adjust as needed
#dtypes = {'SRC_Z': 'string'}  # Specify data types for memory efficiency

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "WORNUM", "STATUS", "SRC_ITEM", "SRC_LOT", "SRC_QACODE", "DST_LOT",
    "DST_QACODE", "SRC_WA", "SRC_X", "SRC_Y", "SRC_Z", "DST_WA",
    "LISNUM", "SUMLIS", "TRNNUM", "CRTDAT", "TRNDAT", "USERID", "LOADDAT"
]

dtypes = {
    "WORNUM": "int32",
    "STATUS": "int32",
    "MOVTYP": "category",
    "MOVKEY": "category",
    "SRC_ITEM": "category",
    "SRC_LOT": "string",
    "SRC_QACODE": "category",
    "DST_LOT": "string",
    "DST_QACODE": "category",
    "SRC_WA": "category",
    "SRC_X": "category",
    "SRC_Y": "category",
    "SRC_Z": "category",
    "DST_WA": "category",
    "CONQTY": "int32",
    "OUTNUM": "int32",
    "RELNUM": "int32",
    "LISNUM": "int32",
    "SUMLIS": "int32",
    "TRNNUM": "int32",
    "USERID": "category",
}

# Load the CSV with optimized settings, and only load necessary cols
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    parse_dates=["CRTDAT", "TRNDAT", "LOADDAT"],
    low_memory=False
)

print(df_bewegungen.info(memory_usage="deep"))
# Clean SRC_LOT column
#df['SRC_LOT'] = df['SRC_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
#df['SRC_LOT'] = df['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
#df['SRC_LOT'] = df['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean DST_LOT column
#df['DST_LOT'] = df['DST_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
#df['DST_LOT'] = df['DST_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
#df['DST_LOT'] = df['DST_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean 

# Columns to drop
#columns_to_drop = ["DST_X", "DST_Y", "DST_Z", "REQQTY"] #RequiredQuant not used as per now

# Drop the columns from the DataFrame
#df = df.drop(columns=columns_to_drop)

# Confirm the columns are dropped
#for chunk in df:
    #print(chunk.info())

# set dtype after all corrections 
# Assuming `df` is the cleaned DataFrame


# Drop rows with invalid SRC_LOT values
#df = df[df['SRC_LOT'].notna()]

# Save the cleaned DataFrame
#df.to_csv('../Data/cleaned_bewegungen.csv', index=False)

#print(f"Cleaned data saved to '../Data/cleaned_bewegungen.csv'")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20094145 entries, 0 to 20094144
Data columns (total 19 columns):
 #   Column      Dtype         
---  ------      -----         
 0   WORNUM      int32         
 1   STATUS      int32         
 2   SRC_ITEM    category      
 3   SRC_LOT     string        
 4   SRC_QACODE  category      
 5   DST_LOT     string        
 6   DST_QACODE  category      
 7   SRC_WA      category      
 8   SRC_X       category      
 9   SRC_Y       category      
 10  SRC_Z       category      
 11  DST_WA      category      
 12  LISNUM      int32         
 13  SUMLIS      int32         
 14  TRNNUM      int32         
 15  CRTDAT      datetime64[ns]
 16  TRNDAT      datetime64[ns]
 17  USERID      category      
 18  LOADDAT     datetime64[ns]
dtypes: category(9), datetime64[ns](3), int32(5), string(2)
memory usage: 3.4 GB
None


In [2]:
# Get the number of rows in the DataFrame
num_rows = len(df_bewegungen)
print(f"Number of rows: {num_rows}")


Number of rows: 20094145


# clean columns dst_lot and src_lot

In [5]:
# Clean SRC_LOT column (Column "Artikelcharge", this LOT usually is a 3 digit int. An Article can have multiple LOTs. I simplify by removing leading zeros and clean up the column from wrong manual usererrors.
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean DST_LOT column
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean STATUS column (10= Offen, 50= Bestätigt, 95= Storniert -- for my purposes I'm only interested in "Bestätigt" rows.
# Filter the DataFrame to keep only rows with status == 50 (abgeschlossen)
df = df_bewegungen[df_bewegungen['STATUS'] == 50]

# Remove rows where USER == 'LXONE'
df = df[df['USERID'] != 'LXONE']


In [6]:
num_rows = len(df)
print(f"Number of rows: {num_rows}")

Number of rows: 19677442


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19677442 entries, 0 to 20094144
Data columns (total 19 columns):
 #   Column      Dtype         
---  ------      -----         
 0   WORNUM      int32         
 1   STATUS      int32         
 2   SRC_ITEM    category      
 3   SRC_LOT     object        
 4   SRC_QACODE  category      
 5   DST_LOT     object        
 6   DST_QACODE  category      
 7   SRC_WA      category      
 8   SRC_X       category      
 9   SRC_Y       category      
 10  SRC_Z       category      
 11  DST_WA      category      
 12  LISNUM      int32         
 13  SUMLIS      int32         
 14  TRNNUM      int32         
 15  CRTDAT      datetime64[ns]
 16  TRNDAT      datetime64[ns]
 17  USERID      category      
 18  LOADDAT     datetime64[ns]
dtypes: category(9), datetime64[ns](3), int32(5), object(2)
memory usage: 1.6+ GB


In [34]:
# Get the number of rows in the DataFrame
num_rows = len(df_bewegungen)
print(f"Number of rows: {num_rows}")


Number of rows: 19677442


# [ignore] drop not needed columns // #already did this part by only importing specified colums

In [5]:
# Columns to drop
columns_to_drop = ["MOVNUM", "DST_X", "DST_Y", "DST_Z", "REQQTY", "VOLREQ", "VOLPIC", "MOVTYP", "MOVKEY", "INCNUM", "INCLIN"] #RequiredQuant not used as per now

# Drop the columns from the DataFrame
df_bewegungen = df_bewegungen.drop(columns=columns_to_drop)

# Confirm the columns are dropped
print(df_bewegungen.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20094145 entries, 0 to 20094144
Data columns (total 27 columns):
 #   Column      Dtype         
---  ------      -----         
 0   MOVNUM      int64         
 1   WORNUM      int64         
 2   STATUS      int64         
 3   MOVTYP      object        
 4   MOVKEY      object        
 5   SRC_ITEM    int64         
 6   SRC_LOT     object        
 7   SRC_QACODE  object        
 8   DST_LOT     object        
 9   DST_QACODE  object        
 10  SRC_WA      object        
 11  SRC_X       object        
 12  SRC_Y       object        
 13  SRC_Z       object        
 14  DST_WA      object        
 15  CONQTY      int64         
 16  INCNUM      int64         
 17  INCLIN      int64         
 18  OUTNUM      int64         
 19  RELNUM      int64         
 20  LISNUM      int64         
 21  SUMLIS      int64         
 22  TRNNUM      int64         
 23  CRTDAT      datetime64[ns]
 24  TRNDAT      datetime64[ns]
 25  USERID      obje

# then add final correct astype after all cleaning operations

In [8]:
#following list is not complete

df_bewegungen = df_bewegungen.astype({
    "SRC_LOT": "category",
    "DST_LOT": "category",
})

In [9]:
# final control if all types are set
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19677442 entries, 0 to 20094144
Data columns (total 19 columns):
 #   Column      Dtype         
---  ------      -----         
 0   WORNUM      int32         
 1   STATUS      int32         
 2   SRC_ITEM    category      
 3   SRC_LOT     category      
 4   SRC_QACODE  category      
 5   DST_LOT     category      
 6   DST_QACODE  category      
 7   SRC_WA      category      
 8   SRC_X       category      
 9   SRC_Y       category      
 10  SRC_Z       category      
 11  DST_WA      category      
 12  LISNUM      int32         
 13  SUMLIS      int32         
 14  TRNNUM      int32         
 15  CRTDAT      datetime64[ns]
 16  TRNDAT      datetime64[ns]
 17  USERID      category      
 18  LOADDAT     datetime64[ns]
dtypes: category(11), datetime64[ns](3), int32(5)
memory usage: 1.3 GB


view df


# Data Analysis: check unique values and distributions (maybe move somewhere else)
found out that UserID LXONE had the highest distribition - this is a SystemUser and therefor should be excluded

In [10]:
# VIEW UNIQUE VALUES for specific column
unique_values_count = df_bewegungen['SRC_X'].nunique()
print(f"Number of unique values in cleaned DST_LOT: {unique_values_count}")
unique_values = df_bewegungen['SRC_X'].unique()
print(f"Cleaned unique values in DST_LOT: {unique_values}")

Number of unique values in cleaned DST_LOT: 701
Cleaned unique values in DST_LOT: ['1601', '1780', '2215', '2401', '2004', ..., '2246', '2237', '2240', '2238', '2239']
Length: 701
Categories (707, object): ['0001', '0100', '0101', '0102', ..., '7012', '7040', 'R', 'S']


Values 0001 (Aussenlager) and Q R S are allowed

In [11]:
import pandas as pd

# Assuming `df` is your DataFrame
for column in df_bewegungen.columns:
    print(f"Distribution of unique values in column: {column}")
    print(df_bewegungen[column].value_counts())  # Count occurrences of each unique value
    print("-" * 50)  # Separator for readability



Distribution of unique values in column: WORNUM
289678987    1
251742346    1
251644736    1
251650856    1
251633841    1
            ..
190249200    1
190249198    1
190249196    1
190249194    1
326807027    1
Name: WORNUM, Length: 19677442, dtype: int64
--------------------------------------------------
Distribution of unique values in column: STATUS
50    19677442
Name: STATUS, dtype: int64
--------------------------------------------------
Distribution of unique values in column: SRC_ITEM
12119574    22583
45683121     8688
42875222     8212
37671777     8149
18089785     7629
            ...  
15373685        0
32897063        0
34836104        0
34835958        0
30167214        0
Name: SRC_ITEM, Length: 914671, dtype: int64
--------------------------------------------------
Distribution of unique values in column: SRC_LOT
1      15984175
2        749870
3        362855
4        226203
5        138548
         ...   
45           57
77           50
35           38
51           

Make sure no rows are duped

In [12]:
df_bewegungen.duplicated()

0           False
1           False
2           False
3           False
4           False
            ...  
20094140    False
20094141    False
20094142    False
20094143    False
20094144    False
Length: 19677442, dtype: bool

In [None]:
check date distribution?

## 2. File integration: WA KOPF

In [None]:
#Consolidated:::
df_wa_kopf = pd.read_csv(
    '../Data/wa_kopf.csv',
    dtype={
        "OUTNUM": "int32",
        "DOCNUM": "category",
        "ORDNUM": "category",
        "STATUS": "int32",
        "CUSNUM": "category",
        "SHPTYP": "category",
        "TOUR": "category",
    },
    low_memory=False,
    parse_dates=["ORDDAT", "DLVDAT", "CRTDAT", "TRNDAT"]
)
# Filter the DataFrame to keep only rows with status == 90 (abgeschlossene)
df_wa_kopf = df_wa_kopf[df_wa_kopf['STATUS'] == 90]

# Step 1: Merge main DataFrame with customers
# df_orders = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')

#:::

In [22]:
df_wa_kopf = pd.read_csv(
    '../Data/wa_kopf.csv',
    dtype={
        "OUTNUM": "int32",
        "DOCNUM": "category",
        "ORDNUM": "category",
        "STATUS": "int32",
        "CUSNUM": "category",
        "SHPTYP": "category",
        "TOUR": "category",
    },
    low_memory=False,
    parse_dates=["ORDDAT", "DLVDAT", "CRTDAT", "TRNDAT"]
)
print(df_wa_kopf.info(memory_usage="deep"))





<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19377913 entries, 0 to 19377912
Data columns (total 15 columns):
 #   Column   Dtype         
---  ------   -----         
 0   OUTNUM   int32         
 1   DOCNUM   category      
 2   ORDNUM   category      
 3   STATUS   int32         
 4   PICCOD   object        
 5   CUSNUM   category      
 6   ORDDAT   datetime64[ns]
 7   DLVDAT   datetime64[ns]
 8   SHPTYP   category      
 9   TOUR     category      
 10  CRTDAT   datetime64[ns]
 11  TRNDAT   datetime64[ns]
 12  USERID   object        
 13  TRNNUM   int64         
 14  LOADDAT  object        
dtypes: category(5), datetime64[ns](4), int32(2), int64(1), object(3)
memory usage: 7.1 GB
None


check distribution of values to assess if dtype are correctly set

In [19]:
df_wa_kopf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19377913 entries, 0 to 19377912
Data columns (total 15 columns):
 #   Column   Dtype         
---  ------   -----         
 0   OUTNUM   int32         
 1   DOCNUM   category      
 2   ORDNUM   category      
 3   STATUS   category      
 4   PICCOD   object        
 5   CUSNUM   category      
 6   ORDDAT   datetime64[ns]
 7   DLVDAT   datetime64[ns]
 8   SHPTYP   category      
 9   TOUR     category      
 10  CRTDAT   datetime64[ns]
 11  TRNDAT   datetime64[ns]
 12  USERID   object        
 13  TRNNUM   int64         
 14  LOADDAT  object        
dtypes: category(6), datetime64[ns](4), int32(1), int64(1), object(3)
memory usage: 2.3+ GB


In [26]:
num_rows = len(df_wa_kopf)
print(f"Number of rows: {num_rows}")

Number of rows: 19314965


In [15]:
df_wa_kopf.head()

Unnamed: 0,OUTNUM,DOCNUM,ORDNUM,STATUS,PICCOD,CUSNUM,ORDDAT,DLVDAT,SHPTYP,TOUR,CRTDAT,TRNDAT,USERID,TRNNUM,LOADDAT
0,30723507,33054472,llo229691223,90,BS17,100986,2024-03-20 11:38:02,2024-03-20,2,85,2024-03-20 11:39:08,2024-03-20 18:13:18,LXONE,546327413,2024-03-21 01:54:14.176700
1,30711706,33052260,llo229681872,90,BS17,94536,2024-03-20 05:53:37,2024-03-20,2,85,2024-03-20 05:54:13,2024-03-20 19:58:26,LXONE,546357022,2024-03-21 01:54:14.176700
2,30730649,33056533,llo229700285,90,BS17,90715,2024-03-20 15:22:37,2024-03-20,2,83,2024-03-20 15:22:47,2024-03-20 17:32:19,LXONE,546301836,2024-03-21 01:54:14.176700
3,30712103,33052611,llo229682417,90,BS15-S,165269,2024-03-20 05:55:07,2024-03-20,124,0,2024-03-20 05:56:00,2024-03-20 11:13:51,176551,546066479,2024-03-21 01:54:14.176700
4,30733401,33057203,llo229704047,90,BS17,2542,2024-03-20 16:52:31,2024-03-20,2,94,2024-03-20 16:52:38,2024-03-20 21:05:25,LXONE,546369019,2024-03-21 01:54:14.176700


In [18]:
import pandas as pd

# Assuming `df` is your DataFrame
for column in df_wa_kopf.columns:
    print(f"Distribution of unique values in column: {column}")
    print(df_wa_kopf[column].value_counts())  # Count occurrences of each unique value
    print("-" * 50)  # Separator for readability

Distribution of unique values in column: OUTNUM
30723507    1
31464006    1
31463357    1
31463848    1
31469393    1
           ..
22156164    1
22156163    1
22156162    1
22156161    1
35224902    1
Name: OUTNUM, Length: 19377913, dtype: int64
--------------------------------------------------
Distribution of unique values in column: DOCNUM
28377289    5002
28109999    3890
28055543    3847
28168002    3829
28062966    3718
            ... 
30642764       1
30642765       1
30642766       1
30642767       1
9998647        1
Name: DOCNUM, Length: 5970945, dtype: int64
--------------------------------------------------
Distribution of unique values in column: ORDNUM
llo231603066    20
llo231602055    20
llo231602278    20
llo231601659    20
llo231602085    20
                ..
llo216880728     1
llo216880727     1
llo216880726     1
llo216880725     1
llo95169276      1
Name: ORDNUM, Length: 19315767, dtype: int64
--------------------------------------------------
Distribution of uni

In [25]:
# Filter the DataFrame to keep only rows with status == 90 (abgeschlossene)
df_wa_kopf = df_wa_kopf[df_wa_kopf['STATUS'] == 90]

Merge Dataframes

In [None]:
# Step 1: Merge main DataFrame with customers
df_orders = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')

## 3. File Integration/Prep WA POS

In [None]:
### Consolidated
df_wa_positionen = pd.read_csv(
    '../Data/wa_positionen.csv',
    dtype={
        "OUTNUM": "int32",
        "OUTLIN": "int32",
        "STATUS": "category",
        "ITEM": "category",
        "LOT": "category",
        "QACODE": "category",
        "ORDQTY": "int32",
        "RELQTY": "int32",
        "FNDQTY": "int32",
        "CONQTY": "float32", # somehow it thinks these values are float data
        "SHPQTY": "int32",
        "USERID": "category",
        "TRNNUM": "int32",
    },
    low_memory=False,
    parse_dates=["CRTDAT", "TRNDAT"]
)

df_wa_positionen['CONQTY'] = df_wa_positionen['CONQTY'].astype(int)
## WRITE TO CSV todo

# :::

In [43]:
df_wa_positionen = pd.read_csv(
    '../Data/wa_positionen.csv',
    dtype={
        "OUTNUM": "int32",
        "OUTLIN": "int32",
        "STATUS": "category",
        "ITEM": "category",
        "LOT": "category",
        "QACODE": "category",
        "ORDQTY": "int32",
        "RELQTY": "int32",
        "FNDQTY": "int32",
        "CONQTY": "float32", # somehow it thinks these values are float data
        "SHPQTY": "int32",
        "USERID": "category",
        "TRNNUM": "int32",
    },
    low_memory=False,
    parse_dates=["CRTDAT", "TRNDAT"]
)

#change float inferred column to int
#df_wa_positionen['CONQTY'] = df_wa_positionen['CONQTY'].astype(int)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19375224 entries, 0 to 19375223
Data columns (total 17 columns):
 #   Column   Dtype         
---  ------   -----         
 0   OUTNUM   int32         
 1   OUTLIN   int32         
 2   ORDNUM   object        
 3   STATUS   category      
 4   ITEM     category      
 5   LOT      category      
 6   QACODE   category      
 7   ORDQTY   int32         
 8   RELQTY   int32         
 9   FNDQTY   int32         
 10  CONQTY   float32       
 11  SHPQTY   int32         
 12  CRTDAT   datetime64[ns]
 13  TRNDAT   datetime64[ns]
 14  USERID   category      
 15  TRNNUM   int64         
 16  LOADDAT  object        
dtypes: category(5), datetime64[ns](2), float32(1), int32(6), int64(1), object(2)
memory usage: 3.9 GB
None


In [None]:
#change float inferred column to int
df_wa_positionen['CONQTY'] = df_wa_positionen['CONQTY'].astype(int)

In [44]:
df_wa_positionen.info()
num_rows = len(df_wa_kopf)
print(f"Number of rows: {num_rows}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19375224 entries, 0 to 19375223
Data columns (total 17 columns):
 #   Column   Dtype         
---  ------   -----         
 0   OUTNUM   int32         
 1   OUTLIN   int32         
 2   ORDNUM   object        
 3   STATUS   category      
 4   ITEM     category      
 5   LOT      category      
 6   QACODE   category      
 7   ORDQTY   int32         
 8   RELQTY   int32         
 9   FNDQTY   int32         
 10  CONQTY   float32       
 11  SHPQTY   int32         
 12  CRTDAT   datetime64[ns]
 13  TRNDAT   datetime64[ns]
 14  USERID   category      
 15  TRNNUM   int64         
 16  LOADDAT  object        
dtypes: category(5), datetime64[ns](2), float32(1), int32(6), int64(1), object(2)
memory usage: 1.4+ GB
Number of rows: 19314965


## Not needed in the end - analysis of data in wa pos

In [32]:
unique_values_count = df_wa_positionen['CONQTY'].nunique()
print(f"Number of unique values in cleaned DST_LOT: {unique_values_count}")
unique_values = df_wa_positionen['CONQTY'].unique()
print(f"Cleaned unique values in DST_LOT: {unique_values}")

Number of unique values in cleaned DST_LOT: 1207
Cleaned unique values in DST_LOT: [   7.   83.   40. ...  647. 1474. 1404.]


In [49]:
#compare column confirmed / shipped qty to check if there is a difference
non_matching_count = (df_wa_positionen['RELQTY'] != df_wa_positionen['CONQTY']).sum()
print(f"Number of non-matching rows: {non_matching_count}")


Number of non-matching rows: 1923259


In [34]:
# Assuming `df` is your DataFrame
for column in df_wa_positionen.columns:
    print(f"Distribution of unique values in column: {column}")
    print(df_wa_positionen[column].value_counts())  # Count occurrences of each unique value
    print("-" * 50)  # Separator for readability


Distribution of unique values in column: OUTNUM
19772684    3
29880438    3
26748586    3
22482811    3
22482812    3
           ..
22274019    1
22274018    1
22274017    1
22274016    1
35235763    1
Name: OUTNUM, Length: 19373891, dtype: int64
--------------------------------------------------
Distribution of unique values in column: OUTLIN
1    19373862
2        1356
3           6
Name: OUTLIN, dtype: int64
--------------------------------------------------
Distribution of unique values in column: ORDNUM
llo232378131    25
llo231602020    21
llo231601773    20
llo231602073    20
llo231601659    20
                ..
llo217040598     1
llo217040596     1
llo217040592     1
llo217040591     1
llo236192316     1
Name: ORDNUM, Length: 19311781, dtype: int64
--------------------------------------------------
Distribution of unique values in column: STATUS
90    19312730
10       60216
20        1392
17         291
95         278
70         208
30          69
80          40
Name: STATUS,

In [45]:
# Define a function to determine the type of each entry
def get_dtype(value):
    return type(value).__name__

# Apply the function to the SRC_LOT column and count occurrences of each type
type_counts = df_wa_positionen['CONQTY'].apply(get_dtype).value_counts()

# Print the results
print("Data types in CONQTY column:")
print(type_counts)

Data types in CONQTY column:
float    19375224
Name: CONQTY, dtype: int64


In [36]:
# Specify the column to extract unique values from
column_name = 'CONQTY'

# Get unique values
unique_values = df_wa_positionen[column_name].unique()

# Convert to a DataFrame for saving
unique_values_df = pd.DataFrame(unique_values, columns=[column_name])

# Save to CSV
output_file = 'unique_values_wapositionen.csv'
unique_values_df.to_csv(output_file, index=False)

print(f"Unique values saved to {output_file}")

Unique values saved to unique_values_wapositionen.csv


In [40]:
#checking for NaN values in float column to change it to int after
# Check for NaN values
if df_wa_positionen['CONQTY'].isna().any():
    print(f"Column '{column_name}' contains NaN values.")
else:
    print(f"Column '{column_name}' does not contain any NaN values.")

Column 'CONQTY' does not contain any NaN values.


In [41]:
df_wa_positionen['CONQTY'] = df_wa_positionen['CONQTY'].astype(int)


In [46]:
df_wa_positionen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19375224 entries, 0 to 19375223
Data columns (total 17 columns):
 #   Column   Dtype         
---  ------   -----         
 0   OUTNUM   int32         
 1   OUTLIN   int32         
 2   ORDNUM   object        
 3   STATUS   category      
 4   ITEM     category      
 5   LOT      category      
 6   QACODE   category      
 7   ORDQTY   int32         
 8   RELQTY   int32         
 9   FNDQTY   int32         
 10  CONQTY   float32       
 11  SHPQTY   int32         
 12  CRTDAT   datetime64[ns]
 13  TRNDAT   datetime64[ns]
 14  USERID   category      
 15  TRNNUM   int64         
 16  LOADDAT  object        
dtypes: category(5), datetime64[ns](2), float32(1), int32(6), int64(1), object(2)
memory usage: 1.4+ GB
