### All copied from final_dataprep_v1.ipynb to testrun fully and merge the df on FK and PKs for first usecase

# Bewegungen.csv prep

In [7]:
# consolidated :::
import pandas as pd
from idna.idnadata import joining_types

# List of columns to keep (ran into memory issues...)
columns_to_keep = [
    "WORNUM", "STATUS", "SRC_ITEM", "SRC_LOT", "SRC_QACODE", "DST_LOT",
    "DST_QACODE", "SRC_WA", "SRC_X", "SRC_Y", "SRC_Z", "DST_WA", "OUTNUM",
    "LISNUM", "SUMLIS", "TRNNUM", "CRTDAT", "TRNDAT", "USERID", "LOADDAT"
]

dtypes = {
    "WORNUM": "int32",
    "STATUS": "int32",
    "MOVTYP": "category",
    "MOVKEY": "category",
    "SRC_ITEM": "category",
    "SRC_LOT": "string",
    "SRC_QACODE": "category",
    "DST_LOT": "string",
    "DST_QACODE": "category",
    "SRC_WA": "category",
    "SRC_X": "category",
    "SRC_Y": "category",
    "SRC_Z": "category",
    "DST_WA": "category",
    "CONQTY": "int32",
    "OUTNUM": "int32",
    "RELNUM": "int32",
    "LISNUM": "int32",
    "SUMLIS": "int32",
    "TRNNUM": "int32",
    "USERID": "category",
}

# Load the CSV with optimized settings, and only load necessary cols
df_bewegungen = pd.read_csv(
    '../Data/bewegungen.csv',
    usecols=columns_to_keep,  # Only load the specified columns
    dtype=dtypes,
    #usecols=columns_to_use,  # Only load required columns
    #dtype=dtypes,            # Use optimized data types
    parse_dates=["CRTDAT", "TRNDAT", "LOADDAT"],
    low_memory=False
)

# Clean SRC_LOT column (Column "Artikelcharge", this LOT usually is a 3 digit int. An Article can have multiple LOTs. I simplify by removing leading zeros and clean up the column from wrong manual usererrors.
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['SRC_LOT'] = df_bewegungen['SRC_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean DST_LOT column
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].fillna('').astype(str)  # Handle NaNs and convert to string
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: '0' if x == '000' else x.lstrip('0'))  # Remove leading zeros
df_bewegungen['DST_LOT'] = df_bewegungen['DST_LOT'].apply(lambda x: x if x.isdigit() and len(x) <= 3 else None)  # Validate format

# Clean STATUS column (10= Offen, 50= Bestätigt, 95= Storniert -- for my purposes I'm only interested in "Bestätigt" rows.
# Filter the DataFrame to keep only rows with status == 50 (abgeschlossen)
df = df_bewegungen[df_bewegungen['STATUS'] == 50]

# Remove rows where USER == 'LXONE'
df = df[df['USERID'] != 'LXONE']

df_bewegungen = df_bewegungen.astype({
    "SRC_LOT": "category",
    "DST_LOT": "category",
})
# :::

In [6]:
df_bewegungen.head()

Unnamed: 0,WORNUM,STATUS,SRC_ITEM,SRC_LOT,SRC_QACODE,DST_LOT,DST_QACODE,SRC_WA,SRC_X,SRC_Y,SRC_Z,DST_WA,LISNUM,SUMLIS,TRNNUM,CRTDAT,TRNDAT,USERID,LOADDAT
0,289678987,50,36529803,1,H,1,H,EG,1601,2,D15,WA,289679194,289679311,599324284,2024-04-25 01:04:53,2024-04-25 08:16:39,24769,2024-04-26 00:52:47.247838
1,289870561,50,44118619,1,H,1,H,EG,1780,2,E03,WA,289874022,289874463,599670338,2024-04-25 17:34:53,2024-04-25 19:29:47,178141,2024-04-26 00:52:47.247838
2,289833599,50,13091411,1,H,1,H,EG,2215,4,F13,WA,289835037,289835121,599596179,2024-04-25 15:24:37,2024-04-25 16:42:37,LAESSIG,2024-04-26 00:52:47.247838
3,289815259,50,9636829,0,H,0,H,EG,2401,2,C03,WA,289815703,289815805,599541258,2024-04-25 14:34:37,2024-04-25 14:56:15,GUERBUEZ,2024-04-26 00:52:47.247838
4,289839097,50,16359571,1,H,1,H,EG,2004,7,G05,WA,289840157,289725531,599576127,2024-04-25 15:39:56,2024-04-25 16:03:15,104044,2024-04-26 00:52:47.247838


# WA kopf.csv prep

In [2]:
#Consolidated:::
df_wa_kopf = pd.read_csv(
    '../Data/wa_kopf.csv',
    dtype={
        "OUTNUM": "int32",
        "DOCNUM": "category",
        "ORDNUM": "category",
        "STATUS": "int32",
        "CUSNUM": "category",
        "SHPTYP": "category",
        "TOUR": "category",
    },
    low_memory=False,
    parse_dates=["ORDDAT", "DLVDAT", "CRTDAT", "TRNDAT"]
)
# Filter the DataFrame to keep only rows with status == 90 (abgeschlossene)
df_wa_kopf = df_wa_kopf[df_wa_kopf['STATUS'] == 90]

# Step 1: Merge main DataFrame with customers
# df_orders = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')

#:::

# WA Positionen prep

In [3]:
### Consolidated
df_wa_positionen = pd.read_csv(
    '../Data/wa_positionen.csv',
    dtype={
        "OUTNUM": "int32",
        "OUTLIN": "int32",
        "STATUS": "category",
        "ITEM": "category",
        "LOT": "category",
        "QACODE": "category",
        "ORDQTY": "int32",
        "RELQTY": "int32",
        "FNDQTY": "int32",
        "CONQTY": "float32", # somehow it thinks these values are float data
        "SHPQTY": "int32",
        "USERID": "category",
        "TRNNUM": "int32",
    },
    low_memory=False,
    parse_dates=["CRTDAT", "TRNDAT"]
)

df_wa_positionen['CONQTY'] = df_wa_positionen['CONQTY'].astype(int)
## WRITE TO CSV todo

# :::

# Joins

In [None]:

# = pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')


In [8]:
df_full = (
    pd.merge(df_bewegungen, df_wa_kopf, on='OUTNUM', how='inner')
    .merge(df_wa_positionen, on='OUTNUM', how='inner')
)

print(df_full.head())


      WORNUM  STATUS_x  SRC_ITEM SRC_LOT SRC_QACODE DST_LOT DST_QACODE SRC_WA  \
0  289678987        50  36529803       1          H       1          H     EG   
1  289870561        50  44118619       1          H       1          H     EG   
2  289833599        50  13091411       1          H       1          H     EG   
3  289815259        50   9636829       0          H       0          H     EG   
4  289839097        50  16359571       1          H       1          H     EG   

  SRC_X SRC_Y  ... ORDQTY RELQTY  FNDQTY  CONQTY  SHPQTY              CRTDAT  \
0  1601    02  ...      9      9       9       9       9 2024-04-24 08:39:31   
1  1780    02  ...      2      2       2       2       2 2024-04-24 12:43:27   
2  2215    04  ...      1      1       1       1       1 2024-04-24 15:07:52   
3  2401    02  ...      1      1       1       1       1 2024-04-25 06:57:41   
4  2004    07  ...      1      1       1       1       1 2024-04-25 15:38:32   

               TRNDAT USERID    

In [15]:
df_full.head()
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19628991 entries, 0 to 19628990
Data columns (total 50 columns):
 #   Column      Dtype         
---  ------      -----         
 0   WORNUM      int32         
 1   STATUS_x    int32         
 2   SRC_ITEM    category      
 3   SRC_LOT     category      
 4   SRC_QACODE  category      
 5   DST_LOT     category      
 6   DST_QACODE  category      
 7   SRC_WA      category      
 8   SRC_X       category      
 9   SRC_Y       category      
 10  SRC_Z       category      
 11  DST_WA      category      
 12  OUTNUM      int32         
 13  LISNUM      int32         
 14  SUMLIS      int32         
 15  TRNNUM_x    int32         
 16  CRTDAT_x    datetime64[ns]
 17  TRNDAT_x    datetime64[ns]
 18  USERID_x    category      
 19  LOADDAT_x   datetime64[ns]
 20  DOCNUM      category      
 21  ORDNUM_x    category      
 22  STATUS_y    int32         
 23  PICCOD      object        
 24  CUSNUM      category      
 25  ORDDAT      date

In [14]:
num_rows = len(df_full)
print(f"Number of rows: {num_rows}")

Number of rows: 19628991


In [12]:
import pyarrow

# Save the fully joined DataFrame to a Parquet file for better performance
output_file = 'joined_data_v1.parquet'
df_full.to_parquet(output_file, index=False)

print(f"Fully joined DataFrame saved to {output_file}")

Fully joined DataFrame saved to joined_data_v1.parquet


In [None]:
# reading parquet file later

df_loaded = pd.read_parquet('joined_data.parquet')
print(df_loaded.head())


##TEMP
