load prepared parquet

In [None]:
import pandas as pd
# Read the previously prepared dataset back from its parquet file.
parquet_path = 'joined_data_v1.parquet'
df_loaded = pd.read_parquet(parquet_path)

# Show the first rows as a quick sanity check of the load.
first_rows = df_loaded.head()
print(first_rows)

understand structure

In [None]:
# Overview of columns and data types. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called without print() to
# avoid a stray "None" line after the report.
df_loaded.info()
print(df_loaded.describe())  # Summary statistics for numerical columns
print(df_loaded.head())  # Preview the first few rows


check missing data

In [None]:
# Tally the missing (NaN/None) entries of every column and print the totals.
missing_per_column = df_loaded.isna().sum()
print(missing_per_column)


visualize distributions

In [None]:
# Draw one histogram per numerical column on a 12x8 inch figure.
hist_figsize = (12, 8)
df_loaded.hist(figsize=hist_figsize)


identify duplicates

In [None]:
# Flag rows that repeat an earlier row in full, then print how many there are.
duplicate_row_count = df_loaded.duplicated().sum()
print(duplicate_row_count)


# Feature engineering -- categorical data needs to be encoded

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Feature-engineering pipeline: load the raw table, drop unwanted columns,
# encode the categorical columns, expand each date column into numeric parts,
# and standardize the numeric columns. The result is left in `df`; the fitted
# label encoders are kept in `label_encoders`.

# Load dataset
# NOTE(review): earlier cells read 'joined_data_v1.parquet' into `df_loaded`,
# while this cell reads a separate CSV -- confirm which input is intended.
file_path = "data.csv"  # Adjust to your actual file
df = pd.read_csv(file_path)

# Drop unnecessary columns
# The list still holds the original "" placeholder. errors="ignore" keeps the
# cell from raising KeyError when a listed column is not present in the data.
columns_to_drop = [""]
df.drop(columns=columns_to_drop, inplace=True, errors="ignore")

# Encode categorical columns
categorical_cols = [
    "STATUS_x", "SRC_QACODE", "DST_QACODE", "SRC_WA", "DST_WA", "USERID_x", "STATUS_y", "PICCOD", "SHPTYP", "USERID_y"
]

# Low cardinality: Use One-Hot Encoding
# Columns with fewer than `max_one_hot_levels` distinct values are expanded
# into indicator columns; drop_first=True omits the first level of each.
max_one_hot_levels = 10
low_cardinality_cols = [
    col for col in categorical_cols if df[col].nunique() < max_one_hot_levels
]
df = pd.get_dummies(df, columns=low_cardinality_cols, drop_first=True)

# High cardinality: Use Label Encoding
# Values are cast to str first, so missing values are encoded as their own
# string category rather than raising on mixed types.
high_cardinality_cols = [col for col in categorical_cols if col not in low_cardinality_cols]
label_encoders = {}
for col in high_cardinality_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save encoder for future use

# Handle date columns
# Each date column is parsed (unparseable values become NaT), split into
# year/month/day/weekday/hour features, and then removed from the frame.
date_cols = ["CRTDAT_x", "TRNDAT_x", "LOADDAT_x", "CRTDAT_y", "TRNDAT_y", "LOADDAT_y", "CRTDAT", "TRNDAT", "LOADDAT"]
for col in date_cols:
    parsed = pd.to_datetime(df[col], errors="coerce")  # Convert to datetime
    df[f"{col}_year"] = parsed.dt.year
    df[f"{col}_month"] = parsed.dt.month
    df[f"{col}_day"] = parsed.dt.day
    df[f"{col}_weekday"] = parsed.dt.weekday
    df[f"{col}_hour"] = parsed.dt.hour
    df.drop(columns=[col], inplace=True)  # Drop original date column after feature extraction

# Normalize numerical columns if needed
numerical_cols = [
    "WORNUM", "OUTNUM", "LISNUM", "SUMLIS", "TRNNUM_x", "TRNNUM_y", "OUTLIN", "ORDQTY", "RELQTY", "FNDQTY", "CONQTY", "SHPQTY"
]
df[numerical_cols] = df[numerical_cols].fillna(0)  # Fill missing values

# Standardize to zero mean / unit variance. A constant column has std 0 and a
# single-row frame has std NaN; dividing by either would turn the whole column
# into NaN, so those stds are replaced by 1 and the column becomes all zeros.
col_means = df[numerical_cols].mean()
col_stds = df[numerical_cols].std()
safe_stds = col_stds.replace(0, 1).fillna(1)
df[numerical_cols] = (df[numerical_cols] - col_means) / safe_stds



In [None]:
# Persist the engineered DataFrame as parquet, leaving out the row index.
processed_file = "processed_data.parquet"
df.to_parquet(processed_file, index=False)

# Confirm where the output was written.
saved_message = f"Processed data saved to {processed_file}"
print(saved_message)