In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import warnings
import os
warnings.filterwarnings('ignore')

def create_directories():
    """
    Ensure the project's working folders exist.

    Creates the raw-data, processed-data, models and results folders
    (paths are relative to the current working directory) and prints one
    status line per folder. Folders that already exist are left alone.
    """
    for directory in ('../data/raw', '../data/processed', '../models', '../results'):
        # exist_ok=True makes the call a no-op when the folder is present
        os.makedirs(directory, exist_ok=True)
        print(f"Directory '{directory}' created or already exists")

def load_and_clean_data(filepath='../data/raw/freight_log.csv'):
    """
    Load the freight log CSV and perform initial cleaning.

    Cleaning steps:
      * strip surrounding whitespace from column names and replace inner
        spaces with underscores;
      * parse the three date columns into datetimes;
      * reduce ``Shipment_ID`` to the first run of digits it contains and
        store it as an integer.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    Exception
        Any error raised while reading or cleaning is printed and then
        re-raised unchanged (e.g. a missing file, a missing column, or a
        ``Shipment_ID`` without digits).
    """
    try:
        # Read the CSV data
        df = pd.read_csv(filepath)

        # Clean column names
        df.columns = df.columns.str.strip().str.replace(' ', '_')

        # Convert date columns to datetime
        date_columns = ['Shipment_Date', 'Planned_Delivery_Date', 'Actual_Delivery_Date']
        for col in date_columns:
            df[col] = pd.to_datetime(df[col])

        # Extract numeric part from Shipment_ID and convert to integer.
        # Raw string: '\d' in a normal string literal is an invalid escape.
        # expand=False returns a Series rather than a one-column DataFrame.
        df['Shipment_ID'] = df['Shipment_ID'].str.extract(r'(\d+)', expand=False).astype(int)

        print(f"Data loaded successfully with shape: {df.shape}")
        return df

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise

def create_features(df):
    """
    Create new features from existing data.

    The columns are added to ``df`` in place and the same frame is returned.

    Added columns:
      * ``Planned_Delivery_Days`` / ``Actual_Delivery_Days``: whole days from
        shipment to planned / actual delivery; ``Delay_Days`` is their
        difference (actual minus planned).
      * ``Shipment_Month``, ``Shipment_Day_Of_Week``, ``Is_Weekend``,
        ``Shipment_Quarter``: calendar features of the shipment date.
      * ``Distance_Category``: quintile bucket of ``Distance_(km)``;
        ``Is_Long_Distance``: 1 when the distance is above the median.
      * ``Avg_Speed``: distance divided by actual delivery days. Shipments
        delivered in zero days get NaN instead of an infinite speed.
      * ``Is_Bad_Weather`` / ``Is_Heavy_Traffic``: 0/1 flags.

    The ``Delayed`` target is converted from ``'Yes'``/other to 1/0. If the
    column is already numeric (for example when the function is re-run on
    its own output) it is left unchanged rather than being reset to zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with datetime ``Shipment_Date``, ``Planned_Delivery_Date`` and
        ``Actual_Delivery_Date`` columns plus ``Distance_(km)``,
        ``Weather_Conditions``, ``Traffic_Conditions`` and ``Delayed``.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the new columns added.
    """
    # Time-based features
    df['Planned_Delivery_Days'] = (df['Planned_Delivery_Date'] - df['Shipment_Date']).dt.days
    df['Actual_Delivery_Days'] = (df['Actual_Delivery_Date'] - df['Shipment_Date']).dt.days
    df['Delay_Days'] = df['Actual_Delivery_Days'] - df['Planned_Delivery_Days']

    # Date features
    df['Shipment_Month'] = df['Shipment_Date'].dt.month
    df['Shipment_Day_Of_Week'] = df['Shipment_Date'].dt.dayofweek
    df['Is_Weekend'] = df['Shipment_Day_Of_Week'].isin([5, 6]).astype(int)
    df['Shipment_Quarter'] = df['Shipment_Date'].dt.quarter

    # Distance-based features
    df['Distance_Category'] = pd.qcut(df['Distance_(km)'], q=5,
                                    labels=['Very_Short', 'Short', 'Medium', 'Long', 'Very_Long'])
    # Mask zero-day deliveries so the division yields NaN rather than inf;
    # infinite values are rejected by scikit-learn's input validation.
    nonzero_days = df['Actual_Delivery_Days'].where(df['Actual_Delivery_Days'] != 0)
    df['Avg_Speed'] = df['Distance_(km)'] / nonzero_days
    df['Is_Long_Distance'] = (df['Distance_(km)'] > df['Distance_(km)'].median()).astype(int)

    # Weather and traffic impact
    df['Is_Bad_Weather'] = df['Weather_Conditions'].isin(['Rain', 'Storm', 'Fog']).astype(int)
    df['Is_Heavy_Traffic'] = (df['Traffic_Conditions'] == 'Heavy').astype(int)

    # Convert target variable to numeric. Skip when it is already numeric:
    # comparing numbers with 'Yes' would silently turn every label into 0.
    if not pd.api.types.is_numeric_dtype(df['Delayed']):
        df['Delayed'] = (df['Delayed'] == 'Yes').astype(int)

    print("\nCreated new features:")
    print("Time-based:", ['Planned_Delivery_Days', 'Actual_Delivery_Days', 'Delay_Days'])
    print("Date-based:", ['Shipment_Month', 'Shipment_Day_Of_Week', 'Is_Weekend', 'Shipment_Quarter'])
    print("Distance-based:", ['Distance_Category', 'Avg_Speed', 'Is_Long_Distance'])
    print("Weather/Traffic:", ['Is_Bad_Weather', 'Is_Heavy_Traffic'])

    return df

def encode_categorical_variables(df):
    """
    Replace the categorical columns with integer codes.

    A separate LabelEncoder is fitted for each column; the column in ``df``
    is overwritten with the encoded values and the class-to-code mapping is
    printed.

    Returns a tuple ``(df, label_encoders)`` where ``label_encoders`` maps
    each encoded column name to its fitted encoder.
    """
    # Columns to encode
    categorical_columns = (
        'Origin',
        'Destination',
        'Vehicle_Type',
        'Weather_Conditions',
        'Traffic_Conditions',
        'Distance_Category',
    )

    # One fitted encoder per column, keyed by column name
    label_encoders = {}

    for col in categorical_columns:
        encoder = LabelEncoder()
        df[col] = encoder.fit_transform(df[col])
        label_encoders[col] = encoder
        print(f"\nEncoded {col} with {len(label_encoders[col].classes_)} unique values")

        # Pair every known class with the code the encoder assigns to it
        known_classes = encoder.classes_
        mapping = dict(zip(known_classes, encoder.transform(known_classes)))
        print(f"Mapping: {mapping}")

    return df, label_encoders

def prepare_features_for_modeling(df):
    """
    Prepare final feature set for modeling.

    Selects the modeling columns, makes a stratified 80/20 train/test split
    (``random_state=42``) and standardises the ``float64``/``int64`` feature
    columns. The scaler is fitted on the training rows only and then applied
    to the test rows, so no test-set statistics influence the training data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column in the feature list below plus the
        ``Delayed`` target. It is not modified.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, feature_columns, scaler)`` where
        ``feature_columns`` is the list of selected column names and
        ``scaler`` is the StandardScaler fitted on the training split.
    """
    # Select features for modeling
    # NOTE(review): 'Actual_Delivery_Days', 'Delay_Days' and 'Avg_Speed' are
    # derived from Actual_Delivery_Date in create_features - confirm they are
    # known at prediction time, otherwise they leak the target.
    feature_columns = [
        'Distance_(km)', 'Distance_Category',
        'Shipment_Month', 'Shipment_Day_Of_Week',
        'Shipment_Quarter', 'Is_Weekend',
        'Planned_Delivery_Days', 'Actual_Delivery_Days',
        'Delay_Days', 'Avg_Speed', 'Is_Long_Distance',
        'Is_Bad_Weather', 'Is_Heavy_Traffic',
        'Origin', 'Destination', 'Vehicle_Type',
        'Weather_Conditions', 'Traffic_Conditions'
    ]

    # Split features and target
    X = df[feature_columns]
    y = df['Delayed']

    # Split first so the scaler below never sees the test rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Work on explicit copies: assigning scaled values into a slice of df
    # is a chained assignment (SettingWithCopyWarning).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Scale numerical features using statistics of the training split only
    # NOTE(review): columns with other integer widths (e.g. int32) are not
    # selected by this dtype filter and stay unscaled - confirm intended.
    scaler = StandardScaler()
    numerical_features = X_train.select_dtypes(include=['float64', 'int64']).columns
    X_train[numerical_features] = scaler.fit_transform(X_train[numerical_features])
    X_test[numerical_features] = scaler.transform(X_test[numerical_features])

    print(f"\nData split complete:")
    print(f"Training set size: {len(X_train)}")
    print(f"Testing set size: {len(X_test)}")

    return X_train, X_test, y_train, y_test, feature_columns, scaler

def save_processed_data(df, X_train, X_test, y_train, y_test, label_encoders, feature_columns, scaler):
    """
    Save all processed data and metadata.

    Writes, relative to the current working directory:
      * ``../data/processed``: the full processed frame and the four
        train/test split CSVs (without the index);
      * ``../models``: the label encoders and the scaler (joblib pickles)
        and the feature column names, one per line, in
        ``feature_columns.txt``.

    Both output folders are created if they are missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Full processed dataset.
    X_train, X_test, y_train, y_test
        Train/test splits; each must provide ``to_csv``.
    label_encoders
        Object holding the fitted encoders, dumped with joblib.
    feature_columns : iterable of str
        Names of the model features.
    scaler
        Fitted scaler, dumped with joblib.
    """
    import joblib

    processed_dir = '../data/processed'
    models_dir = '../models'

    # Make the function usable even when the folders were not created first
    for directory in (processed_dir, models_dir):
        os.makedirs(directory, exist_ok=True)

    # Save processed data
    df.to_csv(f'{processed_dir}/processed_data.csv', index=False)
    X_train.to_csv(f'{processed_dir}/X_train.csv', index=False)
    X_test.to_csv(f'{processed_dir}/X_test.csv', index=False)
    y_train.to_csv(f'{processed_dir}/y_train.csv', index=False)
    y_test.to_csv(f'{processed_dir}/y_test.csv', index=False)

    # Save encoders and scaler
    joblib.dump(label_encoders, f'{models_dir}/label_encoders.pkl')
    joblib.dump(scaler, f'{models_dir}/scaler.pkl')

    # Save feature columns next to the other model artefacts. This used to
    # target 'models/' (no '../'), a different folder from the pickles.
    with open(f'{models_dir}/feature_columns.txt', 'w') as f:
        f.write('\n'.join(feature_columns))

    print("\nFiles saved successfully:")
    print("- Processed data: data/processed/processed_data.csv")
    print("- Train/test splits: data/processed/X_train.csv, X_test.csv, y_train.csv, y_test.csv")
    print("- Encoders and scaler: models/label_encoders.pkl, models/scaler.pkl")
    print("- Feature columns: models/feature_columns.txt")

def analyze_data(df):
    """
    Print a short exploratory summary of the dataset.

    The report contains the ``describe()`` statistics, the share of rows
    whose ``Delayed`` value equals 1, and the relative frequency of each
    value in the vehicle type, weather and traffic columns. Nothing is
    returned and ``df`` is not modified.
    """
    print("\nData Analysis:")

    print("\nBasic Statistics:")
    summary = df.describe()
    print(summary)

    print("\nDelay Analysis:")
    delay_rate = (df['Delayed'] == 1).mean()
    print("Delay Rate:", delay_rate)

    # Heading / column pairs for the three frequency tables, in print order
    distribution_sections = (
        ("\nVehicle Type Distribution:", 'Vehicle_Type'),
        ("\nWeather Conditions Distribution:", 'Weather_Conditions'),
        ("\nTraffic Conditions Distribution:", 'Traffic_Conditions'),
    )
    for heading, column in distribution_sections:
        print(heading)
        print(df[column].value_counts(normalize=True))

def main():
    """
    Run the preprocessing pipeline end to end.

    Steps, in order: create folders, load and clean the raw data, engineer
    features, print the analysis report, encode categorical columns, build
    the scaled train/test split, and save every artefact.

    Returns the tuple ``(df, X_train, X_test, y_train, y_test,
    feature_columns, label_encoders, scaler)``.
    """
    print("Starting complete data preprocessing pipeline...")

    # Folders must exist before anything is read or written
    create_directories()

    # Stage 1: load and clean
    frame = load_and_clean_data()
    print("Data loaded and cleaned successfully")

    # Stage 2: feature engineering
    frame = create_features(frame)
    print("Feature engineering complete")

    # Stage 3: report on the engineered (not yet encoded) data
    analyze_data(frame)

    # Stage 4: categorical encoding
    frame, encoders = encode_categorical_variables(frame)
    print("Categorical encoding complete")

    # Stage 5: split and scale
    prepared = prepare_features_for_modeling(frame)
    train_X, test_X, train_y, test_y, columns, fitted_scaler = prepared
    print("Feature preparation complete")

    # Stage 6: persist everything
    save_processed_data(
        frame, train_X, test_X, train_y, test_y,
        encoders, columns, fitted_scaler,
    )
    print("\nPreprocessing pipeline completed successfully!")

    return frame, train_X, test_X, train_y, test_y, columns, encoders, fitted_scaler

# Run the whole pipeline when this module is executed directly. The results
# are bound to module-level names so they stay available after main() returns.
if __name__ == "__main__":
    df, X_train, X_test, y_train, y_test, feature_columns, label_encoders, scaler = main()

Starting complete data preprocessing pipeline...
Directory '../data/raw' created or already exists
Directory '../data/processed' created or already exists
Directory '../models' created or already exists
Directory '../results' created or already exists
Data loaded successfully with shape: (20000, 11)
Data loaded and cleaned successfully

Created new features:
Time-based: ['Planned_Delivery_Days', 'Actual_Delivery_Days', 'Delay_Days']
Date-based: ['Shipment_Month', 'Shipment_Day_Of_Week', 'Is_Weekend', 'Shipment_Quarter']
Distance-based: ['Distance_Category', 'Avg_Speed', 'Is_Long_Distance']
Weather/Traffic: ['Is_Bad_Weather', 'Is_Heavy_Traffic']
Feature engineering complete

Data Analysis:

Basic Statistics:
        Shipment_ID  Distance_(km)       Delayed  Planned_Delivery_Days  \
count  20000.000000   20000.000000  20000.000000           20000.000000   
mean    9999.500000    1101.663750      0.737850               3.269400   
std     5773.647028     520.717873      0.439815          