In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
file = r'D:\Projects\PowerBi Projects\Uber\ncr_ride_bookings.csv'
df=pd.read_csv(file)
print("======="*15)
print(df.columns)
print("======="*15)
df.head()
print("======="*15)
df.info()
print("======="*15)
df.describe()

Index(['Date', 'Time', 'Booking ID', 'Booking Status', 'Customer ID',
       'Vehicle Type', 'Pickup Location', 'Drop Location', 'Avg VTAT',
       'Avg CTAT', 'Cancelled Rides by Customer',
       'Reason for cancelling by Customer', 'Cancelled Rides by Driver',
       'Driver Cancellation Reason', 'Incomplete Rides',
       'Incomplete Rides Reason', 'Booking Value', 'Ride Distance',
       'Driver Ratings', 'Customer Rating', 'Payment Method'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 21 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Date                               150000 non-null  object 
 1   Time                               150000 non-null  object 
 2   Booking ID                         150000 non-null  object 
 3   Booking Status                     150000 non-null  object 
 4   Customer ID         

Unnamed: 0,Avg VTAT,Avg CTAT,Cancelled Rides by Customer,Cancelled Rides by Driver,Incomplete Rides,Booking Value,Ride Distance,Driver Ratings,Customer Rating
count,139500.0,102000.0,10500.0,27000.0,9000.0,102000.0,102000.0,93000.0,93000.0
mean,8.456352,29.149636,1.0,1.0,1.0,508.295912,24.637012,4.230992,4.404584
std,3.773564,8.902577,0.0,0.0,0.0,395.805774,14.002138,0.436871,0.437819
min,2.0,10.0,1.0,1.0,1.0,50.0,1.0,3.0,3.0
25%,5.3,21.6,1.0,1.0,1.0,234.0,12.46,4.1,4.2
50%,8.3,28.8,1.0,1.0,1.0,414.0,23.72,4.3,4.5
75%,11.3,36.8,1.0,1.0,1.0,689.0,36.82,4.6,4.8
max,20.0,45.0,1.0,1.0,1.0,4277.0,50.0,5.0,5.0


In [31]:
print("======="*15)
print("2. Initial Column Name Clean-up")
print("======="*15)

df.columns = df.columns.str.replace(' ', '_').str.replace('[^A-Za-z0-9_]+', '', regex=True).str.lower()
print(f"Initial columns: {df.columns.tolist()}")

2. Initial Column Name Clean-up
Initial columns: ['date', 'time', 'booking_id', 'booking_status', 'customer_id', 'vehicle_type', 'pickup_location', 'drop_location', 'avg_vtat', 'avg_ctat', 'cancelled_rides_by_customer', 'reason_for_cancelling_by_customer', 'cancelled_rides_by_driver', 'driver_cancellation_reason', 'incomplete_rides', 'incomplete_rides_reason', 'booking_value', 'ride_distance', 'driver_ratings', 'customer_rating', 'payment_method']


In [33]:
print("======="*15)
print("3. Data Type Conversions and Timestamp Creation")
print("======="*15)

try:
    df['ride_timestamp'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')
    # Drop the original separate columns
    df = df.drop(columns=['date', 'time'])
    print("Created 'ride_timestamp' column and dropped 'date' and 'time'.")
except Exception as e:
    print(f"Error creating ride_timestamp: {e}")

3. Data Type Conversions and Timestamp Creation
Created 'ride_timestamp' column and dropped 'date' and 'time'.


In [35]:
print("======="*15)
print("4. Categorical Cleanup and Standardization")
print("======="*15)

# List of categorical columns to standardize
categorical_cols = ['booking_id', 'booking_status', 'customer_id', 'vehicle_type', 
                    'pickup_location', 'drop_location', 'payment_method']

for col in categorical_cols:
    if col in df.columns:
        # Strip leading/trailing whitespace and standardize casing
        df[col] = df[col].astype(str).str.strip().str.title()
        
        # Specific cleanup for Booking ID which appears to have extra quotes in the snippet
        if col == 'booking_id':
            # Remove leading/trailing quotes that might have been missed by the parser
            df[col] = df[col].str.replace('"', '', regex=False).str.strip()

print("Standardized categorical columns (stripped whitespace and title-cased).")

4. Categorical Cleanup and Standardization
Standardized categorical columns (stripped whitespace and title-cased).


In [41]:
print("======="*15)
print("5.Missing Numerical Value Imputation Based on Booking Status")
print("======="*15)

# Define numerical columns that should be zero if the ride was not 'Completed'
value_cols_to_impute = [
    'avg_vtat', 
    'avg_ctat', 
    'booking_value', 
    'ride_distance', 
    'driver_ratings', 
    'customer_rating'
]

# Define columns related to cancellation/incompleteness that should be 0 instead of NaN/NULL
flag_cols_to_zero = [
    'cancelled_rides_by_customer',
    'cancelled_rides_by_driver',
    'incomplete_rides'
]

# A. Impute core metrics to 0 if the ride was not completed
# 'No Driver Found', 'Cancelled', 'Incomplete' rides have no meaningful value, distance, or ratings.
for col in value_cols_to_impute:
    if col in df.columns:
        # Check if the booking status is NOT 'Completed'
        mask = df['booking_status'] != 'Completed'
        # Fill NaN values in the specified column with 0 where the mask is True
        df.loc[mask, col] = df.loc[mask, col].fillna(0)
        # For the remaining NaNs (where status IS 'Completed'), we leave them as NaN 
        # as they represent genuinely missing data that may be imputed later (if needed) 
        # or handled by the BI tool.

# B. Impute cancellation/incompletion flags to 0 where they are NaN
for col in flag_cols_to_zero:
    if col in df.columns:
        # A NaN in these columns almost certainly means 'No' or 0 occurrences
        df[col] = df[col].fillna(0)

5.Missing Numerical Value Imputation Based on Booking Status


In [42]:
print("======="*15)
print("6. Final Data Type Enforcement")
print("======="*15)

# Convert imputed numerical columns to appropriate types
for col in value_cols_to_impute + flag_cols_to_zero:
    if col in df.columns:
        # Use 'Int64' (Pandas integer with support for NaN) or standard float
        if col in flag_cols_to_zero:
            df[col] = df[col].astype('Int64') # Handles 1, 0, and NaNs if any remain
        else:
            df[col] = df[col].astype(float) # Ensure all other metrics are floats for precision

print("Completed conditional imputation and final type enforcement.")


6. Final Data Type Enforcement
Completed conditional imputation and final type enforcement.


In [43]:
print("======="*15)
print("7. Summary and Output")
print("======="*15)

print("\n--- Cleaned DataFrame Summary ---")
print(df.info())
print("\n--- Cleaned Data Head (First 5 Rows) ---")
print(df.head())

7. Summary and Output

--- Cleaned DataFrame Summary ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 20 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   booking_id                         150000 non-null  object        
 1   booking_status                     150000 non-null  object        
 2   customer_id                        150000 non-null  object        
 3   vehicle_type                       150000 non-null  object        
 4   pickup_location                    150000 non-null  object        
 5   drop_location                      150000 non-null  object        
 6   avg_vtat                           150000 non-null  float64       
 7   avg_ctat                           150000 non-null  float64       
 8   cancelled_rides_by_customer        150000 non-null  Int64         
 9   reason_for_cancelling_by_customer  

In [44]:
print("======="*15)
print("======="*15)
print("Creating new features based on existing data")
print("======="*15)
print("======="*15)

Creating new features based on existing data


In [46]:
print("======="*15)
print("PART-B 1. Load Data")
print("======="*15)

try:
    # Use the filename provided in the context
    df = pd.read_csv("ncr_ride_bookings.csv", sep=',', quotechar='"', engine='python')
    print("Successfully loaded ncr_ride_bookings.csv.")
except FileNotFoundError:
    print("Error: The file 'ncr_ride_bookings.csv' was not found.")
    df = pd.DataFrame()
    exit()

PART-B 1. Load Data
Successfully loaded ncr_ride_bookings.csv.


In [48]:
print("======="*15)
print("PART-B 2. Initial Column Name Cleanup (Required for consistency)")
print("======="*15)

df.columns = df.columns.str.replace(' ', '_').str.replace('[^A-Za-z0-9_]+', '', regex=True).str.lower()

PART-B 2. Initial Column Name Cleanup (Required for consistency)


In [52]:
print("======="*15)
print("PART-B 3. Data Type Conversions and Timestamp Creation")
print("======="*15)

# Combine 'date' and 'time' into a single 'ride_timestamp' column
try:
    df['ride_timestamp'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')
    df = df.drop(columns=['date', 'time'])
except Exception as e:
    print(f"Error creating ride_timestamp: {e}")

PART-B 3. Data Type Conversions and Timestamp Creation
Error creating ride_timestamp: 'date'


In [54]:
print("======="*15)
print("PART-B 4. Categorical Cleanup and Standardization")
print("======="*15)

categorical_cols = ['booking_id', 'booking_status', 'customer_id', 'vehicle_type', 
                    'pickup_location', 'drop_location', 'payment_method']

for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().str.title()
        if col == 'booking_id':
            df[col] = df[col].str.replace('"', '', regex=False).str.strip()
            
print("Standardized categorical columns (stripped whitespace and title-cased).")

PART-B 4. Categorical Cleanup and Standardization
Standardized categorical columns (stripped whitespace and title-cased).


In [56]:
print("======="*15)
print("PART-B 5. Missing Numerical Value Imputation (Replicating Cleaning Logic)")
print("======="*15)

value_cols_to_impute = [
    'avg_vtat', 
    'avg_ctat', 
    'booking_value', 
    'ride_distance', 
    'driver_ratings', 
    'customer_rating'
]
flag_cols_to_zero = [
    'cancelled_rides_by_customer',
    'cancelled_rides_by_driver',
    'incomplete_rides'
]

# Impute core metrics to 0 if the ride was NOT 'Completed'
mask_not_completed = df['booking_status'] != 'Completed'
for col in value_cols_to_impute:
    if col in df.columns:
        df.loc[mask_not_completed, col] = df.loc[mask_not_completed, col].fillna(0)
        df[col] = df[col].astype(float) # Final type enforcement

# Impute cancellation/incompletion flags to 0 where they are NaN
for col in flag_cols_to_zero:
    if col in df.columns:
        df[col] = df[col].fillna(0).astype('Int64')
        
print("Completed conditional imputation and final type enforcement.")

PART-B 5. Missing Numerical Value Imputation (Replicating Cleaning Logic)
Completed conditional imputation and final type enforcement.


In [61]:
# 6.1. Cancellation/Incompletion Flags (Binary 1/0)
print("Creating binary cancellation/incompletion flags...")

# Is_Cancelled_Customer: Based on 'cancelled_rides_by_customer' column > 0
# CHANGED: Replaced 'Int64' with 'int'
df['is_cancelled_customer'] = np.where(df['cancelled_rides_by_customer'] > 0, 1, 0).astype(int)

# Is_Cancelled_Driver: Based on 'cancelled_rides_by_driver' column > 0
# CHANGED: Replaced 'Int64' with 'int'
df['is_cancelled_driver'] = np.where(df['cancelled_rides_by_driver'] > 0, 1, 0).astype(int)

# Is_Incomplete: Based on 'incomplete_rides' column > 0 or booking_status is 'Incomplete'
# CHANGED: Replaced 'Int64' with 'int'
df['is_incomplete'] = np.where(
    (df['incomplete_rides'] > 0) | (df['booking_status'] == 'Incomplete'), 
    1, 
    0
).astype(int)


# 6.2. Pricing Efficiency: Price_Per_KM (Booking Value / Ride Distance)
print("Calculating Price_Per_KM...")

# Use numpy.where to avoid division by zero (where ride_distance is 0 or NaN)
df['price_per_km'] = np.where(
    (df['ride_distance'].fillna(0) > 0),
    df['booking_value'] / df['ride_distance'],
    np.nan # Use NaN for cases where distance is zero or missing
).astype(float)


# 6.3. Time Analysis Features
print("Extracting time-based features...")

# Hour_of_Day
# CHANGED: Replaced 'Int64' with 'int'
df['hour_of_day'] = df['ride_timestamp'].dt.hour.astype(int)

# Day_of_Week (0=Monday, 6=Sunday)
df['day_of_week'] = df['ride_timestamp'].dt.day_name()

# Month_Name
df['month_name'] = df['ride_timestamp'].dt.month_name()

# Is_Peak_Hour (7-10 AM and 5-8 PM)
# CHANGED: Replaced 'Int64' with 'int'
df['is_peak_hour'] = np.where(
    ((df['hour_of_day'] >= 7) & (df['hour_of_day'] <= 9)) | 
    ((df['hour_of_day'] >= 17) & (df['hour_of_day'] <= 19)),
    1,
    0
).astype(int)


# 6.4. Performance Metrics: Efficiency_Ratio (Avg VTAT / Ride Distance)
print("Calculating Efficiency_Ratio...")

# Measures how long it takes for a driver to reach a customer relative to the trip distance. Lower is better.
# Handle division by zero/nulls for ride_distance
df['efficiency_ratio'] = np.where(
    (df['ride_distance'].fillna(0) > 0),
    df['avg_vtat'] / df['ride_distance'],
    np.nan
).astype(float)


# 6.5. Geospatial Categorization (Simple Logic for NCR Zones)
print("Applying simple geospatial categorization...")

# Define simple lookups for common areas in NCR (New Delhi, Gurgaon, Noida)
# This uses simple string checking on pickup_location/drop_location
def classify_zone(location):
    if pd.isna(location):
        return 'Unknown'
    location = str(location).upper()
    
    if 'GURGAON' in location or 'NEHRU PLACE' in location or 'MALVIYA NAGAR' in location or 'AYA NAGAR' in location or 'TUGHLAKABAD' in location or 'KHANDSA' in location:
        return 'South NCR'
    elif 'NOIDA' in location or 'AKSHARDHAM' in location or 'JHEL MIL' in location:
        return 'East NCR'
    elif 'KAROL BAGH' in location or 'JAMA MASJID' in location or 'VISHWAVIDYALAYA' in location:
        return 'Central Delhi'
    elif 'PALAM VIHAR' in location or 'SHASTRI NAGAR' in location:
        return 'West Delhi'
    else:
        # Default to a general "NCR" or 'Other' zone
        return 'Other NCR Zone'

df['pickup_zone'] = df['pickup_location'].apply(classify_zone)
df['drop_zone'] = df['drop_location'].apply(classify_zone)


# --- 7. Final Summary and Output ---
print("\n--- Feature Engineered DataFrame Summary ---")
print(f"Shape: {df.shape}")
print(df.info())
print("\n--- Engineered Features Head (First 5 Rows) ---")
print(df[['booking_status', 'ride_distance', 'booking_value', 'price_per_km', 
          'is_peak_hour', 'efficiency_ratio', 'pickup_zone', 'is_cancelled_customer']].head())

Creating binary cancellation/incompletion flags...
Calculating Price_Per_KM...
Extracting time-based features...
Calculating Efficiency_Ratio...
Applying simple geospatial categorization...

--- Feature Engineered DataFrame Summary ---
Shape: (150000, 31)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 31 columns):
 #   Column                             Non-Null Count   Dtype         
---  ------                             --------------   -----         
 0   booking_id                         150000 non-null  object        
 1   booking_status                     150000 non-null  object        
 2   customer_id                        150000 non-null  object        
 3   vehicle_type                       150000 non-null  object        
 4   pickup_location                    150000 non-null  object        
 5   drop_location                      150000 non-null  object        
 6   avg_vtat                           150000 non-null  

In [62]:
# ====================================================================
# --- 2. Save Processed Data to CSV Checkpoint ---
# ====================================================================
PROCESSED_FILENAME = 'ncr_ride_bookings_processed.csv'
try:
    df.to_csv(PROCESSED_FILENAME, index=False)
    print(f"\nSUCCESS: Cleaned and feature-engineered data saved to {PROCESSED_FILENAME}")
except Exception as e:
    print(f"\nERROR saving CSV file: {e}")


SUCCESS: Cleaned and feature-engineered data saved to ncr_ride_bookings_processed.csv


In [63]:
print("======="*15)
print("======="*15)
print("PART C SQL")
print("======="*15)
print("======="*15)

PART C SQL


In [2]:
import pandas as pd
import numpy as np

# --- Placeholder for the provided CSV file content
try:
    df = pd.read_csv("ncr_ride_bookings.csv", sep=',', quotechar='"', engine='python')
    print("Successfully loaded ncr_ride_bookings.csv.")
except FileNotFoundError:
    print("Error: The file 'ncr_ride_bookings.csv' was not found.")
    df = pd.DataFrame()
    exit()

# Initial Column Name Cleanup
print("Starting data cleaning and standardization...")
df.columns = df.columns.str.replace(' ', '_').str.replace('[^A-Za-z0-9_]+', '', regex=True).str.lower()

# Combine 'date' and 'time' into a single 'ride_timestamp' column
try:
    df['ride_timestamp'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')
    df = df.drop(columns=['date', 'time'])
except Exception as e:
    print(f"Error creating ride_timestamp: {e}")

# Categorical Cleanup and Standardization
categorical_cols = ['booking_id', 'booking_status', 'customer_id', 'vehicle_type', 
                    'pickup_location', 'drop_location', 'payment_method']

for col in categorical_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip().str.title()
        if col == 'booking_id':
            # Remove any residual quotes from the ID field
            df[col] = df[col].str.replace('"', '', regex=False).str.strip()

# Missing Numerical Value Imputation
# Impute with 0 for non-completed rides, as these values (VTAT, distance, value) are irrelevant.
value_cols_to_impute = [
    'avg_vtat', 'avg_ctat', 'booking_value', 'ride_distance', 
    'driver_ratings', 'customer_rating'
]

mask_not_completed = df['booking_status'] != 'Completed'
for col in value_cols_to_impute:
    if col in df.columns:
        df.loc[mask_not_completed, col] = df.loc[mask_not_completed, col].fillna(0)
        df[col] = df[col].astype(float)

# Impute flag columns
flag_cols_to_zero = [
    'cancelled_rides_by_customer', 'cancelled_rides_by_driver', 'incomplete_rides'
]

for col in flag_cols_to_zero:
    if col in df.columns:
        df[col] = df[col].fillna(0).astype(int) 

# --- Feature Engineering ---
print("Creating feature-rich columns...")

# Cancellation/Incompletion Flags
df['is_cancelled_customer'] = np.where(df['cancelled_rides_by_customer'] > 0, 1, 0).astype(int)
df['is_cancelled_driver'] = np.where(df['cancelled_rides_by_driver'] > 0, 1, 0).astype(int)

# *** IMPORTANT FIX: Rename 'is_incomplete' to 'trip_incomplete' to match SQL schema ***
df['trip_incomplete'] = np.where(
    (df['incomplete_rides'] > 0) | (df['booking_status'] == 'Incomplete'), 
    1, 
    0
).astype(int)
# df['is_incomplete'] column is implicitly replaced by df['trip_incomplete'] above
# Original definition:
# df['is_incomplete'] = np.where(
#     (df['incomplete_rides'] > 0) | (df['booking_status'] == 'Incomplete'), 
#     1, 
#     0
# ).astype(int)


# Pricing Efficiency: Price_Per_KM
df['price_per_km'] = np.where(
    (df['ride_distance'].fillna(0) > 0),
    df['booking_value'] / df['ride_distance'],
    np.nan
).astype(float)

# Time Analysis Features
df['hour_of_day'] = df['ride_timestamp'].dt.hour.astype(int)
df['day_of_week'] = df['ride_timestamp'].dt.day_name()
df['month_name'] = df['ride_timestamp'].dt.month_name()
df['is_peak_hour'] = np.where(
    ((df['hour_of_day'] >= 7) & (df['hour_of_day'] <= 9)) | 
    ((df['hour_of_day'] >= 17) & (df['hour_of_day'] <= 19)),
    1,
    0
).astype(int)

# Performance Metrics: Efficiency_Ratio (VTAT per KM)
df['efficiency_ratio'] = np.where(
    (df['ride_distance'].fillna(0) > 0),
    df['avg_vtat'] / df['ride_distance'],
    np.nan
).astype(float)

# Geospatial Categorization
def classify_zone(location):
    if pd.isna(location): return 'Unknown'
    location = str(location).upper()
    
    if 'GURGAON' in location or 'NEHRU PLACE' in location or 'MALVIYA NAGAR' in location or 'AYA NAGAR' in location or 'TUGHLAKABAD' in location or 'KHANDSA' in location:
        return 'South NCR'
    elif 'NOIDA' in location or 'AKSHARDHAM' in location or 'JHEL MIL' in location:
        return 'East NCR'
    elif 'KAROL BAGH' in location or 'JAMA MASJID' in location or 'VISHWAVIDYALAYA' in location:
        return 'Central Delhi'
    elif 'PALAM VIHAR' in location or 'SHASTRI NAGAR' in location:
        return 'West Delhi'
    else:
        return 'Other NCR Zone'

df['pickup_zone'] = df['pickup_location'].apply(classify_zone)
df['drop_zone'] = df['drop_location'].apply(classify_zone)


# --- Final Export to CSV ---
PROCESSED_FILENAME = 'ncr_ride_bookings_final_for_import.csv'
# Rename columns to match the SQL schema's expected casing/naming conventions for the wizard
# The SQL schema uses TitleCase/PascalCase for column names
column_mapping = {
    'ride_timestamp': 'Ride_Timestamp',
    'hour_of_day': 'Hour_of_Day',
    'day_of_week': 'Day_of_Week',
    'month_name': 'Month_Name',
    'is_peak_hour': 'Is_Peak_Hour',
    'booking_id': 'Booking_ID',
    'booking_status': 'Booking_Status',
    'customer_id': 'Customer_ID',
    'vehicle_type': 'Vehicle_Type',
    'payment_method': 'Payment_Method',
    'pickup_location': 'Pickup_Location',
    'drop_location': 'Drop_Location',
    'pickup_zone': 'Pickup_Zone',
    'drop_zone': 'Drop_Zone',
    'avg_vtat': 'Avg_VTAT',
    'avg_ctat': 'Avg_CTAT',
    'driver_ratings': 'Driver_Ratings',
    'customer_rating': 'Customer_Rating',
    'booking_value': 'Booking_Value',
    'ride_distance': 'Ride_Distance',
    'price_per_km': 'Price_Per_KM',
    'efficiency_ratio': 'Efficiency_Ratio',
    'is_cancelled_customer': 'Is_Cancelled_Customer',
    'is_cancelled_driver': 'Is_Cancelled_Driver',
    # Ensure this matches the renamed column in the SQL schema
    'trip_incomplete': 'Trip_Incomplete', 
    'cancelled_rides_by_customer': 'Cancelled_Rides_By_Customer',
    'cancelled_rides_by_driver': 'Cancelled_Rides_By_Driver',
    'incomplete_rides': 'Incomplete_Rides',
    'reason_for_cancelling_by_customer': 'Reason_For_Cancelling_By_Customer',
    'driver_cancellation_reason': 'Driver_Cancellation_Reason',
    'incomplete_rides_reason': 'Incomplete_Rides_Reason'
}

# Apply renames before saving the CSV
df = df.rename(columns=column_mapping)

# Drop any columns that were in the original CSV but not defined in the SQL mapping
# (e.g., the original date/time columns if they were not explicitly dropped earlier)
# This step is often necessary for clean SQL import.
columns_to_keep = list(column_mapping.values())
df = df.reindex(columns=columns_to_keep, fill_value=None)


try:
    df.to_csv(PROCESSED_FILENAME, index=False, na_rep='NULL')
    print(f"\n✅ SUCCESS: Cleaned and feature-engineered data exported to {PROCESSED_FILENAME}")
    print("You can now import this file directly into MySQL Workbench.")
except Exception as e:
    print(f"\n❌ ERROR saving CSV file: {e}")


Successfully loaded ncr_ride_bookings.csv.
Starting data cleaning and standardization...
Creating feature-rich columns...

✅ SUCCESS: Cleaned and feature-engineered data exported to ncr_ride_bookings_final_for_import.csv
You can now import this file directly into MySQL Workbench.
