In [3]:
#!/usr/bin/env python3
import pandas as pd
import numpy as np

In [4]:
# --- Configuration ---
# Everything a reader might tune lives in this cell.
# IMPORTANT: Make sure this path is correct for your environment!
file_path = "/content/train1.csv"
# Column whose values are split into quantile-based categories.
target_col = "Production (M.Ton)"
# Category names, ordered from the lowest-valued bin to the highest.
labels = ["Very Low", "Low", "Medium", "High", "Very High"]
# One bin per label. Derived instead of hard-coded so the two cannot drift
# apart (pd.cut rejects a label list whose length differs from the bin count).
num_bins = len(labels)

In [5]:
# --- Load Data ---
# Reads the CSV at `file_path` into the DataFrame `df`.
# On failure a short message is printed and the exception is re-raised.
# exit() is avoided on purpose: it is an interactive-shell helper, and inside
# a notebook it typically ends the kernel session instead of just failing
# this cell, which also hides the traceback.
print(f"Loading dataset from {file_path}...")
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    raise
except Exception as e:
    # Parsing, encoding, permission problems, etc. - report, then propagate.
    print(f"Error loading dataset: {e}")
    raise
print("Dataset loaded successfully.")

Loading dataset from /content/train1.csv...
Dataset loaded successfully.


In [6]:
# --- Step 1: Calculate Quantile Bin Edges ---
# Computes quantile-based bin edges for df[target_col] with pd.qcut.
# Results of this cell:
#   * bin_edges        - array of edges (one more entry than there are bins)
#   * df["temp_bins"]  - integer bin code assigned to each row by qcut
#   * labels           - trimmed to the number of bins actually produced
print("\nCalculating quantile-based bin edges...")
try:
    # duplicates="drop" collapses repeated quantile edges, so the result may
    # contain fewer than num_bins bins.
    df["temp_bins"], bin_edges = pd.qcut(
        df[target_col],
        q=num_bins,
        labels=False,
        retbins=True,
        duplicates="drop",
    )
except ValueError as e:
    print(f"Error calculating quantiles: {e}")
    print("Cannot proceed without valid bin edges.")
    # Re-raise instead of exit(): stops the run without ending the kernel
    # session and keeps the traceback.
    raise
except Exception as e:
    print(f"An unexpected error occurred during bin calculation: {e}")
    raise

# Count bins from the edges, not from the bin codes that happen to occur in
# the data: an interval can be empty (e.g. with very few rows), yet the later
# steps need exactly one label per interval.
actual_bins = len(bin_edges) - 1
if actual_bins != num_bins:
    print(f"Warning: Could only create {actual_bins} bins instead of {num_bins} due to data distribution.")
    # Adjust labels list if fewer bins were created
    labels = labels[:actual_bins]
else:
    print(f"Successfully determined {num_bins} quantile-based bins.")

# Fail here with a clear message rather than later inside the range printout
# or pd.cut, both of which require one label per bin.
if len(labels) != actual_bins:
    raise ValueError(
        f"Expected {actual_bins} labels for {actual_bins} bins, but {len(labels)} are configured."
    )

# Ensure the lowest edge is the minimum value and highest edge is the maximum
# value. qcut's outer edges are the 0% and 100% quantiles, so this is normally
# already the case; it is kept as a safeguard.
min_val = df[target_col].min()
max_val = df[target_col].max()
bin_edges[0] = min_val
bin_edges[-1] = max_val


Calculating quantile-based bin edges...
Successfully determined 5 quantile-based bins.


In [7]:
# --- Step 2: Print the Calculated Ranges ---
# Walks consecutive pairs of bin edges, prints one "label: range" line per bin
# and collects the formatted range strings in calculated_ranges_info.
print("\n--- Suggested Production Ranges based on Quantiles --- ")
calculated_ranges_info = []
for i, (lower_bound, upper_bound) in enumerate(zip(bin_edges[:-1], bin_edges[1:])):
    label = labels[i]
    # Only the first range is closed on the left; the rest are half-open.
    range_str = (
        f"[{lower_bound:.2f} - {upper_bound:.2f}]"
        if i == 0
        else f"({lower_bound:.2f} - {upper_bound:.2f}]"
    )
    print(f"- {label}: {range_str}")
    calculated_ranges_info.append(range_str)  # kept for potential later use



--- Suggested Production Ranges based on Quantiles --- 
- Very Low: [0.00 - 545.38]
- Low: (545.38 - 1452.00]
- Medium: (1452.00 - 3050.00]
- High: (3050.00 - 6447.40]
- Very High: (6447.40 - 544979.54]


In [8]:
# --- Step 3: Apply Binning Using Calculated Edges ---
# Assigns each row a category label by cutting df[target_col] at bin_edges and
# stores the result in the new column "Production_Category".
print("\nApplying calculated bins to the data...")
df["Production_Category"] = pd.cut(
    df[target_col],
    bins=bin_edges,
    labels=labels,
    include_lowest=True,  # first interval also contains its left edge
    right=True,           # every interval is closed on the right
)



Applying calculated bins to the data...


In [12]:
# --- Step 4: Calculate and Print Counts per Category ---
# Tallies the rows in each production category and prints the table.
print("Calculating counts per category...")
category_counts = (
    df["Production_Category"]
    .value_counts()
    .sort_index()  # order rows by the index instead of by frequency
)
print("\n--- Counts per Production Category --- ")
print(category_counts)
print("\nScript finished.")



Calculating counts per category...

--- Counts per Production Category --- 
Production_Category
Very Low     1024
Low          1025
Medium       1024
High         1023
Very High    1024
Name: count, dtype: int64

Script finished.
