In [None]:
import pandas as pd
import random
from datetime import datetime, timedelta

# --- Main Generation Function ---
def generate_mandal_dataset(mandal_name, city, base_headcount_range, yearly_growth_factor, start_year=2010, num_years=15, show_preview=None):
    """
    Generates a realistic dummy dataset with binary flags for weekend and special days.

    One row is produced for every hour of an 11-day window starting on
    5 September of each simulated year. The rows are written to
    ``<mandal_name lowercased, spaces -> underscores>_binary_dataset.csv`` in
    the current working directory and also returned.

    Args:
        mandal_name: Name stored in the ``mandal_name`` column and used to
            build the output filename.
        city: Value stored in the ``city`` column.
        base_headcount_range: ``(low, high)`` inclusive bounds passed to
            ``random.randint`` for the hourly base headcount.
        yearly_growth_factor: Amount added to the base headcount for every
            year elapsed since ``start_year``.
        start_year: First simulated year.
        num_years: Number of consecutive years to simulate.
        show_preview: ``True``/``False`` forces the ``df.head()`` preview on
            or off. ``None`` (default) keeps the historical behaviour: the
            preview is printed only when ``mandal_name`` matches the first
            entry of a module-level ``mandals_to_generate`` list, and is
            skipped (instead of raising ``NameError``) when no such list
            exists.

    Returns:
        pandas.DataFrame: The generated rows, identical to the CSV content.
    """
    print(f"--- Generating dataset for: {mandal_name}, {city} ---")

    FESTIVAL_DURATION_DAYS = 11
    KEY_DAYS = (5, 7, 10)
    weather_options = ['Humid', 'Cloudy', 'Sunny', 'Light Rain', 'Heavy Rain']
    all_data = []
    total_hours = FESTIVAL_DURATION_DAYS * 24

    for year_offset in range(num_years):
        current_year = start_year + year_offset
        # NOTE(review): the window always starts on 5 September; real festival
        # dates presumably vary from year to year -- confirm if this matters.
        festival_start_date = datetime(current_year, 9, 5)
        # Growth depends only on the year, so compute it once per year.
        yearly_growth = year_offset * yearly_growth_factor

        for hour_offset in range(total_hours):
            current_time = festival_start_date + timedelta(hours=hour_offset)

            day_of_festival = hour_offset // 24 + 1
            hour_of_day = current_time.hour

            # weekday() is 5 for Saturday and 6 for Sunday. Unlike
            # strftime('%A') it does not depend on the process locale.
            is_weekend = 1 if current_time.weekday() >= 5 else 0

            is_pratishthapana = day_of_festival == 1
            is_visarjan = day_of_festival == FESTIVAL_DURATION_DAYS
            is_special_day = 1 if (is_pratishthapana or is_visarjan
                                   or day_of_festival in KEY_DAYS) else 0

            # The order of the random draws below is kept stable so that a
            # caller-seeded `random` yields the same sequence as before.
            weather = random.choice(weather_options)

            # Headcount simulation: random base plus linear yearly growth.
            base_headcount = random.randint(*base_headcount_range) + yearly_growth
            headcount = float(base_headcount)

            # Evening peak (18:00-23:59) and overnight lull (01:00-05:59).
            # Hour 0 falls in neither band and keeps the base value.
            if 18 <= hour_of_day <= 23:
                headcount *= random.uniform(2.5, 4.0)
            elif 1 <= hour_of_day <= 5:
                headcount *= random.uniform(0.1, 0.4)

            # Only the first and last day get a boost; the key days (5, 7, 10)
            # set the flag but do not change the headcount.
            if is_visarjan:
                headcount *= random.uniform(3.5, 5.0)
            elif is_pratishthapana:
                headcount *= random.uniform(2.0, 3.0)

            if is_weekend == 1:
                headcount *= random.uniform(1.4, 2.0)

            # Rain suppresses attendance by a fixed factor.
            if weather == 'Heavy Rain':
                headcount *= 0.4
            elif weather == 'Light Rain':
                headcount *= 0.7

            all_data.append([
                mandal_name, city, current_time, current_year, day_of_festival,
                hour_of_day, is_weekend, is_special_day, weather, int(headcount)
            ])

    columns = [
        'mandal_name', 'city', 'datetime', 'year', 'day_of_festival', 'hour_of_day',
        'is_weekend', 'is_special_day', 'weather', 'headcount'
    ]
    df = pd.DataFrame(all_data, columns=columns)

    filename = f"{mandal_name.lower().replace(' ', '_')}_binary_dataset.csv"
    df.to_csv(filename, index=False)
    print(f"Successfully created '{filename}' with {len(df)} rows.")

    if show_preview is None:
        # Historical behaviour: preview only the first configured mandal.
        # Look the list up defensively so the function also works when it is
        # called without that module-level configuration being defined.
        configured = globals().get("mandals_to_generate")
        first_name = configured[0].get("mandal_name") if configured else None
        show_preview = mandal_name == first_name

    if show_preview:
        print("\nPreview of the new format:")
        print(df.head())
        print("\n")

    return df


# --- Configuration for 5 Mandals ---
mandals_to_generate = [
    # Each dict's keys match the keyword parameters of the dataset generator,
    # so an entry can be splatted straight into the call.
    dict(
        mandal_name=name,
        city=home_city,
        base_headcount_range=headcount_bounds,
        yearly_growth_factor=growth,
    )
    for name, home_city, headcount_bounds, growth in (
        # (mandal name, city, (min, max) base headcount, growth per year)
        ("Lalbaugcha Raja", "Mumbai", (10000, 25000), 1800),
        ("Dagdusheth Halwai Ganpati", "Pune", (8000, 20000), 1500),
        ("Andhericha Raja", "Mumbai", (4000, 9000), 1000),
        ("Kasba Ganpati", "Pune", (3000, 7000), 700),
        ("Siddhivinayak Temple", "Mumbai", (7000, 15000), 1200),
    )
]

# --- Main Execution Block ---
if __name__ == "__main__":
    # Generate one CSV per configured mandal, in configuration order. Each
    # config dict is expanded into the generator's keyword arguments.
    print("Starting dataset generation with binary columns...")
    for settings in mandals_to_generate:
        generate_mandal_dataset(**settings)
    print("All datasets have been generated successfully.")

Starting dataset generation with binary columns...
--- Generating dataset for: Lalbaugcha Raja, Mumbai ---
Successfully created 'lalbaugcha_raja_binary_dataset.csv' with 3960 rows.

Preview of the new format:
       mandal_name    city            datetime  year  day_of_festival  \
0  Lalbaugcha Raja  Mumbai 2010-09-05 00:00:00  2010                1   
1  Lalbaugcha Raja  Mumbai 2010-09-05 01:00:00  2010                1   
2  Lalbaugcha Raja  Mumbai 2010-09-05 02:00:00  2010                1   
3  Lalbaugcha Raja  Mumbai 2010-09-05 03:00:00  2010                1   
4  Lalbaugcha Raja  Mumbai 2010-09-05 04:00:00  2010                1   

   hour_of_day  is_weekend  is_special_day     weather  headcount  
0            0           1               1       Sunny      35350  
1            1           1               1       Humid      19998  
2            2           1               1  Heavy Rain      10743  
3            3           1               1  Heavy Rain       3415  
4           

In [None]:
# NOTE(review): `google.colab` is presumably only importable inside a Google
# Colab runtime -- confirm before running this cell elsewhere.
from google.colab import drive
# Mount Google Drive at /content/drive. The later cells in view read the CSVs
# with a relative glob pattern, i.e. from the working directory, not from
# this mount point.
drive.mount('/content/drive')

In [None]:
import pandas as pd
import glob

# --- Step 1: Combine All Datasets ---

# Find all CSV files that end with '_binary_dataset.csv'.
# glob returns matches in arbitrary order, so sort them to make every run
# load the files in the same sequence.
file_pattern = '*_binary_dataset.csv'
all_files = sorted(glob.glob(file_pattern))

print(f"Found {len(all_files)} files to combine:")
print(all_files)

# Fail early with an actionable message; otherwise pd.concat would raise a
# generic "No objects to concatenate" error further down.
if not all_files:
    raise FileNotFoundError(
        f"No files matching '{file_pattern}' were found in the current "
        "working directory. Run the dataset generation step first."
    )

# Load each file into a DataFrame and store them in a list
list_of_dfs = [pd.read_csv(f) for f in all_files]

# Concatenate all DataFrames in the list into a single DataFrame
combined_df = pd.concat(list_of_dfs, ignore_index=True)

if combined_df.empty:
    raise ValueError("The combined dataset contains no rows; nothing to split.")

print(f"\nShape of the combined dataset before sorting: {combined_df.shape}")

# --- Step 2: Prepare and Sort the Combined Data ---

# Convert 'datetime' column to a proper datetime object
combined_df['datetime'] = pd.to_datetime(combined_df['datetime'])

# CRITICAL: Sort the entire DataFrame by datetime to ensure chronological order.
# Several mandals share every timestamp, so 'mandal_name' is used as a
# tie-breaker to make the resulting row order fully deterministic.
combined_df = (
    combined_df
    .sort_values(by=['datetime', 'mandal_name'])
    .reset_index(drop=True)
)

print("\nCombined dataset has been sorted chronologically.")
print("Preview of the first few rows (start of 2010):")
print(combined_df.head())
print("\nPreview of the last few rows (end of 2024):")
print(combined_df.tail())


# --- Step 3: Perform the Chronological Split ---

def _snap_to_timestamp_start(sorted_times, position):
    """Return the first row position sharing the timestamp found at *position*.

    Moving a cut point back to the start of its timestamp group guarantees
    that rows recorded at the same instant never end up in two different
    splits. *sorted_times* must be sorted in ascending order.
    """
    return int(sorted_times.searchsorted(sorted_times.iloc[position], side='left'))


# Define split points (70% train, 15% validation, 15% test), then align each
# cut with a timestamp boundary.
sorted_times = combined_df['datetime']
train_split_index = _snap_to_timestamp_start(
    sorted_times, int(len(combined_df) * 0.70)
)
validation_split_index = _snap_to_timestamp_start(
    sorted_times, int(len(combined_df) * 0.85)
)

# Create the splits
train_data = combined_df.iloc[:train_split_index]
validation_data = combined_df.iloc[train_split_index:validation_split_index]
test_data = combined_df.iloc[validation_split_index:]

print("\n--- Data Split Complete ---")
print(f"Training data shape:   {train_data.shape}")
print(f"Validation data shape: {validation_data.shape}")
print(f"Test data shape:       {test_data.shape}")
print("---")
print(f"Training data dates:   {train_data['datetime'].min()} to {train_data['datetime'].max()}")
print(f"Validation data dates: {validation_data['datetime'].min()} to {validation_data['datetime'].max()}")
print(f"Test data dates:       {test_data['datetime'].min()} to {test_data['datetime'].max()}")

Found 5 files to combine:
['siddhivinayak_temple_binary_dataset.csv', 'andhericha_raja_binary_dataset.csv', 'dagdusheth_halwai_ganpati_binary_dataset.csv', 'kasba_ganpati_binary_dataset.csv', 'lalbaugcha_raja_binary_dataset.csv']

Shape of the combined dataset before sorting: (19800, 10)

Combined dataset has been sorted chronologically.
Preview of the first few rows (start of 2010):
                 mandal_name    city   datetime  year  day_of_festival  \
0       Siddhivinayak Temple  Mumbai 2010-09-05  2010                1   
1  Dagdusheth Halwai Ganpati    Pune 2010-09-05  2010                1   
2              Kasba Ganpati    Pune 2010-09-05  2010                1   
3            Andhericha Raja  Mumbai 2010-09-05  2010                1   
4            Lalbaugcha Raja  Mumbai 2010-09-05  2010                1   

   hour_of_day  is_weekend  is_special_day     weather  headcount  
0            0           1               1  Heavy Rain      18502  
1            0           1      

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

# Assume 'train_data', 'validation_data', and 'test_data' are already created
# from the previous step.

# --- Step 1: Feature Engineering (Adding Lag Features) ---
def create_features(df):
    """Add per-mandal lagged headcount features to a copy of *df*.

    Two columns are added: ``headcount_lag_1hr`` and ``headcount_lag_24hr``.
    Each holds the ``headcount`` found 1 (resp. 24) rows earlier within the
    same ``mandal_name`` group, so rows are expected to be in chronological
    order per mandal.

    When *df* has a datetime-typed ``datetime`` column, a lag value is kept
    only if the earlier row really is exactly 1 (resp. 24) hours older. A
    purely positional shift would otherwise carry values across gaps in the
    series, for example from the last hour of one year's data into the first
    hour of the next year's. Such rows get NaN instead.

    Without a usable ``datetime`` column the function falls back to the plain
    positional shift.

    Args:
        df: DataFrame with at least ``mandal_name`` and ``headcount`` columns.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()
    by_mandal = df.groupby('mandal_name')
    has_timestamps = (
        'datetime' in df.columns
        and pd.api.types.is_datetime64_any_dtype(df['datetime'])
    )

    for lag_hours in (1, 24):
        lagged = by_mandal['headcount'].shift(lag_hours)
        if has_timestamps:
            # Elapsed time between each row and the row its lag was taken
            # from. NaT (no earlier row) compares unequal and is masked too.
            elapsed = df['datetime'] - by_mandal['datetime'].shift(lag_hours)
            lagged = lagged.where(elapsed == pd.Timedelta(hours=lag_hours))
        df[f'headcount_lag_{lag_hours}hr'] = lagged
    return df

# Lag features are built separately for each split, so no value is shifted
# across a split boundary.
train_featured, validation_featured, test_featured = (
    create_features(split) for split in (train_data, validation_data, test_data)
)


# --- Step 2: Preprocessing ---

# Target column and model inputs. The raw 'datetime' column is deliberately
# not part of the inputs.
TARGET = 'headcount'
FEATURES = ['year', 'day_of_festival', 'hour_of_day', 'is_weekend',
            'is_special_day', 'mandal_name', 'city', 'weather',
            'headcount_lag_1hr', 'headcount_lag_24hr']


def _features_and_target(frame):
    """Return (X, y) restricted to the rows of *frame* with no missing feature."""
    complete_rows = frame[FEATURES].dropna()
    return complete_rows, frame.loc[complete_rows.index, TARGET]


X_train, y_train = _features_and_target(train_featured)
X_val, y_val = _features_and_target(validation_featured)
X_test, y_test = _features_and_target(test_featured)


# One-hot encode the categorical columns of every split independently.
X_train, X_val, X_test = (
    pd.get_dummies(features, drop_first=True)
    for features in (X_train, X_val, X_test)
)

# The training columns are the reference layout: validation and test are
# reindexed to it, with absent dummy columns filled with 0.
train_cols = X_train.columns
X_val, X_test = (
    features.reindex(columns=train_cols, fill_value=0)
    for features in (X_val, X_test)
)


# --- Step 3 & 4: Model Training and Evaluation ---

# LightGBM regressor configured to optimise and report mean absolute error.
lgbm_params = {
    'objective': 'mae',
    'metric': 'mae',
    'n_estimators': 1000,
    'n_jobs': -1,
    'learning_rate': 0.05,
    'random_state': 42,
}
model = lgb.LGBMRegressor(**lgbm_params)

print("Training the LightGBM model...")
# Early stopping is registered with a patience of 100 rounds, evaluated on
# the validation set.
stop_early = lgb.early_stopping(100, verbose=False)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='mae',
    callbacks=[stop_early],
)

# Score the fitted model on the validation split.
val_predictions = model.predict(X_val)
mae = mean_absolute_error(y_val, val_predictions)

print(f"\nModel training complete.")
print(f"The Mean Absolute Error (MAE) on the validation set is: {mae:,.2f}")
print(f"This means the model's predictions are, on average, off by about {int(round(mae, -2))} people.")

Training the LightGBM model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000850 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 580
[LightGBM] [Info] Number of data points in the train set: 13740, number of used features: 16
[LightGBM] [Info] Start training from score 16749.500000

Model training complete.
The Mean Absolute Error (MAE) on the validation set is: 8,793.75
This means the model's predictions are, on average, off by about 8800 people.
