In [6]:
import pandas as pd
import numpy as np
from scipy.stats import beta

def generate_authentic_flood_dataset(num_records=1500):
    """
    Generate a more authentic flood occurrence dataset 
    reflecting Nepal's actual flood characteristics
    """
    
    # Nepal's river basins
    basins = [
        'Koshi', 'Gandaki', 'Narayani', 'Karnali', 
        'Mahakali', 'Rapti', 'Bagmati', 'Kamala'
    ]
    
    # Set random seed for reproducibility
    np.random.seed(42)
    
    # Actual flood occurrence probabilities by basin
    # Based on historical flood frequency data
    basin_flood_probabilities = {
        'Koshi': 0.35,       # Highly flood-prone
        'Gandaki': 0.25,     # Moderate flood risk
        'Narayani': 0.40,    # High flood frequency
        'Karnali': 0.20,     # Lower flood risk
        'Mahakali': 0.15,    # Least flood-prone
        'Rapti': 0.30,       # Moderate flood risk
        'Bagmati': 0.35,     # Significant flood risk
        'Kamala': 0.45       # Very high flood risk
    }
    
    # Generate basin names with flood probabilities
    basin_names = []
    flood_probabilities = []
    for _ in range(num_records):
        basin = np.random.choice(list(basin_flood_probabilities.keys()))
        basin_names.append(basin)
        flood_probabilities.append(basin_flood_probabilities[basin])
    
    # Create base dataset
    data = {
        'basin_name': basin_names,
    }
    
    # Generate other feature columns
    data['total_slope'] = np.round(np.random.uniform(0, 45, num_records), 2)
    data['total_elevation'] = np.round(np.random.uniform(70, 8848, num_records), 2)
    data['annual_rainfall_mm'] = np.round(np.random.uniform(500, 5000, num_records), 2)
    
    # Authentic Flood Occurrence Generation
    def generate_authentic_flood_occurrence(base_probability):
        """
        Generate more nuanced flood occurrence 
        using beta distribution for realistic variation
        """
        # Adjust beta distribution parameters for more realistic flood occurrence
        a = base_probability * 20  # Shape parameter 1
        b = (1 - base_probability) * 20  # Shape parameter 2
        
        # Generate flood occurrence with beta distribution
        return 1 if np.random.beta(a, b) > 0.5 else 0
    
    # Generate flood occurrence column
    data['flood_occurred'] = [
        generate_authentic_flood_occurrence(prob) 
        for prob in flood_probabilities
    ]
    
    # Additional contextual features
    data['flood_month'] = np.random.choice([6,7,8,9], num_records)  # Monsoon months
    data['river_basin_width_km'] = np.round(np.random.uniform(10, 500, num_records), 2)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    return df

# Generate dataset
authentic_flood_dataset = generate_authentic_flood_dataset(1500)

# Save to CSV
authentic_flood_dataset.to_csv('authentic_nepal_flood_dataset.csv', index=False)

# Detailed Analysis
print("Dataset Overview:")
print(authentic_flood_dataset.info())

print("\nFlood Occurrence by Basin:")
basin_flood_summary = authentic_flood_dataset.groupby('basin_name')['flood_occurred'].agg(['count', 'sum', 'mean'])
basin_flood_summary.columns = ['Total Records', 'Flood Occurrences', 'Flood Probability']
print(basin_flood_summary)

print("\nOverall Flood Occurrence:")
total_records = len(authentic_flood_dataset)
total_floods = authentic_flood_dataset['flood_occurred'].sum()
print(f"Total Records: {total_records}")
print(f"Total Flood Occurrences: {total_floods}")
print(f"Flood Occurrence Rate: {total_floods/total_records*100:.2f}%")

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   basin_name            1500 non-null   object 
 1   total_slope           1500 non-null   float64
 2   total_elevation       1500 non-null   float64
 3   annual_rainfall_mm    1500 non-null   float64
 4   flood_occurred        1500 non-null   int64  
 5   flood_month           1500 non-null   int32  
 6   river_basin_width_km  1500 non-null   float64
dtypes: float64(4), int32(1), int64(1), object(1)
memory usage: 76.3+ KB
None

Flood Occurrence by Basin:
            Total Records  Flood Occurrences  Flood Probability
basin_name                                                     
Bagmati               183                 16           0.087432
Gandaki               180                  2           0.011111
Kamala                188                 61           

In [7]:
authentic_flood_dataset

Unnamed: 0,basin_name,total_slope,total_elevation,annual_rainfall_mm,flood_occurred,flood_month,river_basin_width_km
0,Bagmati,5.26,893.88,4530.03,0,8,80.88
1,Karnali,42.29,8054.70,4591.29,0,8,309.50
2,Mahakali,28.25,5944.24,1912.45,0,8,467.30
3,Bagmati,15.07,7349.48,3609.75,0,6,105.89
4,Narayani,6.27,7785.68,1722.35,1,8,384.64
...,...,...,...,...,...,...,...
1495,Mahakali,23.47,1934.86,3119.32,0,7,33.93
1496,Kamala,2.86,5894.41,4090.12,0,7,440.19
1497,Kamala,37.41,3679.49,4389.65,0,9,49.12
1498,Narayani,26.95,3602.32,2556.00,0,8,82.69


In [10]:
# Calculate the percentage of 0s in the specified column
total_values = len(authentic_flood_dataset['flood_occurred'])
zero_count = (authentic_flood_dataset['flood_occurred'] == 1).sum()
zero_percentage = (zero_count / total_values) * 100

# Display the result
print(f"Percentage of 0s in column 'flood_occurred': {zero_percentage:.2f}%")


Percentage of 0s in column 'flood_occurred': 9.47%


In [15]:
if authentic_flood_dataset['flood_month'] == int(6):

# Calculate the percentage of 0s in the specified column
 total_values = len(authentic_flood_dataset['flood_occurred'])
 zero_count = (authentic_flood_dataset['flood_occurred'] == 1).sum()
 zero_percentage = (zero_count / total_values) * 100

# Display the result
print(f"Percentage of 0s in column 'flood_occurred': {zero_percentage:.2f}%")


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().