### Importing necessary ml modules

In [2]:
import numpy as np
import pandas as pd

In [3]:
import pandas as pd
import numpy as np
from scipy.stats import beta

def generate_authentic_flood_dataset(num_records=1500):
    """
    Generate a more authentic flood occurrence dataset 
    reflecting Nepal's actual flood characteristics
    """
    
    # Nepal's river basins
    basins = [
        'Koshi', 'Gandaki', 'Narayani', 'Karnali', 
        'Mahakali', 'Rapti', 'Bagmati', 'Kamala'
    ]
    
    # Set random seed for reproducibility
    np.random.seed(42)
    
    # Actual flood occurrence probabilities by basin
    # Based on historical flood frequency data
    basin_flood_probabilities = {
        'Koshi': 0.35,       # Highly flood-prone
        'Gandaki': 0.25,     # Moderate flood risk
        'Narayani': 0.40,    # High flood frequency
        'Karnali': 0.20,     # Lower flood risk
        'Mahakali': 0.15,    # Least flood-prone
        'Rapti': 0.30,       # Moderate flood risk
        'Bagmati': 0.35,     # Significant flood risk
        'Kamala': 0.45       # Very high flood risk
    }
    
    # Generate basin names with flood probabilities
    basin_names = []
    flood_probabilities = []
    for _ in range(num_records):
        basin = np.random.choice(list(basin_flood_probabilities.keys()))
        basin_names.append(basin)
        flood_probabilities.append(basin_flood_probabilities[basin])
    
    # Create base dataset
    data = {
        'basin_name': basin_names,
    }
    
    # Generate other feature columns
    data['total_slope'] = np.round(np.random.uniform(0, 45, num_records), 2)
    data['total_elevation'] = np.round(np.random.uniform(70, 8848, num_records), 2)
    data['annual_rainfall_mm'] = np.round(np.random.uniform(500, 5000, num_records), 2)
    
    # Authentic Flood Occurrence Generation
    def generate_authentic_flood_occurrence(base_probability):
        """
        Generate more nuanced flood occurrence 
        using beta distribution for realistic variation
        """
        # Adjust beta distribution parameters for more realistic flood occurrence
        a = base_probability * 20  # Shape parameter 1
        b = (1 - base_probability) * 20  # Shape parameter 2
        
        # Generate flood occurrence with beta distribution
        return 1 if np.random.beta(a, b) > 0.5 else 0
    
    # Generate flood occurrence column
    data['flood_occurred'] = [
        generate_authentic_flood_occurrence(prob) 
        for prob in flood_probabilities
    ]
    
    # Additional contextual features
    data['flood_month'] = np.random.choice([6,7,8,9], num_records)  # Monsoon months
    data['river_basin_width_km'] = np.round(np.random.uniform(10, 500, num_records), 2)
    
    # Create DataFrame
    df = pd.DataFrame(data)
    
    return df

# Generate dataset
authentic_flood_dataset = generate_authentic_flood_dataset(1500)

# Save to CSV
authentic_flood_dataset.to_csv('authentic_nepal_flood_dataset.csv', index=False)

# Detailed Analysis
print("Dataset Overview:")
print(authentic_flood_dataset.info())

print("\nFlood Occurrence by Basin:")
basin_flood_summary = authentic_flood_dataset.groupby('basin_name')['flood_occurred'].agg(['count', 'sum', 'mean'])
basin_flood_summary.columns = ['Total Records', 'Flood Occurrences', 'Flood Probability']
print(basin_flood_summary)

print("\nOverall Flood Occurrence:")
total_records = len(authentic_flood_dataset)
total_floods = authentic_flood_dataset['flood_occurred'].sum()
print(f"Total Records: {total_records}")
print(f"Total Flood Occurrences: {total_floods}")
print(f"Flood Occurrence Rate: {total_floods/total_records*100:.2f}%")

Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   basin_name            1500 non-null   object 
 1   total_slope           1500 non-null   float64
 2   total_elevation       1500 non-null   float64
 3   annual_rainfall_mm    1500 non-null   float64
 4   flood_occurred        1500 non-null   int64  
 5   flood_month           1500 non-null   int32  
 6   river_basin_width_km  1500 non-null   float64
dtypes: float64(4), int32(1), int64(1), object(1)
memory usage: 76.3+ KB
None

Flood Occurrence by Basin:
            Total Records  Flood Occurrences  Flood Probability
basin_name                                                     
Bagmati               183                 16           0.087432
Gandaki               180                  2           0.011111
Kamala                188                 61           

In [4]:
authentic_flood_dataset.head() # to see the top 5 rows of the dataset

Unnamed: 0,basin_name,total_slope,total_elevation,annual_rainfall_mm,flood_occurred,flood_month,river_basin_width_km
0,Bagmati,5.26,893.88,4530.03,0,8,80.88
1,Karnali,42.29,8054.7,4591.29,0,8,309.5
2,Mahakali,28.25,5944.24,1912.45,0,8,467.3
3,Bagmati,15.07,7349.48,3609.75,0,6,105.89
4,Narayani,6.27,7785.68,1722.35,1,8,384.64


In [5]:
authentic_flood_dataset

Unnamed: 0,basin_name,total_slope,total_elevation,annual_rainfall_mm,flood_occurred,flood_month,river_basin_width_km
0,Bagmati,5.26,893.88,4530.03,0,8,80.88
1,Karnali,42.29,8054.70,4591.29,0,8,309.50
2,Mahakali,28.25,5944.24,1912.45,0,8,467.30
3,Bagmati,15.07,7349.48,3609.75,0,6,105.89
4,Narayani,6.27,7785.68,1722.35,1,8,384.64
...,...,...,...,...,...,...,...
1495,Mahakali,23.47,1934.86,3119.32,0,7,33.93
1496,Kamala,2.86,5894.41,4090.12,0,7,440.19
1497,Kamala,37.41,3679.49,4389.65,0,9,49.12
1498,Narayani,26.95,3602.32,2556.00,0,8,82.69


In [7]:
# Calculate the percentage of 0s in the specified column
total_values = len(authentic_flood_dataset['flood_occurred'])
zero_count = (authentic_flood_dataset['flood_occurred'] == 1).sum()
zero_percentage = (zero_count / total_values) * 100

# Display the result
print(f"Percentage of 1s in column 'flood_occurred': {zero_percentage:.2f}%")


Percentage of 1s in column 'flood_occurred': 9.47%


In [8]:
authentic_flood_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   basin_name            1500 non-null   object 
 1   total_slope           1500 non-null   float64
 2   total_elevation       1500 non-null   float64
 3   annual_rainfall_mm    1500 non-null   float64
 4   flood_occurred        1500 non-null   int64  
 5   flood_month           1500 non-null   int32  
 6   river_basin_width_km  1500 non-null   float64
dtypes: float64(4), int32(1), int64(1), object(1)
memory usage: 76.3+ KB


In [9]:
authentic_flood_dataset.describe()

Unnamed: 0,total_slope,total_elevation,annual_rainfall_mm,flood_occurred,flood_month,river_basin_width_km
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,22.629573,4419.834373,2719.313267,0.094667,7.458667,251.627673
std,13.113228,2527.907327,1287.193694,0.292852,1.11585,144.339266
min,0.14,70.1,500.14,0.0,6.0,10.65
25%,10.82,2271.4075,1616.655,0.0,6.0,123.22
50%,23.205,4395.07,2680.335,0.0,7.0,247.555
75%,34.035,6592.575,3820.6975,0.0,8.0,380.305
max,44.97,8844.12,4989.87,1.0,9.0,499.43


### Checking for null values

In [9]:
authentic_flood_dataset.apply(lambda x:sum(x.isnull()), axis=0)

basin_name              0
total_slope             0
total_elevation         0
annual_rainfall_mm      0
flood_occurred          0
flood_month             0
river_basin_width_km    0
dtype: int64

In [10]:
x = authentic_flood_dataset.

Unnamed: 0,total_slope,total_elevation,annual_rainfall_mm,flood_occurred,flood_month,river_basin_width_km
0,5.26,893.88,4530.03,0,8,80.88
1,42.29,8054.7,4591.29,0,8,309.5
2,28.25,5944.24,1912.45,0,8,467.3
3,15.07,7349.48,3609.75,0,6,105.89
4,6.27,7785.68,1722.35,1,8,384.64


### Serperating the flood label from the dataset

In [13]:
y = authentic_flood_dataset.iloc[:, -3]
y.head()

0    0
1    0
2    0
3    0
4    1
Name: flood_occurred, dtype: int64

## Data visualization

In [26]:
from sklearn import preprocessing
minmax = preprocessing.MinMaxScaler(feature_range=(0,1))
minmax.fit(x).transform(x)

array([[0.11420923, 0.09388855, 0.89757959, 0.        , 0.66666667,
        0.14368428],
       [0.9402186 , 0.91002756, 0.91122406, 0.        , 0.66666667,
        0.61142027],
       [0.62703547, 0.66949243, 0.31456457, 0.        , 0.66666667,
        0.9342649 ],
       ...,
       [0.83136293, 0.41137244, 0.86631267, 0.        , 1.        ,
        0.07870617],
       [0.59803703, 0.40257715, 0.45790281, 0.        , 0.66666667,
        0.14738737],
       [0.11220165, 0.88497633, 0.47616449, 0.        , 1.        ,
        0.02086828]])

### Splitting the data into training and testing sets

In [15]:
from sklearn import model_selection,neighbors
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train.head()

Unnamed: 0,total_slope,total_elevation,annual_rainfall_mm,flood_occurred,flood_month,river_basin_width_km
804,10.26,5835.0,2563.33,0,8,253.65
486,38.9,8771.88,755.97,0,8,466.63
1022,40.14,6420.39,1556.39,0,9,218.88
680,20.46,5254.62,3548.96,0,9,90.47
599,1.8,3217.6,3341.56,0,8,117.86


In [16]:
y_train.head()

804     0
486     0
1022    0
680     0
599     0
Name: flood_occurred, dtype: int64

In [17]:
x_train_std = minmax.fit_transform(x_train)
x_test_std = minmax.transform(x_test)

### Training the model

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr_clf = lr.fit(x_train_std,y_train)

lr_accuracy = cross_val_score(lr_clf,x_test_std,y_test,cv=3,scoring='accuracy',n_jobs=-1)

In [19]:
lr_accuracy.mean()

1.0

In [20]:
y_predict = lr_clf.predict(x_test_std)
print('Predicted chances of flood')
print(y_predict)

Predicted chances of flood
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 0 0 0 0]


In [21]:
print('Actual chances of flood')
print(y_test.values)

Actual chances of flood
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
 0 0 0 0]


### Accuracy, recall and roc score estimation

In [22]:
from sklearn.metrics import accuracy_score,recall_score,roc_auc_score,confusion_matrix
print("\naccuracy score: %f"%(accuracy_score(y_test,y_predict)*100))
print("recall score: %f"%(recall_score(y_test,y_predict)*100))
print("roc score: %f"%(roc_auc_score(y_test,y_predict)*100))


accuracy score: 100.000000
recall score: 100.000000
roc score: 100.000000
