In [17]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the full dataset from disk
df = pd.read_csv('data_all.csv')

# Features are the 20 columns at positions 1-20 (spreadsheet columns B-U);
# the target is the column at position 21 (spreadsheet column V).
feature_columns = df.columns[1:21]
target_column = df.columns[21]
data = df[[*feature_columns, target_column]]

# Count the missing entries in every selected column and show the counts
missing_values = data.isna().sum()
print(missing_values)

MonsoonIntensity                        0
TopographyDrainage                      0
RiverManagement                         0
Deforestation                           0
Urbanization                            0
ClimateChange                           0
DamsQuality                             0
Siltation                               0
AgriculturalPractices                   0
Encroachments                           0
IneffectiveDisasterPreparedness         0
DrainageSystems                         0
CoastalVulnerability                    0
Landslides                              0
Watersheds                              0
DeterioratingInfrastructure             0
PopulationScore                         0
WetlandLoss                             0
InadequatePlanning                      0
PoliticalFactors                        0
FloodProbability                   745305
dtype: int64


In [18]:
# Keep only the rows in which every selected column has a value
df_cleaned = data.dropna(axis=0, how='any')

# Recount the missing entries per column on the cleaned frame
missing_values = df_cleaned.isna().sum()

In [19]:
# Show the per-column missing-value counts computed on the cleaned frame
print(missing_values)

MonsoonIntensity                   0
TopographyDrainage                 0
RiverManagement                    0
Deforestation                      0
Urbanization                       0
ClimateChange                      0
DamsQuality                        0
Siltation                          0
AgriculturalPractices              0
Encroachments                      0
IneffectiveDisasterPreparedness    0
DrainageSystems                    0
CoastalVulnerability               0
Landslides                         0
Watersheds                         0
DeterioratingInfrastructure        0
PopulationScore                    0
WetlandLoss                        0
InadequatePlanning                 0
PoliticalFactors                   0
FloodProbability                   0
dtype: int64


In [20]:
# Name of the target column; every other column of the cleaned frame is a feature
Last_column = 'FloodProbability'
y = df_cleaned[Last_column]
X = df_cleaned.drop(Last_column, axis=1)

In [24]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def quick_svm_optimization(X, y, kernels=('rbf', 'linear'), C_values=(0.1, 1, 10),
                           test_size=0.2, val_size=0.2, random_state=42):
    """Run a small SVR grid search and report train/test regression metrics.

    The data is split into a training and a test part. Hyper-parameters are
    chosen on a validation subset carved out of the training part, so the
    test part is never used for model selection and its metrics stay an
    unbiased estimate. The winning configuration is then refit on the whole
    training part and evaluated once on the test part.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix.
    y : array-like or Series
        Regression target.
    kernels : iterable of str, default ('rbf', 'linear')
        SVR kernels to try.
    C_values : iterable of float, default (0.1, 1, 10)
        Regularization strengths to try.
    test_size : float, default 0.2
        Fraction of the data held out as the final test set.
    val_size : float, default 0.2
        Fraction of the training part held out for model selection.
    random_state : int, default 42
        Seed used for both splits.

    Returns
    -------
    (best_model, best_params)
        The SVR refit on the full training part and a dict with its
        ``kernel`` and ``C``. The model was fit on standardized features;
        the scaler is not returned, so callers must standardize new inputs
        the same way before predicting.

    Raises
    ------
    ValueError
        If the search grid is empty or no candidate yields a finite
        validation error.
    """
    # Final hold-out split: the test part is only touched for the last report
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Inner split used exclusively to compare hyper-parameter candidates
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=val_size, random_state=random_state
    )

    # Fit the selection scaler on the inner training rows only, so the
    # validation rows do not influence the scaling statistics
    selection_scaler = StandardScaler()
    X_fit_scaled = selection_scaler.fit_transform(X_fit)
    X_val_scaled = selection_scaler.transform(X_val)

    best_score = float('inf')
    best_params = {}

    # Quick grid search scored by validation MSE (lower is better)
    for kernel in kernels:
        for C in C_values:
            candidate = SVR(kernel=kernel, C=C)
            candidate.fit(X_fit_scaled, y_fit)
            mse = mean_squared_error(y_val, candidate.predict(X_val_scaled))

            if mse < best_score:
                best_score = mse
                best_params = {'kernel': kernel, 'C': C}

    if not best_params:
        raise ValueError(
            "No SVR configuration could be selected: the kernel/C grid is "
            "empty or no candidate produced a finite validation error."
        )

    # Refit the winner on the complete training part with a fresh scaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    best_model = SVR(**best_params)
    best_model.fit(X_train_scaled, y_train)

    y_pred_train = best_model.predict(X_train_scaled)
    y_pred_test = best_model.predict(X_test_scaled)

    def print_metrics(y_true, y_pred, set_name):
        """Print MSE, MAE, R-squared and RMSE for one data split."""
        mse = mean_squared_error(y_true, y_pred)
        mae = mean_absolute_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        rmse = np.sqrt(mse)

        print(f"\n{set_name} Set Metrics:")
        print(f"Mean Squared Error: {mse:.4f}")
        print(f"Mean Absolute Error: {mae:.4f}")
        print(f"R-squared: {r2:.4f}")
        print(f"Root Mean Squared Error: {rmse:.4f}")

    # Print results
    print("\nBest Parameters:")
    print(best_params)

    print_metrics(y_train, y_pred_train, "Training")
    print_metrics(y_test, y_pred_test, "Test")

    return best_model, best_params

# Run the SVR search on the cleaned features and target; the call prints the
# chosen parameters and metrics and returns the selected model with its parameters.
model, best_params = quick_svm_optimization(X, y)


Best Parameters:
{'kernel': 'linear', 'C': 1}

Training Set Metrics:
Mean Squared Error: 0.0004
Mean Absolute Error: 0.0164
R-squared: 0.8396
Root Mean Squared Error: 0.0204

Test Set Metrics:
Mean Squared Error: 0.0004
Mean Absolute Error: 0.0164
R-squared: 0.8395
Root Mean Squared Error: 0.0204


In [None]:
# NOTE: disabled Decision Tree grid-search experiment, kept commented out for reference only.
# import numpy as np
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# scaler = StandardScaler()
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import GridSearchCV
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# param_grid = {
#     'max_depth': [None, 5, 10, 15, 20, 25],
#     'min_samples_split': [2, 5, 10, 15],
#     'min_samples_leaf': [1, 2, 4, 6],
#     'max_features': [None, 'sqrt', 'log2']  # Corrected values
# }

# # Create Decision Tree Regressor
# dt_regressor = DecisionTreeRegressor(random_state=42)

# # Perform Grid Search with Cross-Validation
# grid_search = GridSearchCV(
#     estimator=dt_regressor,
#     param_grid=param_grid,
#     cv=5,
#     scoring='r2',
#     n_jobs=-1
# )

# # Fit Grid Search
# grid_search.fit(X_train, y_train)

# # Best model
# best_dt = grid_search.best_estimator_

# # Predictions
# y_train_pred = best_dt.predict(X_train)
# y_test_pred = best_dt.predict(X_test)

# # Evaluation Metrics
# print("Best Parameters:", grid_search.best_params_)
# print("\nTraining Set Metrics:")
# print("Mean Squared Error:", mean_squared_error(y_train, y_train_pred))
# print("Mean Absolute Error:", mean_absolute_error(y_train, y_train_pred))
# print("R-squared:", r2_score(y_train, y_train_pred))
# print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_train, y_train_pred)))

# print("\nTesting Set Metrics:")
# print("Mean Squared Error:", mean_squared_error(y_test, y_test_pred))
# print("Mean Absolute Error:", mean_absolute_error(y_test, y_test_pred))
# print("R-squared:", r2_score(y_test, y_test_pred))
# print("Root Mean Squared Error:", np.sqrt(mean_squared_error(y_test, y_test_pred)))

Best Parameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 6, 'min_samples_split': 15}

Training Set Metrics:
Mean Squared Error: 0.0013828548408835908
Mean Absolute Error: 0.029825658764238264
R-squared: 0.46910391462443746
Root Mean Squared Error: 0.037186756256543683

Testing Set Metrics:
Mean Squared Error: 0.001813339369845602
Mean Absolute Error: 0.03418918283148939
R-squared: 0.30236746551884397
Root Mean Squared Error: 0.04258332267268023


In [5]:
# Imports for the tree-based experiments, grouped ahead of the statements that use them
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Standardizer instance; it is only created here, not fitted
scaler = StandardScaler()

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [6]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Hyper-parameter combinations to evaluate, one dict per trial
param_trials = [
    dict(n_estimators=200, max_depth=20, min_samples_split=5, min_samples_leaf=2, max_features='sqrt'),
    dict(n_estimators=300, max_depth=25, min_samples_split=10, min_samples_leaf=4, max_features='sqrt'),
    dict(n_estimators=500, max_depth=None, min_samples_split=15, min_samples_leaf=6, max_features='log2'),
    dict(n_estimators=1000, max_depth=15, min_samples_split=20, min_samples_leaf=10, max_features=None),
]

# Fit and score one random forest per trial
for trial in param_trials:
    print(f"Testing parameters: {trial}")

    # The trial keys are exactly the constructor keywords, so unpack them directly
    RF_model = RandomForestRegressor(random_state=42, **trial)
    RF_model.fit(X_train, y_train)

    # Predict on both splits so the train/test gap can be compared
    y_pred_train = RF_model.predict(X_train)
    y_pred_test = RF_model.predict(X_test)

    # Scores on the training split
    mse_train = mean_squared_error(y_train, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    mae_train = mean_absolute_error(y_train, y_pred_train)
    r2_train = r2_score(y_train, y_pred_train)

    # Scores on the test split
    mse_test = mean_squared_error(y_test, y_pred_test)
    rmse_test = np.sqrt(mse_test)
    mae_test = mean_absolute_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)

    # Report test first, then train, followed by a separator line
    print(f"Test Set Metrics: R2: {r2_test:.4f}, RMSE: {rmse_test:.4f}, MAE: {mae_test:.4f}")
    print(f"Train Set Metrics: R2: {r2_train:.4f}, RMSE: {rmse_train:.4f}, MAE: {mae_train:.4f}")
    print("-" * 60)


Testing parameters: {'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt'}
Test Set Metrics: R2: 0.6270, RMSE: 0.0311, MAE: 0.0255
Train Set Metrics: R2: 0.8569, RMSE: 0.0193, MAE: 0.0159
------------------------------------------------------------
Testing parameters: {'n_estimators': 300, 'max_depth': 25, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_features': 'sqrt'}
Test Set Metrics: R2: 0.6214, RMSE: 0.0314, MAE: 0.0257
Train Set Metrics: R2: 0.8089, RMSE: 0.0223, MAE: 0.0181
------------------------------------------------------------
Testing parameters: {'n_estimators': 500, 'max_depth': None, 'min_samples_split': 15, 'min_samples_leaf': 6, 'max_features': 'log2'}
Test Set Metrics: R2: 0.6067, RMSE: 0.0320, MAE: 0.0262
Train Set Metrics: R2: 0.7562, RMSE: 0.0252, MAE: 0.0205
------------------------------------------------------------
Testing parameters: {'n_estimators': 1000, 'max_depth': 15, 'min_samples_split': 20,