## Class Distribution Plot

## Data Cleaning

In [1]:
# Per-column count of missing entries.
df.isna().sum()

NameError: name 'df' is not defined

In [None]:
# Express each column's missing-value count as a percentage of all rows.
total_rows = df.shape[0]
null_counts = df.isna().sum()
null_percentage = null_counts.div(total_rows).mul(100)

print(null_percentage)

## Compute the correlation matrix and heatmap for columns with null values to decide which columns to impute and which to drop

In [None]:
# Columns listed as containing missing values; they are correlated with each
# other and with the FloodProbability column.
columns_with_missing = [
    'MonsoonIntensity',
    'TopographyDrainage',
    'RiverManagement',
    'ClimateChange',
    'Siltation',
    'AgriculturalPractices',
    'IneffectiveDisasterPreparedness',
    'DrainageSystems',
    'CoastalVulnerability',
    'DeterioratingInfrastructure',
    'PopulationScore',
    'WetlandLoss',
    'InadequatePlanning',
    'ClimateAnthropogenicInteraction',
    'InfrastructurePreventionInteraction',
]

correlation_columns = [*columns_with_missing, 'FloodProbability']
correlation_matrix = df[correlation_columns].corr()

# Last expression of the cell: renders the matrix as the cell output.
correlation_matrix

In [None]:
# Annotated heatmap of the correlation matrix; the diverging palette is
# centred on zero so positive and negative correlations are easy to tell apart.
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    linewidths=.5,
)
heatmap_ax.set_title('Correlation Matrix with FloodProbability')
plt.show()

In [None]:
'''Rationale for Handling Null Values (Impute vs. Drop) Based on Correlation Analysis
In our analysis, we carefully examined the correlation matrix to guide our decision on handling null values. The decision not to drop rows with null values from certain columns was influenced by the following considerations:

Columns with Stronger Correlations:

ClimateAnthropogenicInteraction and InfrastructurePreventionInteraction: These columns exhibit strong correlations with the target variable, FloodProbability. Their imputation is crucial as they have a significant impact on flood predictions.
Less Impactful Columns:

Encroachments, PoliticalFactors, AgriculturalPractices, WetlandLoss, and InadequatePlanning: Although these columns have null values, their lower correlation with the target variable and other predictors suggests they have less impact on the overall model performance. Dropping rows with null values in these columns helps maintain a cleaner dataset while minimizing loss of important data.
Data Preservation:

Dropping rows with null values from columns that are less correlated but still potentially relevant can lead to significant data loss, affecting the overall integrity of the dataset. Imputation methods are employed to preserve as much data as possible, ensuring that the analysis remains comprehensive and robust.
By focusing on imputation for columns with higher correlation and impact, we aim to maintain the dataset's integrity while addressing null values in a way that enhances model accuracy and reliability.

'''

In [None]:
# Remove, in place, every row that is missing a value in any of these columns.
columns_to_drop = [
    'Encroachments',
    'PoliticalFactors',
    'AgriculturalPractices',
    'WetlandLoss',
    'InadequatePlanning',
]
df.dropna(axis=0, how='any', subset=columns_to_drop, inplace=True)

## 1.Median Imputation Code

In [None]:
# Fill gaps in these columns with each column's own median.
median_columns = ['Deforestation', 'Landslides', 'Watersheds']
column_medians = df[median_columns].median()
df[median_columns] = df[median_columns].fillna(column_medians)

## 2.Linear Interpolation Imputation code

In [None]:
# Fill gaps by linear interpolation between neighbouring rows.
interpolate_columns = ['Urbanization', 'DamsQuality']
# limit_direction='both' also fills NaNs that come before the first valid
# value; the default ('forward') would leave those leading gaps unfilled.
df[interpolate_columns] = df[interpolate_columns].interpolate(
    method='linear',
    limit_direction='both',
)

## 3.Iterative Imputation

In [None]:
# Model-based imputation: the imputer is fitted on, and fills gaps in, only
# the columns listed here.
columns_to_impute_iterative = [
    'MonsoonIntensity',
    'TopographyDrainage',
    'RiverManagement',
    'CoastalVulnerability',
    'ClimateAnthropogenicInteraction',
]
iterative_imputer = IterativeImputer()

imputed_iterative = iterative_imputer.fit_transform(df[columns_to_impute_iterative])
df[columns_to_impute_iterative] = imputed_iterative

## 4.Multiple Imputation by Chained Equations (MICE)

In [None]:
# Impute the remaining columns with a second, independently fitted imputer.
# NOTE(review): IterativeImputer() with default arguments produces a single
# imputation; true multiple imputation (MICE) would presumably need
# sample_posterior=True and repeated runs - confirm this is intended.
columns_to_impute_mice = [
    'ClimateChange',
    'Siltation',
    'DeterioratingInfrastructure',
    'InfrastructurePreventionInteraction',
    'IneffectiveDisasterPreparedness',
    'DrainageSystems',
    'PopulationScore',
]
mice_imputer = IterativeImputer()
imputed_mice = mice_imputer.fit_transform(df[columns_to_impute_mice])
df[columns_to_impute_mice] = imputed_mice

## Outliers
## Statistical methods
## Z-score
## Used because the data is approximately normally distributed

In [None]:
# Feature columns whose distributions are plotted. Defined in this cell so it
# runs top-to-bottom on its own instead of relying on a later assignment.
columns = ['MonsoonIntensity', 'TopographyDrainage', 'RiverManagement',
           'Deforestation', 'Urbanization', 'ClimateChange', 'DamsQuality',
           'Siltation', 'AgriculturalPractices', 'Encroachments',
           'IneffectiveDisasterPreparedness', 'DrainageSystems',
           'CoastalVulnerability', 'Landslides', 'Watersheds',
           'DeterioratingInfrastructure', 'PopulationScore', 'WetlandLoss',
           'InadequatePlanning', 'PoliticalFactors', 'ClimateAnthropogenicInteraction',
           'InfrastructurePreventionInteraction']

plots_per_row = 3
num_columns = len(columns)
# Ceiling division: enough rows to hold every column at three plots per row.
num_rows = (num_columns + plots_per_row - 1) // plots_per_row

# Create subplots (squeeze=False keeps `axes` two-dimensional for any grid size)
fig, axes = plt.subplots(
    nrows=num_rows,
    ncols=plots_per_row,
    figsize=(18, 4 * num_rows),
    squeeze=False,
)
axes = axes.flatten()

# Plot each column on its own axes; NaNs are excluded from the histogram
for ax, column in zip(axes, columns):
    sns.histplot(df[column].dropna(), bins=30, kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')

# Remove unused subplots (grid cells beyond the last plotted column)
for ax in axes[num_columns:]:
    fig.delaxes(ax)

plt.tight_layout()
plt.show()

In [None]:
# Feature columns screened for outliers.
columns = [
    'MonsoonIntensity',
    'TopographyDrainage',
    'RiverManagement',
    'Deforestation',
    'Urbanization',
    'ClimateChange',
    'DamsQuality',
    'Siltation',
    'AgriculturalPractices',
    'Encroachments',
    'IneffectiveDisasterPreparedness',
    'DrainageSystems',
    'CoastalVulnerability',
    'Landslides',
    'Watersheds',
    'DeterioratingInfrastructure',
    'PopulationScore',
    'WetlandLoss',
    'InadequatePlanning',
    'PoliticalFactors',
    'ClimateAnthropogenicInteraction',
    'InfrastructurePreventionInteraction',
]

def detect_outliers_zscore(df, columns, threshold=3):
    """Return the row labels whose absolute z-score exceeds ``threshold``.

    Z-scores are computed per column on the non-null values only, and the
    resulting mask is applied to that same non-null subset, so columns that
    still contain NaNs are handled without a length mismatch.

    Args:
        df: DataFrame holding the columns to inspect.
        columns: Iterable of column names to check.
        threshold: Absolute z-score above which a value is flagged.

    Returns:
        dict mapping each column name to the list of index labels flagged
        as outliers (an empty list when nothing is flagged).
    """
    outliers = {}
    for column in columns:
        values = df[column].dropna()
        if values.empty:
            # Nothing to score; avoids calling zscore on an empty array.
            outliers[column] = []
            continue
        z_scores = np.abs(stats.zscore(values))
        # NaN z-scores (zero-variance column) compare False and are not flagged.
        mask = np.asarray(z_scores > threshold)
        outliers[column] = values.index[mask].tolist()
    return outliers
    
# Flag outliers in every feature column and report the count per column.
zscore_outliers = detect_outliers_zscore(df, columns)
summary_lines = [
    f"{column}: {len(indices)} outliers"
    for column, indices in zscore_outliers.items()
]
for line in summary_lines:
    print(line)

## Dropping outliers based on per-column z-scores and domain knowledge

In [None]:
# Feature columns considered for outlier removal, in processing order.
columns = [
    'MonsoonIntensity',
    'TopographyDrainage',
    'RiverManagement',
    'Deforestation',
    'Urbanization',
    'ClimateChange',
    'DamsQuality',
    'Siltation',
    'AgriculturalPractices',
    'Encroachments',
    'IneffectiveDisasterPreparedness',
    'DrainageSystems',
    'CoastalVulnerability',
    'Landslides',
    'Watersheds',
    'DeterioratingInfrastructure',
    'PopulationScore',
    'WetlandLoss',
    'InadequatePlanning',
    'PoliticalFactors',
    'ClimateAnthropogenicInteraction',
    'InfrastructurePreventionInteraction',
]

# Number of most-extreme rows (by absolute z-score) to remove per column;
# a count of 0 leaves that column untouched.
outliers_to_drop = dict(
    MonsoonIntensity=5,
    TopographyDrainage=2833,
    RiverManagement=7608,
    Deforestation=0,
    Urbanization=3022,
    ClimateChange=4793,
    DamsQuality=3123,
    Siltation=2949,
    AgriculturalPractices=2773,
    Encroachments=3033,
    IneffectiveDisasterPreparedness=4872,
    DrainageSystems=5334,
    CoastalVulnerability=5573,
    Landslides=2807,
    Watersheds=2986,
    DeterioratingInfrastructure=5009,
    PopulationScore=5134,
    WetlandLoss=0,
    InadequatePlanning=0,
    PoliticalFactors=0,
    ClimateAnthropogenicInteraction=6194,
    InfrastructurePreventionInteraction=7300,
)

def drop_outliers_zscore(df, columns, outliers_to_drop):
    """Drop, in place, the N most extreme rows per column by absolute z-score.

    Columns are processed in the given order and z-scores are recomputed on
    the rows that remain after earlier columns have been handled.

    Args:
        df: DataFrame to modify in place.
        columns: Iterable of column names to process.
        outliers_to_drop: Mapping of column name to the number of rows to
            remove; missing columns or counts <= 0 are skipped.

    Returns:
        None. ``df`` is mutated.
    """
    for column in columns:
        n_to_drop = outliers_to_drop.get(column, 0)
        if n_to_drop <= 0:
            continue

        # Score only the non-null values, keeping their original row labels.
        values = df[column].dropna()
        if values.empty:
            continue
        abs_z = pd.Series(stats.zscore(values), index=values.index).abs()

        # A zero-variance column yields NaN z-scores; those rows are not
        # outliers and must not be picked up by head() below.
        abs_z = abs_z.dropna()

        worst_labels = abs_z.sort_values(ascending=False).head(n_to_drop).index
        # Drop the rows with the top N outliers
        df.drop(index=worst_labels, inplace=True)

# Apply the per-column drop counts to df in place, then report its new size.
drop_outliers_zscore(
    df=df,
    columns=columns,
    outliers_to_drop=outliers_to_drop,
)

print("Updated DataFrame shape:", df.shape)


## SimpleImputer with Mean Imputation for outliers

In [None]:
# Replace any remaining missing values in these columns with the column mean.
columns_to_impute = [
    'TopographyDrainage',
    'Urbanization',
    'DamsQuality',
    'Siltation',
    'AgriculturalPractices',
    'Landslides',
    'Watersheds',
    'InadequatePlanning',
    'ClimateAnthropogenicInteraction',
    'InfrastructurePreventionInteraction',
]

mean_imputer = SimpleImputer(strategy='mean')

mean_imputed = mean_imputer.fit_transform(df[columns_to_impute])
df[columns_to_impute] = mean_imputed

