# Balance the Dataset

## Installations and Libraries

In [18]:
# Imports
!pip install dask[complete] dask-ml imbalanced-learn

import os
import dask.dataframe as dd
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

zsh:1: no matches found: dask[complete]


## Load the Data

In [19]:
# Data Loading
# Check the current working directory
print("Current working directory:", os.getcwd())

# Define the path to your CSV file
file_path = '/Users/minarandolf/Capstone/Capstone-Project/datasets/contracts_clean_final.csv'

# Load the CSV file into a DataFrame
try:
    df = pd.read_csv(file_path)
    print(f"Number of observations: {len(df)}")
    print("Column names:")
    print(df.columns)
    print("Column types:")
    print(df.dtypes)
    print(df.head())
except FileNotFoundError:
    print(f"The file {file_path} does not exist in the current working directory.")

Current working directory: /Users/minarandolf/Capstone/Capstone-Project/datasets


  df = pd.read_csv(file_path)


Number of observations: 10353244
Column names:
Index(['Unnamed: 0', 'ANO_SID', 'CORPORATE_DEVISION', 'Bundesland', 'Kreis',
       'Typ', 'ORTPLZ', 'ORTS-NAME', 'STRASSE', 'SUM_INSURED',
       'CONSTRACTION_DESIGN', 'CONSTRUCTION_YEAR', 'WFL', 'ZONE', 'SF-SYSTEM',
       'TYPE_OF_DEDUCTIBLE', 'DRAIN_PIPE_INSURED', 'PRODUCTLINE',
       'PRIOR_DAMAGES', 'UVV-KZ', 'UNDERWRITER', 'PARTY-ID', 'contract_year',
       'PIPE_PREMIUM_AMOUNT', 'YEAR', 'DAMAGE_FLOOD_ZONE',
       'DAMAGE_HEAVY_RAIN_ZONE', 'LONGITUDE', 'LATITUDE', 'DAMAGE'],
      dtype='object')
Column types:
Unnamed: 0                  int64
ANO_SID                   float64
CORPORATE_DEVISION         object
Bundesland                 object
Kreis                      object
Typ                        object
ORTPLZ                      int64
ORTS-NAME                  object
STRASSE                    object
SUM_INSURED               float64
CONSTRACTION_DESIGN        object
CONSTRUCTION_YEAR         float64
WFL               

In [20]:
# Data Analysis
# Count the occurrences of each class in the target variable 'DAMAGE'
damage_counts = df['DAMAGE'].value_counts()

# Print the results
print("Number of observations for each class in 'DAMAGE':")
print(damage_counts)

# Display the percentage distribution
damage_percentage = df['DAMAGE'].value_counts(normalize=True) * 100
print("\nPercentage distribution of each class in 'DAMAGE':")
print(damage_percentage)

Number of observations for each class in 'DAMAGE':
DAMAGE
0    10203172
1      150072
Name: count, dtype: int64

Percentage distribution of each class in 'DAMAGE':
DAMAGE
0    98.550483
1     1.449517
Name: proportion, dtype: float64


## Creating a subset

In [21]:
# Create a smaller subset of the dataset with balanced years and similar distribution of 'DAMAGE'
subset_size = 20000
years = df['contract_year'].unique()
subset_list = []

for year in years:
    year_data = df[df['contract_year'] == year]
    no_damage = year_data[year_data['DAMAGE'] == 0]
    damage = year_data[year_data['DAMAGE'] == 1]
    
    # Calculate the sample size for each year based on the total subset size
    sample_size = subset_size // len(years)
    
    # Ensure the sample size is proportionate to the overall distribution
    no_damage_sample = no_damage.sample(n=int(sample_size * 0.985), random_state=42, replace=True)
    damage_sample = damage.sample(n=int(sample_size * 0.015), random_state=42, replace=True)
    
    subset_list.append(pd.concat([no_damage_sample, damage_sample]))

subset_df = pd.concat(subset_list).reset_index(drop=True)

In [22]:
# Check the amount of observations in the subset and the percentage distribution of each class in 'DAMAGE'
print(f"Number of observations in the subset: {len(subset_df)}")

subset_damage_counts = subset_df['DAMAGE'].value_counts()
print("Number of observations for each class in 'DAMAGE' (subset):")
print(subset_damage_counts)

subset_damage_percentage = subset_df['DAMAGE'].value_counts(normalize=True) * 100
print("\nPercentage distribution of each class in 'DAMAGE' (subset):")
print(subset_damage_percentage)

# Print the number of observations for each year in the subset
print("\nNumber of observations for each year in the subset:")
print(subset_df['contract_year'].value_counts())

Number of observations in the subset: 19987
Number of observations for each class in 'DAMAGE' (subset):
DAMAGE
0    19690
1      297
Name: count, dtype: int64

Percentage distribution of each class in 'DAMAGE' (subset):
DAMAGE
0    98.514034
1     1.485966
Name: proportion, dtype: float64

Number of observations for each year in the subset:
contract_year
2014-01-01    1817
2015-01-01    1817
2016-01-01    1817
2017-01-01    1817
2018-01-01    1817
2019-01-01    1817
2020-01-01    1817
2021-01-01    1817
2022-01-01    1817
2023-01-01    1817
2024-01-01    1817
Name: count, dtype: int64


In [23]:
# Save the subset before SMOTE to a CSV file
subset_df.to_csv('/Users/minarandolf/Capstone/Capstone-Project/datasets/subset_before_smote.csv', index=False)

## Balance the Data using SMOTE

In [26]:
# Define function to load data in chunks and apply SMOTE incrementally
def load_and_balance_data(df, chunk_size=5000):
    smote = SMOTE(random_state=42)
    balanced_data_X = []
    balanced_data_y = []

    # Process data in chunks
    for start in range(0, len(df), chunk_size):
        end = start + chunk_size
        chunk = df.iloc[start:end]
        
        # Separate features and target
        X_chunk = chunk.drop('DAMAGE', axis=1)
        y_chunk = chunk['DAMAGE']
        
        # Ensure numeric columns are in correct format
        X_chunk = X_chunk.apply(pd.to_numeric, errors='coerce').fillna(0)
        
        # Apply SMOTE to the chunk
        X_resampled_chunk, y_resampled_chunk = smote.fit_resample(X_chunk, y_chunk)
        
        balanced_data_X.append(X_resampled_chunk)
        balanced_data_y.append(y_resampled_chunk)
    
    # Concatenate all the resampled chunks
    X_resampled = np.vstack(balanced_data_X)
    y_resampled = np.hstack(balanced_data_y)

    return X_resampled, y_resampled

# Load and balance the subset data
X_resampled, y_resampled = load_and_balance_data(subset_df)

# Combine the balanced data into a DataFrame
balanced_df = pd.DataFrame(X_resampled, columns=subset_df.drop('DAMAGE', axis=1).columns)
balanced_df['DAMAGE'] = y_resampled

# Save the balanced subset to a CSV file
balanced_df.to_csv('/Users/minarandolf/Capstone/Capstone-Project/datasets/subset_after_smote.csv', index=False)

# Check the distribution after SMOTE
print("Distribution after SMOTE:")
print(pd.Series(y_resampled).value_counts())

Distribution after SMOTE:
0    19690
1    19690
Name: count, dtype: int64
