# Balance the Dataset

## Installations and Libraries

In [4]:
# Imports
!pip install dask[complete] dask-ml imbalanced-learn

import os
import dask.dataframe as dd
from dask_ml.preprocessing import DummyEncoder
from dask_ml.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import numpy as np
import pandas as pd

zsh:1: no matches found: dask[complete]


## Load the Data

In [5]:
# Data Loading
# Check the current working directory
print("Current working directory:", os.getcwd())

# Define the path to your CSV file.
# The data directory can be overridden with the CAPSTONE_DATA_DIR environment variable so the
# notebook can run on other machines; when the variable is not set, the original location is used.
file_path = os.path.join(
    os.environ.get('CAPSTONE_DATA_DIR', '/Users/minarandolf/Capstone/Capstone-Project/datasets'),
    'contract_undersampling.csv',
)

# Load the CSV file into a DataFrame
# NOTE(review): the recorded output suggests pandas emitted a warning on this call (possibly a
# mixed-dtype DtypeWarning) - consider passing an explicit `dtype=` mapping once the intended
# column types are confirmed.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the problem and stop here: every later cell needs `df`, so continuing would only
    # fail further down with an unrelated NameError.
    print(f"The file {file_path} does not exist.")
    raise
else:
    # Quick overview of what was loaded: size, column names, dtypes and the first rows
    print(f"Number of observations: {len(df)}")
    print("Column names:")
    print(df.columns)
    print("Column types:")
    print(df.dtypes)
    print(df.head())

Current working directory: /Users/minarandolf/Capstone/Capstone-Project/datasets


  df = pd.read_csv(file_path)


Number of observations: 11574439
Column names:
Index(['ANO_SID', 'CORPORATE_DEVISION', 'ORTPLZ', 'ORTS-NAME', 'STRASSE',
       'SUM_INSURED', 'CONSTRACTION_DESIGN', 'CONSTRUCTION_YEAR', 'WFL',
       'ZONE', 'SF-SYSTEM', 'TYPE_OF_DEDUCTIBLE', 'DRAIN_PIPE_INSURED',
       'PRODUCTLINE', 'PRIOR_DAMAGES', 'UVV-KZ', 'UNDERWRITER', 'PARTY-ID',
       'contract_year', 'PIPE_PREMIUM_AMOUNT', 'YEAR', 'DAMAGE'],
      dtype='object')
Column types:
ANO_SID                float64
CORPORATE_DEVISION      object
ORTPLZ                 float64
ORTS-NAME               object
STRASSE                 object
SUM_INSURED            float64
CONSTRACTION_DESIGN     object
CONSTRUCTION_YEAR      float64
WFL                    float64
ZONE                    object
SF-SYSTEM              float64
TYPE_OF_DEDUCTIBLE       int64
DRAIN_PIPE_INSURED       int64
PRODUCTLINE             object
PRIOR_DAMAGES            int64
UVV-KZ                   int64
UNDERWRITER             object
PARTY-ID                objec

In [6]:
# Data Analysis
# Class balance of the target variable 'DAMAGE': absolute counts first, then relative shares
damage_counts = df['DAMAGE'].value_counts()
damage_percentage = df['DAMAGE'].value_counts(normalize=True).mul(100)

# Report the absolute number of rows per class
print("Number of observations for each class in 'DAMAGE':", damage_counts, sep="\n")

# Report the same distribution expressed in percent
print("\nPercentage distribution of each class in 'DAMAGE':", damage_percentage, sep="\n")

Number of observations for each class in 'DAMAGE':
DAMAGE
0    11389665
1      184774
Name: count, dtype: int64

Percentage distribution of each class in 'DAMAGE':
DAMAGE
0    98.403603
1     1.596397
Name: proportion, dtype: float64


## Balance the Data using SMOTE

In [10]:
# Define function to load data in chunks and apply SMOTE incrementally
def load_and_balance_data(file_path, chunk_size=100000, sampler=None, categorical_columns=None):
    """Read a CSV in chunks, one-hot encode and resample each chunk, and stack the results.

    Each chunk is processed independently: rows whose 'contract_year' cannot be parsed as a
    date are dropped, the categorical columns are one-hot encoded, all other feature columns
    are coerced to numbers (unparseable values become 0), and the chunk is resampled with
    ``sampler.fit_resample``. Because every chunk only sees its own category levels, the
    encoded chunks are aligned by column NAME on the union of all columns before stacking;
    a level that never occurred in a chunk is filled with 0 there. One reference level per
    categorical column (the alphabetically first dummy column seen in the whole file) is then
    dropped, so the same level is the baseline in every chunk.

    Resampling happens per chunk, so synthetic samples are only built from neighbours inside
    the same chunk; this is an approximation of resampling the whole dataset at once.

    NOTE(review): the result is a dense array with one column per category level. If columns
    such as street or party identifiers are high-cardinality (not visible from here), this
    can require a very large amount of memory - confirm before running on the full file.

    Parameters
    ----------
    file_path : str
        Path of the CSV file. It must contain the columns 'contract_year' and 'DAMAGE' and
        every column named in ``categorical_columns``.
    chunk_size : int, default 100000
        Number of CSV rows read and resampled at a time.
    sampler : object with ``fit_resample(X, y)``, optional
        Resampler applied to each chunk. Defaults to ``SMOTE(random_state=42)``.
    categorical_columns : list of str, optional
        Columns to one-hot encode. Defaults to the categorical columns of the contract dataset.

    Returns
    -------
    X_resampled : numpy.ndarray
        Float array with the resampled feature rows of all chunks.
    y_resampled : numpy.ndarray
        Target values matching the rows of ``X_resampled``.

    Raises
    ------
    ValueError
        If no rows remain after dropping invalid 'contract_year' values. Errors raised by the
        sampler for a chunk it cannot resample (for example a chunk that lacks one of the
        classes) are propagated unchanged.
    """
    if sampler is None:
        sampler = SMOTE(random_state=42)
    if categorical_columns is None:
        categorical_columns = ['CORPORATE_DEVISION', 'ORTS-NAME', 'STRASSE', 'CONSTRACTION_DESIGN',
                               'ZONE', 'PRODUCTLINE', 'UNDERWRITER', 'PARTY-ID']
    else:
        categorical_columns = list(categorical_columns)

    balanced_data_X = []
    balanced_data_y = []
    # Every dummy column name produced so far, per source column; needed to pick one global
    # reference level after all chunks have been seen.
    dummy_levels = {col: set() for col in categorical_columns}

    # Process data in chunks
    for chunk in pd.read_csv(file_path, chunksize=chunk_size):
        # Convert 'contract_year' to datetime
        chunk['contract_year'] = pd.to_datetime(chunk['contract_year'], errors='coerce')

        # Drop rows with NaN values in 'contract_year'
        chunk = chunk.dropna(subset=['contract_year'])
        if chunk.empty:
            # Nothing left to resample; the sampler would fail on an empty frame.
            continue

        # Separate features and target
        features = chunk.drop('DAMAGE', axis=1)
        y_chunk = chunk['DAMAGE']

        # Ensure the non-categorical columns are numeric (datetimes become integers)
        numeric_part = (
            features.drop(columns=categorical_columns)
            .apply(pd.to_numeric, errors='coerce')
            .fillna(0)
        )

        # Convert categorical columns to dummy variables. All levels are kept here; the
        # reference level is dropped once, globally, after the loop.
        encoded_parts = []
        for col in categorical_columns:
            dummies = pd.get_dummies(features[col], prefix=col)
            dummy_levels[col].update(dummies.columns)
            encoded_parts.append(dummies)
        X_chunk = pd.concat([numeric_part] + encoded_parts, axis=1)

        # Apply the sampler to the chunk
        X_resampled_chunk, y_resampled_chunk = sampler.fit_resample(X_chunk, y_chunk)

        # Keep the column names so the chunks can be aligned by name afterwards, even if the
        # sampler returned a plain array.
        balanced_data_X.append(pd.DataFrame(X_resampled_chunk, columns=X_chunk.columns))
        balanced_data_y.append(y_resampled_chunk)

    if not balanced_data_X:
        raise ValueError(f"No usable rows found in {file_path}; nothing to resample.")

    # One baseline level per categorical column, identical for every chunk
    reference_levels = {min(levels) for levels in dummy_levels.values() if levels}

    # Union of all columns in order of first appearance, without the baseline levels
    seen_columns = dict.fromkeys(col for frame in balanced_data_X for col in frame.columns)
    all_columns = [col for col in seen_columns if col not in reference_levels]

    # Concatenate all the resampled chunks, aligned on the shared column layout
    X_resampled = np.vstack([
        frame.reindex(columns=all_columns, fill_value=0).to_numpy(dtype=float)
        for frame in balanced_data_X
    ])
    y_resampled = np.hstack(balanced_data_y)

    return X_resampled, y_resampled