<a href="https://colab.research.google.com/github/micah-shull/pipelines/blob/main/pipelines_05_pytorch_sklearn_pipeline_02_feature_engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Baseline Model Pipeline with Sklearn Wrapper

In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, precision_recall_curve, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_excel(url, header=1)

# Rename columns to lower case and replace spaces with underscores
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Select features and target
target = 'default_payment_next_month'
X = df.drop(columns=[target])
y = df[target]

# Perform stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify column types
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Define preprocessing for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Fit and transform the data
preprocessing_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
X_train_processed = preprocessing_pipeline.fit_transform(X_train)
X_test_processed = preprocessing_pipeline.transform(X_test)

# Apply SMOTE to balance the training dataset
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_resampled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_resampled.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test_processed, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

optimal_threshold = 0.8000141978263855

# Define a simple neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the sklearn wrapper for the neural network model
class SklearnSimpleNN(BaseEstimator, ClassifierMixin):
    def __init__(self, input_dim, learning_rate=0.001, epochs=50, batch_size=64, pos_weight=1.0):
        self.input_dim = input_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.pos_weight = pos_weight  # Accept as float
        self.model = SimpleNN(self.input_dim)

    def fit(self, X, y):
        # Convert pos_weight to tensor here
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(self.pos_weight, dtype=torch.float32))
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        train_dataset = torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).unsqueeze(1))
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            self.model.train()
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs, targets.view(-1, 1))
                loss.backward()
                optimizer.step()
        return self

    def predict(self, X):
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(torch.tensor(X, dtype=torch.float32))
            probabilities = torch.sigmoid(outputs)
            predictions = (probabilities > optimal_threshold).float()  # Use the manually adjusted threshold here
        return predictions.numpy().squeeze()

# Calculate the class weights
class_weights = len(y_train) / (2 * np.bincount(y_train))

# Create an instance of SklearnSimpleNN with the adjusted weight
input_dim = X_train_tensor.shape[1]
nn_estimator = SklearnSimpleNN(input_dim=input_dim, pos_weight=class_weights[1])

# Fit the model
nn_estimator.fit(X_train_tensor.numpy(), y_train_tensor.numpy())

# Predict on the test set with the optimal threshold
test_predictions_optimal_threshold = nn_estimator.predict(X_test_tensor.numpy())

# Evaluate the model with the optimal threshold
print(classification_report(y_test_tensor.numpy(), test_predictions_optimal_threshold))


              precision    recall  f1-score   support

         0.0       0.86      0.89      0.87      4673
         1.0       0.54      0.48      0.51      1327

    accuracy                           0.80      6000
   macro avg       0.70      0.68      0.69      6000
weighted avg       0.79      0.80      0.79      6000



### Rato Feature

The `RatioFeatures` transformer is a custom transformer class used in a data preprocessing pipeline. Its main purpose is to create new features by calculating the ratio between pairs of existing features. Specifically, in this context, it generates ratio features between the payment amounts (`pay_amt`) and the corresponding billing amounts (`bill_amt`) for different months.

#### What the `RatioFeatures` Transformer Does:

1. **Initialization**: The transformer is initialized without any parameters, meaning it doesn't require any specific setup when instantiated.

2. **Fit Method**: The `fit` method is a standard requirement for transformers in scikit-learn's API. In this transformer, it doesn't perform any action and simply returns `self`. This is because the transformation logic doesn't depend on the training data (it doesn't learn any parameters from the data).

3. **Transform Method**: The core functionality of the transformer is implemented in the `transform` method:
   - **Copying Data**: It starts by making a copy of the input DataFrame to avoid modifying the original data.
   - **Creating Ratios**: For each month (from 1 to 6), it computes the ratio of `pay_amt` to `bill_amt` and creates a new column for each ratio. For instance, `pay_amt1` is divided by `bill_amt1`, and the result is stored in a new column `pay_to_bill_ratio_1`. This process is repeated for all six months.
   - **Avoiding Division by Zero**: A small constant (`1e-9`) is added to the billing amounts in the denominator to prevent division by zero, which would otherwise result in `NaN` values.

### Purpose of Ratio Features:

The ratio features created by the `RatioFeatures` transformer can provide meaningful insights into the customer's payment behavior. For example:
- A high ratio might indicate that a customer is paying a large portion or all of their bill, which could be a positive indicator.
- A low ratio might indicate that a customer is paying only a small portion of their bill, which could be a negative indicator.

These new features can help improve the performance of machine learning models by providing additional information that might be more predictive of the target variable (in this case, whether the customer will default on their credit card payment next month).

### Example of the `RatioFeatures` Transformer Code:

```python
class RatioFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Create ratios for bill_amt and pay_amt columns
        for i in range(1, 7):
            bill_col = f'bill_amt{i}'
            pay_col = f'pay_amt{i}'
            ratio_col = f'pay_to_bill_ratio_{i}'
            X[ratio_col] = X[pay_col] / (X[bill_col] + 1e-9)  # Add a small constant to avoid division by zero
        return X
```

By including this transformer in a data preprocessing pipeline, we ensure that the ratio features are automatically created and included in the model training process, potentially enhancing the model's predictive power.

In [38]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, precision_recall_curve, f1_score
from imblearn.over_sampling import SMOTE

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_excel(url, header=1)

# Rename columns to lower case and replace spaces with underscores
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Select features and target
target = 'default_payment_next_month'
X = df.drop(columns=[target])
y = df[target]

# Perform stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify column types
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Custom transformer for ratio features
class RatioFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Create ratios for bill_amt and pay_amt columns
        for i in range(1, 7):
            bill_col = f'bill_amt{i}'
            pay_col = f'pay_amt{i}'
            ratio_col = f'pay_to_bill_ratio_{i}'
            X[ratio_col] = X[pay_col] / (X[bill_col] + 1e-9)  # Add a small constant to avoid division by zero
        return X

# Define preprocessing for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps and feature engineering
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('ratio', RatioFeatures(), numeric_features)
    ])

# Define feature engineering pipeline
feature_engineering_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the training data
X_train_fe = feature_engineering_pipeline.fit_transform(X_train)
X_test_fe = feature_engineering_pipeline.transform(X_test)

# Check for any remaining NaNs and impute them
imputer = SimpleImputer(strategy='median')
X_train_fe = imputer.fit_transform(X_train_fe)
X_test_fe = imputer.transform(X_test_fe)

# Apply SMOTE to balance the training dataset
smote = SMOTE(random_state=42)
X_train_resampled_fe, y_train_resampled_fe = smote.fit_resample(X_train_fe, y_train)

# Convert to PyTorch tensors
X_train_tensor_fe = torch.tensor(X_train_resampled_fe, dtype=torch.float32)  # Convert to dense
y_train_tensor_fe = torch.tensor(y_train_resampled_fe.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor_fe = torch.tensor(X_test_fe, dtype=torch.float32)  # Convert to dense
y_test_tensor_fe = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

optimal_threshold = 0.8000141978263855

# Define a simple neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the sklearn wrapper for the neural network model
class SklearnSimpleNN(BaseEstimator, ClassifierMixin):
    def __init__(self, input_dim, learning_rate=0.001, epochs=50, batch_size=64, pos_weight=1.0):
        self.input_dim = input_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.pos_weight = pos_weight  # Accept as float
        self.model = SimpleNN(self.input_dim)

    def fit(self, X, y):
        # Convert pos_weight to tensor here
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(self.pos_weight, dtype=torch.float32))
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        train_dataset = torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).unsqueeze(1))
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            self.model.train()
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs, targets.view(-1, 1))
                loss.backward()
                optimizer.step()
        return self

    def predict(self, X):
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(torch.tensor(X, dtype=torch.float32))
            probabilities = torch.sigmoid(outputs)
            predictions = (probabilities > optimal_threshold).float()  # Use the manually adjusted threshold here
        return predictions.numpy().squeeze()

# Calculate the class weights
class_weights = len(y_train) / (2 * np.bincount(y_train))

# Create an instance of SklearnSimpleNN with the adjusted weight
input_dim = X_train_tensor_fe.shape[1]
nn_estimator_fe = SklearnSimpleNN(input_dim=input_dim, pos_weight=class_weights[1])

# Fit the model with feature engineering
nn_estimator_fe.fit(X_train_tensor_fe.numpy(), y_train_tensor_fe.numpy())

# Predict on the test set with the optimal threshold
test_predictions_fe = nn_estimator_fe.predict(X_test_tensor_fe.numpy())

# Evaluate the model with the optimal threshold
print(classification_report(y_test_tensor_fe.numpy(), test_predictions_fe))


              precision    recall  f1-score   support

         0.0       0.80      0.89      0.84      4673
         1.0       0.37      0.22      0.28      1327

    accuracy                           0.74      6000
   macro avg       0.59      0.56      0.56      6000
weighted avg       0.71      0.74      0.72      6000



#### Analysis of the Classification Report

**Class 0 (No Default):**
- **Precision**: 0.80, meaning 80% of instances predicted as no default are actually no default.
- **Recall**: 0.89, meaning 89% of actual no default instances are correctly identified.
- **F1-score**: 0.84, which is the harmonic mean of precision and recall.
- **Support**: 4673 instances.

**Class 1 (Default):**
- **Precision**: 0.37, meaning 37% of instances predicted as default are actually default.
- **Recall**: 0.22, meaning only 22% of actual default instances are correctly identified.
- **F1-score**: 0.28, which is the harmonic mean of precision and recall.
- **Support**: 1327 instances.

**Overall Metrics:**
- **Accuracy**: 0.74, meaning 74% of all instances are correctly classified.
- **Macro Avg**: Precision: 0.59, Recall: 0.56, F1-score: 0.56.
- **Weighted Avg**: Precision: 0.71, Recall: 0.74, F1-score: 0.72.

### Interpretation

- The model performs well for Class 0 (no default) with high precision and recall.
- The performance for Class 1 (default) is relatively poor, with low precision and recall, resulting in a low F1-score.
- The overall accuracy is driven by the model's performance on the majority class (no default).

### Recommendations for Improvement

1. **Class Imbalance Handling**:
   - Despite using SMOTE, the model still struggles with class imbalance. Fine-tuning the oversampling ratio or experimenting with other techniques like ADASYN, or ensemble methods such as BalancedBaggingClassifier, might help.
   
2. **Feature Engineering**:
   - Explore additional feature engineering, such as creating more interaction features, or using domain knowledge to create features that might capture important aspects of the data.
   
3. **Threshold Tuning**:
   - Fine-tune the decision threshold specifically for Class 1 to improve precision and recall for the minority class.
   
4. **Model Complexity**:
   - Consider using a more complex model or an ensemble of models to capture more patterns in the data.
   
5. **Hyperparameter Tuning**:
   - Perform a thorough hyperparameter tuning using techniques like GridSearchCV or RandomizedSearchCV to find the optimal set of parameters for the model.

### Next Steps: Further Feature Engineering

1. **Interaction Features**:
   - Create interaction features between various columns to capture more complex relationships.
   
2. **Statistical Features**:
   - Calculate statistical features such as mean, median, variance, etc., for bill amounts and payment amounts.
   
3. **Behavioral Features**:
   - Features capturing behavioral patterns, such as the frequency of full payments or the number of months with no payment, could be insightful.

Let's implement additional feature engineering steps to capture more patterns in the data.




### Interaction Feature Engineering

### Summary of Changes

1. **Interaction Features**: Added a new transformer to create interaction features between bill amounts and payment amounts.
2. **Updated Preprocessing Pipeline**: Included the new interaction features transformer in the preprocessing pipeline.

These changes aim to capture more complex relationships in the data and potentially improve the model's performance on the minority class.

In [41]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, precision_recall_curve, f1_score
from imblearn.over_sampling import SMOTE

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_excel(url, header=1)

# Rename columns to lower case and replace spaces with underscores
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Select features and target
target = 'default_payment_next_month'
X = df.drop(columns=[target])
y = df[target]

# Perform stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify column types
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Custom transformer for ratio features
class RatioFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Create ratios for bill_amt and pay_amt columns
        for i in range(1, 7):
            bill_col = f'bill_amt{i}'
            pay_col = f'pay_amt{i}'
            ratio_col = f'pay_to_bill_ratio_{i}'
            X[ratio_col] = X[pay_col] / (X[bill_col] + 1e-9)  # Add a small constant to avoid division by zero
        return X

# Custom transformer for interaction features
class InteractionFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Create interaction features between bill_amt and pay_amt columns
        for i in range(1, 7):
            bill_col = f'bill_amt{i}'
            pay_col = f'pay_amt{i}'
            interaction_col = f'bill_pay_interaction_{i}'
            X[interaction_col] = X[bill_col] * X[pay_col]
        return X

# Define preprocessing for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps and feature engineering
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('ratio', RatioFeatures(), numeric_features),
        ('interaction', InteractionFeatures(), numeric_features)
    ])

# Define feature engineering pipeline
feature_engineering_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the training data
X_train_fe = feature_engineering_pipeline.fit_transform(X_train)
X_test_fe = feature_engineering_pipeline.transform(X_test)

# Check for any remaining NaNs and impute them
imputer = SimpleImputer(strategy='median')
X_train_fe = imputer.fit_transform(X_train_fe)
X_test_fe = imputer.transform(X_test_fe)

# Apply SMOTE to balance the training dataset
smote = SMOTE(random_state=42)
X_train_resampled_fe, y_train_resampled_fe = smote.fit_resample(X_train_fe, y_train)

# Convert to PyTorch tensors
X_train_tensor_fe = torch.tensor(X_train_resampled_fe, dtype=torch.float32)  # Convert to dense
y_train_tensor_fe = torch.tensor(y_train_resampled_fe.values, dtype=torch.float32).unsqueeze(1)
X_test_tensor_fe = torch.tensor(X_test_fe, dtype=torch.float32)  # Convert to dense
y_test_tensor_fe = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

optimal_threshold = 0.8000141978263855

# Define a simple neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 32)
        self.fc2 = nn.Linear(32, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the sklearn wrapper for the neural network model
class SklearnSimpleNN(BaseEstimator, ClassifierMixin):
    def __init__(self, input_dim, learning_rate=0.001, epochs=50, batch_size=64, pos_weight=1.0):
        self.input_dim = input_dim
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.batch_size = batch_size
        self.pos_weight = pos_weight  # Accept as float
        self.model = SimpleNN(self.input_dim)

    def fit(self, X, y):
        # Convert pos_weight to tensor here
        criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(self.pos_weight, dtype=torch.float32))
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        train_dataset = torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).unsqueeze(1))
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

        for epoch in range(self.epochs):
            self.model.train()
            for inputs, targets in train_loader:
                optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = criterion(outputs, targets.view(-1, 1))
                loss.backward()
                optimizer.step()
        return self

    def predict(self, X):
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(torch.tensor(X, dtype=torch.float32))
            probabilities = torch.sigmoid(outputs)
            predictions = (probabilities > optimal_threshold).float()  # Use the manually adjusted threshold here
        return predictions.numpy().squeeze()

# Calculate the class weights
class_weights = len(y_train) / (2 * np.bincount(y_train))

# Create an instance of SklearnSimpleNN with the adjusted weight
input_dim = X_train_tensor_fe.shape[1]
nn_estimator_fe = SklearnSimpleNN(input_dim=input_dim, pos_weight=class_weights[1])

# Fit the model with feature engineering
nn_estimator_fe.fit(X_train_tensor_fe.numpy(), y_train_tensor_fe.numpy())

# Predict on the test set with the optimal threshold
test_predictions_fe = nn_estimator_fe.predict(X_test_tensor_fe.numpy())

# Evaluate the model with the optimal threshold
print(classification_report(y_test_tensor_fe.numpy(), test_predictions_fe))


              precision    recall  f1-score   support

         0.0       0.78      0.99      0.87      4673
         1.0       0.26      0.01      0.02      1327

    accuracy                           0.77      6000
   macro avg       0.52      0.50      0.45      6000
weighted avg       0.67      0.77      0.68      6000




### Analysis of the Classification Report

The updated classification report indicates that the model's performance for the minority class (default) has worsened, with very low precision and recall for Class 1 (default). This suggests that the additional features have not improved the model's ability to identify defaults and may have even led to overfitting to the majority class.

### Interpretation

- The model performs very well for Class 0 (no default) with high recall, but this comes at the cost of poor performance for Class 1 (default).
- The precision and recall for Class 1 are both very low, indicating that the model is not identifying default instances effectively.

### Recommendations for Further Improvement

1. **Further Tuning of SMOTE**:
   - Experiment with different SMOTE ratios to see if this can help balance the classes more effectively.
   
2. **Alternative Sampling Methods**:
   - Try other resampling techniques such as ADASYN, RandomUnderSampler, or a combination of oversampling and undersampling.

3. **Ensemble Methods**:
   - Consider using ensemble methods such as Random Forest, Gradient Boosting, or XGBoost, which can often handle imbalanced data better.

4. **Model Complexity and Hyperparameter Tuning**:
   - Perform hyperparameter tuning for the neural network model, such as adjusting the learning rate, number of epochs, batch size, and architecture of the network.

5. **Feature Selection**:
   - Use feature selection techniques such as Recursive Feature Elimination (RFE) or feature importance scores from tree-based models to select the most relevant features.

### Implementation of Further Tuning of SMOTE and Alternative Sampling Methods

Let's first experiment with different SMOTE ratios to see if this can improve the model's performance.



In [42]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import classification_report, precision_recall_curve, f1_score
from imblearn.over_sampling import SMOTE

# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00350/default%20of%20credit%20card%20clients.xls"
df = pd.read_excel(url, header=1)

# Rename columns to lower case and replace spaces with underscores
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Select features and target
target = 'default_payment_next_month'
X = df.drop(columns=[target])
y = df[target]

# Perform stratified train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Identify column types
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Custom transformer for ratio features
class RatioFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Create ratios for bill_amt and pay_amt columns
        for i in range(1, 7):
            bill_col = f'bill_amt{i}'
            pay_col = f'pay_amt{i}'
            ratio_col = f'pay_to_bill_ratio_{i}'
            X[ratio_col] = X[pay_col] / (X[bill_col] + 1e-9)  # Add a small constant to avoid division by zero
        return X

# Custom transformer for interaction features
class InteractionFeatures(TransformerMixin, BaseEstimator):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        # Create interaction features between bill_amt and pay_amt columns
        for i in range(1, 7):
            bill_col = f'bill_amt{i}'
            pay_col = f'pay_amt{i}'
            interaction_col = f'bill_pay_interaction_{i}'
            X[interaction_col] = X[bill_col] * X[pay_col]
        return X

# Define preprocessing for numeric columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Define preprocessing for categorical columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing steps and feature engineering
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('ratio', RatioFeatures(), numeric_features),
        ('interaction', InteractionFeatures(), numeric_features)
    ])

# Define feature engineering pipeline
feature_engineering_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# Fit and transform the training data
X_train_fe = feature_engineering_pipeline.fit_transform(X_train)
X_test_fe = feature_engineering_pipeline.transform(X_test)

# Check for any remaining NaNs and impute them
imputer = SimpleImputer(strategy='median')
X_train_fe = imputer.fit_transform(X_train_fe)
X_test_fe = imputer.transform(X_test_fe)

# Experiment with different SMOTE ratios
smote_ratios = [0.5, 0.75, 1.0]
best_model = None
best_f1_score = 0
best_ratio = None

for ratio in smote_ratios:
    smote = SMOTE(sampling_strategy=ratio, random_state=42)
    X_train_resampled_fe, y_train_resampled_fe = smote.fit_resample(X_train_fe, y_train)

    # Convert to PyTorch tensors
    X_train_tensor_fe = torch.tensor(X_train_resampled_fe, dtype=torch.float32)  # Convert to dense
    y_train_tensor_fe = torch.tensor(y_train_resampled_fe.values, dtype=torch.float32).unsqueeze(1)
    X_test_tensor_fe = torch.tensor(X_test_fe, dtype=torch.float32)  # Convert to dense
    y_test_tensor_fe = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

    # Define a simple neural network model
    class SimpleNN(nn.Module):
        def __init__(self, input_dim):
            super(SimpleNN, self).__init__()
            self.fc1 = nn.Linear(input_dim, 32)
            self.fc2 = nn.Linear(32, 1)

        def forward(self, x):
            x = torch.relu(self.fc1(x))
            x = self.fc2(x)
            return x

    # Define the sklearn wrapper for the neural network model
    class SklearnSimpleNN(BaseEstimator, ClassifierMixin):
        def __init__(self, input_dim, learning_rate=0.001, epochs=50, batch_size=64, pos_weight=1.0):
            self.input_dim = input_dim
            self.learning_rate = learning_rate
            self.epochs = epochs
            self.batch_size = batch_size
            self.pos_weight = pos_weight  # Accept as float
            self.model = SimpleNN(self.input_dim)

        def fit(self, X, y):
            # Convert pos_weight to tensor here
            criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(self.pos_weight, dtype=torch.float32))
            optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
            train_dataset = torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32).unsqueeze(1))
            train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)

            for epoch in range(self.epochs):
                self.model.train()
                for inputs, targets in train_loader:
                    optimizer.zero_grad()
                    outputs = self.model(inputs)
                    loss = criterion(outputs, targets.view(-1, 1))
                    loss.backward()
                    optimizer.step()
            return self

        def predict(self, X):
            self.model.eval()
            with torch.no_grad():
                outputs = self.model(torch.tensor(X, dtype=torch.float32))
                probabilities = torch.sigmoid(outputs)
                predictions = (probabilities > optimal_threshold).float()  # Use the manually adjusted threshold here
            return predictions.numpy().squeeze()

    # Calculate the class weights
    class_weights = len(y_train) / (2 * np.bincount(y_train))

    # Create an instance of SklearnSimpleNN with the adjusted weight
    input_dim = X_train_tensor_fe.shape[1]
    nn_estimator_fe = SklearnSimpleNN(input_dim=input_dim, pos_weight=class_weights[1])

    # Fit the model with feature engineering
    nn_estimator_fe.fit(X_train_tensor_fe.numpy(), y_train_tensor_fe.numpy())

    # Predict on the test set with the optimal threshold
    test_predictions_fe = nn_estimator_fe.predict(X_test_tensor_fe.numpy())

    # Evaluate the model with the optimal threshold
    report = classification_report(y_test_tensor_fe.numpy(), test_predictions_fe, output_dict=True)
    f1 = report['1']['f1-score']

    if f1 > best_f1_score:
        best_f1_score = f1
        best_model = nn_estimator_fe
        best_ratio = ratio

# Print the best ratio and the classification report of the best model
print(f'Best SMOTE Ratio: {best_ratio}')
print(classification_report(y_test_tensor_fe.numpy(), best_model.predict(X_test_tensor_fe.numpy())))


KeyError: '1'