In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import imodels
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [4]:
df = pd.read_csv("data/pima/pima.csv")

In [14]:
df.head(100)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
95,6,144,72,27,228,33.9,0.255,40,0
96,2,92,62,28,0,31.6,0.130,24,0
97,1,71,48,18,76,20.4,0.323,22,0
98,6,93,50,30,64,28.7,0.356,23,0


In [6]:
df["Outcome"].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [7]:
# split data into test and train
X = df.drop(columns="Outcome")
y = df["Outcome"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Train a simple model

In [8]:
from sklearn.model_selection import GridSearchCV

# Define the model
log_reg = LogisticRegression()

# Define the hyperparameters and their values for tuning
param_grid = {
    "C": [0.01, 0.1, 1, 10, 100],
    "penalty": ["l1", "l2"],
    "solver": ["liblinear"],  # 'liblinear' is compatible with l1 and l2 penalties
}

# Set up the grid search
grid_search = GridSearchCV(log_reg, param_grid, cv=5, scoring="accuracy")

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best hyperparameters:", best_params)
print("Best cross-validation accuracy:", best_score)


# Train the final model with the best parameters
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

# Predict with the best model
y_pred_best = best_model.predict(X_test)

# Compute the f1 score
f1_best = f1_score(y_test, y_pred_best)
print("F1 score with the best model:", f1_best)

Best hyperparameters: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation accuracy: 0.7655204584832733
F1 score with the best model: 0.6545454545454545


In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

df = pd.read_csv("data/pima/pima.csv")

y = df["Outcome"]
X = df.drop(columns=["Outcome"])

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.20,
    random_state=1221,
    stratify=y_train,
)

scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_val = scaler.transform(x_val)

y_train = y_train.values
y_test = y_test.values
y_val = y_val.values

In [28]:
x_train

array([[0.05882353, 0.72727273, 0.67213115, ..., 0.61549925, 0.23694961,
        0.11666667],
       [0.47058824, 0.9040404 , 0.59016393, ..., 0.48733234, 0.28778938,
        0.25      ],
       [0.47058824, 0.60606061, 0.70491803, ..., 0.42324888, 0.0789832 ,
        0.01666667],
       ...,
       [0.17647059, 0.74747475, 0.54098361, ..., 0.48435171, 0.07762143,
        0.01666667],
       [0.23529412, 0.41919192, 0.70491803, ..., 0.4366617 , 0.10531094,
        0.21666667],
       [0.11764706, 0.56565657, 0.54098361, ..., 0.37257824, 0.10077167,
        0.05      ]])

In [18]:
y_train.shape

(491,)

In [19]:
x_train.shape

(491, 8)

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score
import torch
import torch.nn as nn
import torch.nn.functional as F


class LinearClassificationNet(nn.Module):
    """
    A fully-connected single-layer linear NN for classification.
    """

    def __init__(self, input_size=11, output_size=2):
        super(LinearClassificationNet, self).__init__()
        self.layer1 = nn.Linear(input_size, output_size, bias=False)

    def forward(self, x):
        x = self.layer1(x.float())
        return x


# Define a function to create and train the model
def create_and_train_model(lr, num_epochs):
    model = LinearClassificationNet(input_size=8, output_size=1)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    X_train_tensor = torch.tensor(x_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    return model


# Define a function to evaluate the model
def evaluate_model(model):
    X_test_tensor = torch.tensor(x_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test_tensor)
        test_predictions = torch.round(torch.sigmoid(test_outputs))
        accuracy = (
            test_predictions.eq(y_test_tensor).sum().item()
        ) / y_test_tensor.size(0)

    return accuracy


# Define the hyperparameters and their values for tuning
param_grid = {
    "lr": [0.001, 0.01, 0.1],
    "num_epochs": [50, 100, 150],
}

# Perform grid search
best_accuracy = 0
best_params = None
for lr in param_grid["lr"]:
    for num_epochs in param_grid["num_epochs"]:
        model = create_and_train_model(lr, num_epochs)
        accuracy = evaluate_model(model)
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_params = {"lr": lr, "num_epochs": num_epochs}

print("Best hyperparameters:", best_params)
print("Best accuracy:", best_accuracy)

# Train the final model with the best parameters
model = create_and_train_model(best_params["lr"], best_params["num_epochs"])

# Evaluate the final model
accuracy = evaluate_model(model)
print(f"Accuracy on test data: {accuracy:.4f}")

Best hyperparameters: {'lr': 0.1, 'num_epochs': 150}
Best accuracy: 0.6753246753246753
Accuracy on test data: 0.6623


## Apply Smote

In [15]:
print("Before SMOTE:", Counter(y_train))

smote = SMOTE(sampling_strategy=0.99, random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_smote))

minority_class_count = Counter(y_smote)[1]
under_sampler = RandomUnderSampler(sampling_strategy={0: minority_class_count})
X_balanced, y_balanced = under_sampler.fit_resample(X_smote, y_smote)

print("Final class distribution after undersampling:", Counter(y_balanced))
print("Final class distribution after undersampling:", Counter(y_test))

Before SMOTE: Counter({0: 401, 1: 213})
After SMOTE: Counter({0: 401, 1: 396})
Final class distribution after undersampling: Counter({0: 396, 1: 396})
Final class distribution after undersampling: Counter({0: 99, 1: 55})


# Train a simple logistic regression model on the new oversampled/undersampled dataset

In [16]:
model = LogisticRegression()
model.fit(X_balanced, y_balanced)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("F1:", f1)

Accuracy: 0.7142857142857143
F1: 0.6507936507936508


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
# Compare the logistic regressor with a random predictor
y_random = np.random.randint(0, 2, size=len(y_test))
accuracy_random = accuracy_score(y_test, y_random)
print("Accuracy of random predictor:", accuracy_random)

Accuracy of random predictor: 0.525974025974026


In [18]:
# Compare the logistic regressor with a majority classifier that always predicts 0
y_majority = np.zeros(len(y_test))
accuracy_majority = accuracy_score(y_test, y_majority)
print("Accuracy of majority classifier:", accuracy_majority)

Accuracy of majority classifier: 0.6428571428571429


In [None]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from collections import Counter
from sklearn.metrics import accuracy_score, f1_score

# Load the dataset
file_path = "./data/pima/pima.csv"  # Update with your file path
data = pd.read_csv(file_path)

# Preprocessing
# columns_with_zeros = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
# imputer = SimpleImputer(missing_values=0, strategy="mean")
# data[columns_with_zeros] = imputer.fit_transform(data[columns_with_zeros])

X = data.drop(columns=["Outcome"])
y = data["Outcome"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-validation-test split
X_train, X_temp, y_train, y_temp = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

# Apply SMOTE only to the training set
smote = SMOTE(sampling_strategy=0.83, random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
print("After SMOTE:", Counter(y_smote))

minority_class_count = Counter(y_smote)[1]
under_sampler = RandomUnderSampler(sampling_strategy={0: minority_class_count})
X_train_balanced, y_train_balanced = under_sampler.fit_resample(X_smote, y_smote)

print("Original class distribution:", y_train.value_counts())
print("Class distribution after SMOTE:", pd.Series(y_train_balanced).value_counts())


# Convert datasets to PyTorch tensors
def create_dataloader(X, y, batch_size=32, shuffle=False):
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.long)
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


# Define the neural network
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, 2)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x


# Training function
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    best_val_acc = 0
    best_model = None

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                outputs = model(X_batch)
                _, predicted = torch.max(outputs, 1)
                val_total += y_batch.size(0)
                val_correct += (predicted == y_batch).sum().item()

        val_acc = val_correct / val_total
        print(
            f"Epoch {epoch+1}/{epochs}, Loss: {train_loss/len(train_loader):.4f}, Val Accuracy: {val_acc:.4f}"
        )

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model.state_dict()

    return best_model, best_val_acc


# Hyperparameter tuning using grid search
input_size = X_train.shape[1]
param_grid = {
    "hidden_size1": [16, 32, 64],
    "hidden_size2": [8, 16, 32],
    "learning_rate": [0.01, 0.001, 0.0001],
}

best_params = None
best_model_state = None
best_val_acc = 0

for hidden_size1 in param_grid["hidden_size1"]:
    for hidden_size2 in param_grid["hidden_size2"]:
        for lr in param_grid["learning_rate"]:
            for batch_size in [32, 64, 128]:
                train_loader = create_dataloader(
                    X_train_balanced,
                    y_train_balanced,
                    shuffle=True,
                    batch_size=batch_size,
                )
                val_loader = create_dataloader(X_val, y_val, batch_size=batch_size)
                test_loader = create_dataloader(X_test, y_test, batch_size=batch_size)

                print(
                    f"Training with hidden_size1={hidden_size1}, hidden_size2={hidden_size2}, learning_rate={lr}"
                )
                model = NeuralNetwork(input_size, hidden_size1, hidden_size2)
                criterion = nn.CrossEntropyLoss()
                optimizer = optim.Adam(model.parameters(), lr=lr)
                model_state, val_acc = train_model(
                    model, train_loader, val_loader, criterion, optimizer, epochs=20
                )

                if val_acc > best_val_acc:
                    best_val_acc = val_acc
                    best_model_state = model_state
                    best_params = (hidden_size1, hidden_size2, lr)

print(
    f"Best Params: hidden_size1={best_params[0]}, hidden_size2={best_params[1]}, learning_rate={best_params[2]}"
)
print(f"Best Validation Accuracy: {best_val_acc:.4f}")

# Merge training and validation data
X_train_val = np.vstack((X_train_balanced, X_val))
y_train_val = np.hstack((y_train_balanced, y_val))

# Train the best model on the merged training and validation data
train_val_loader = create_dataloader(
    X_train_val, pd.Series(y_train_val), shuffle=True, batch_size=batch_size
)
best_model = NeuralNetwork(input_size, best_params[0], best_params[1])
best_model.load_state_dict(best_model_state)
optimizer = optim.Adam(best_model.parameters(), lr=best_params[2])
best_model_state, _ = train_model(
    best_model, train_val_loader, val_loader, criterion, optimizer, epochs=20
)

# Load the best model and evaluate on the test set
best_model.load_state_dict(best_model_state)
best_model.eval()

test_correct = 0
test_total = 0
y_true = []
y_pred = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = best_model(X_batch)
        _, predicted = torch.max(outputs, 1)
        test_total += y_batch.size(0)
        test_correct += (predicted == y_batch).sum().item()
        y_true.extend(y_batch)
        y_pred.extend(predicted)

test_accuracy = test_correct / test_total
f1 = f1_score(y_true, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Majority classifier
y_majority = np.zeros(len(y_test))
accuracy_majority = accuracy_score(y_test, y_majority)
print("Accuracy of majority classifier:", accuracy_majority)


After SMOTE: Counter({0: 350, 1: 290})
Original class distribution: Outcome
0    350
1    187
Name: count, dtype: int64
Class distribution after SMOTE: Outcome
0    290
1    290
Name: count, dtype: int64
Training with hidden_size1=16, hidden_size2=8, learning_rate=0.01
Epoch 1/20, Loss: 0.6231, Val Accuracy: 0.7130
Epoch 2/20, Loss: 0.4938, Val Accuracy: 0.7565
Epoch 3/20, Loss: 0.4520, Val Accuracy: 0.7391
Epoch 4/20, Loss: 0.4602, Val Accuracy: 0.7478
Epoch 5/20, Loss: 0.4461, Val Accuracy: 0.7565
Epoch 6/20, Loss: 0.4237, Val Accuracy: 0.7391
Epoch 7/20, Loss: 0.4192, Val Accuracy: 0.7391
Epoch 8/20, Loss: 0.4127, Val Accuracy: 0.7391
Epoch 9/20, Loss: 0.4056, Val Accuracy: 0.7304
Epoch 10/20, Loss: 0.4040, Val Accuracy: 0.7391
Epoch 11/20, Loss: 0.4094, Val Accuracy: 0.7304
Epoch 12/20, Loss: 0.3935, Val Accuracy: 0.7478
Epoch 13/20, Loss: 0.3965, Val Accuracy: 0.7217
Epoch 14/20, Loss: 0.4042, Val Accuracy: 0.7304
Epoch 15/20, Loss: 0.4014, Val Accuracy: 0.7391
Epoch 16/20, Loss: 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Import libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [8]:
class LinearClassificationNet(nn.Module):
    """
    A fully-connected single-layer linear NN for classification.
    """

    def __init__(self, input_size=11, output_size=2):
        super(LinearClassificationNet, self).__init__()
        self.layer1 = nn.Linear(input_size, output_size, bias=False)

    def forward(self, x):
        x = self.layer1(x.float())
        return x


In [None]:
bb = torch.load(f"./artifacts/pima/bb/pima_no_smote.pth").to("cpu")

  bb = torch.load(f"./artifacts/pima/bb/pima_no_smote.pth")


In [14]:
file_path = "./data/pima/pima.csv"  # Update with your file path
data = pd.read_csv(file_path)


X = data.drop(columns=["Outcome"])
y = data["Outcome"]


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Convert datasets to PyTorch tensors
def create_dataloader(X, y, batch_size=32, shuffle=False):
    X_tensor = torch.tensor(X, dtype=torch.float32)
    y_tensor = torch.tensor(y.values, dtype=torch.long)
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


test_loader = create_dataloader(X_test, y_test, batch_size=32)

In [21]:
test_correct = 0
test_total = 0
y_true = []
y_pred = []
bb = bb.to("cpu")
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        X_batch = X_batch.to("cpu")
        y_batch = y_batch.to("cpu")
        outputs = bb(X_batch)
        _, predicted = torch.max(outputs, 1)
        test_total += y_batch.size(0)
        test_correct += (predicted == y_batch).sum().item()
        y_true.extend(y_batch)
        y_pred.extend(predicted)

test_accuracy = test_correct / test_total
f1 = f1_score(y_true, y_pred)
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")
# majority classifier
y_majority = np.zeros(len(y_test))
accuracy_majority = accuracy_score(y_test, y_majority)
print("Accuracy of majority classifier:", accuracy_majority)

Test Accuracy: 0.7597
F1 Score: 0.7176
Accuracy of majority classifier: 0.6493506493506493
