In [4]:
import pandas as pd
import numpy as np

# Load dataset
file_path = "adult.csv"
data = pd.read_csv(file_path)

# Identify numerical and categorical features
numerical_features = [feature for feature in data.columns if data[feature].dtypes != "O"]
categorical_features = [feature for feature in data.columns if data[feature].dtypes == "O"]

# Apply log transformation to numerical features
log_scaled_numerical_data = data[numerical_features].apply(np.log1p)

# Handle categorical encoding
categorical_encoded_data = data[categorical_features].copy()

# Ordinal encoding for 'education'
education_order = [
    " Preschool", " 1st-4th", " 5th-6th", " 7th-8th", " 9th", " 10th", " 11th", " 12th",
    " Some-college", " Assoc-acdm", " Assoc-voc", " HS-grad", " Bachelors",
    " Masters", " Prof-school", " Doctorate"
]
education_map = {val: idx for idx, val in enumerate(education_order)}
categorical_encoded_data["education"] = categorical_encoded_data["education"].map(education_map)

# Binary encoding for 'sex'
categorical_encoded_data["sex"] = categorical_encoded_data["sex"].map({" Male": 0, " Female": 1})

# One-hot encoding for other categorical variables
one_hot_features = ["workclass", "marital-status", "occupation", "relationship", "race", "native-country"]
categorical_encoded_data = pd.get_dummies(categorical_encoded_data, columns=one_hot_features, drop_first=True)

# Drop NaNs introduced by log transformation
log_scaled_numerical_data = log_scaled_numerical_data.dropna()

# Align categorical data with numerical data index
categorical_encoded_data = categorical_encoded_data.loc[log_scaled_numerical_data.index]

# Concatenate numerical and categorical data
final_data = pd.concat([log_scaled_numerical_data, categorical_encoded_data], axis=1)

# Move target column 'income' to the first column
final_data.insert(0, "income", final_data.pop("income"))

# Convert income to binary (0 for <=50K, 1 for >50K)
final_data["income"] = final_data["income"].map({" <=50K": 0, " >50K": 1})

# Extract features and target variable
X = final_data.drop(columns=['income']).values.astype(np.float64)  # Ensure numerical values
y = final_data['income'].values.reshape(-1, 1).astype(np.float64)  # Ensure numerical values

# Standardize features
X_mean = np.mean(X, axis=0, dtype=float)
X_std = np.std(X, axis=0, dtype=float, ddof=1)
X = (X - X_mean) / X_std

# Logistic Regression Implementation from Scratch
class LogisticRegressionScratch:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        z = np.array(z, dtype=np.float64)  # Ensure z is a NumPy array
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros((n_features, 1))
        self.bias = 0

        for _ in range(self.epochs):
            linear_model = np.dot(X, self.weights) + self.bias
            y_predicted = self.sigmoid(linear_model)

            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        return (y_predicted >= 0.5).astype(int)

# Train the logistic regression model
model = LogisticRegressionScratch(learning_rate=0.01, epochs=2000)
model.fit(X, y)

# Make predictions
predictions = model.predict(X)

# Calculate accuracy
accuracy = np.mean(predictions == y) * 100
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 74.31%
