In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from scipy import stats

# Load dataset
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
df = pd.read_csv("/Users/akshay/Desktop/dsbda_practical/newdata/adult_dataset.csv")

# --- m. Data Cleaning (Remove NA, ?, Negative values etc.) ---

# Turn the '?' placeholder into NaN, then discard every row that has any missing field.
df = df.replace('?', np.nan).dropna()

# Keep a row only when all four of these numeric columns are non-negative.
numeric_cols = ['age', 'hours-per-week', 'capital-gain', 'capital-loss']
df = df.loc[df[numeric_cols].ge(0).all(axis=1)]

print("✅ Cleaned Dataset Sample:")
print(df.head())

# --- n. Error Correction (Outlier detection and removal) ---

# Z-score filter on the numeric columns: a row survives only if the absolute
# z-score of every one of those columns is strictly below 3.
# NOTE(review): the sample output shows capital-gain / capital-loss as mostly 0;
# confirm that z-score filtering is really intended for those two columns.
z_scores = stats.zscore(df[numeric_cols])
z_scores = np.abs(z_scores)
inlier_rows = (z_scores < 3).all(axis=1)
df = df[inlier_rows]

print("\n✅ After Removing Outliers:")
print(df.describe())

# --- o. Data Transformation ---

# `df` reaches this point as the result of boolean row filtering; assigning
# columns onto such a frame can raise pandas' SettingWithCopyWarning, so take
# an explicit, independent copy before writing the encoded columns.
df = df.copy()

# Encode categorical columns: every object-dtype column (the target "income"
# included, if it is still text) is replaced by integer codes from its own
# LabelEncoder.
categorical_cols = df.select_dtypes(include='object').columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Separate features and target
X = df.drop("income", axis=1)
# LabelEncoder numbers classes in sorted label order, so presumably
# 1 = >50K and 0 = <=50K -- confirm against the raw income labels.
y = df["income"]

# --- p. Model Building and Accuracy Comparison ---

# Train-Test split FIRST, on the unscaled features. The row partition is the
# same as splitting a pre-scaled matrix (same row count and random_state), but
# the scaler below never sees the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features: learn mean/std from the training rows only, then apply that
# same transform to the test rows. Fitting on the full matrix before the split
# would leak test-set statistics into preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix under the train-fitted scaler; kept so that any code
# still referring to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
lr_acc = accuracy_score(y_test, lr_preds)

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_acc = accuracy_score(y_test, nb_preds)

# Results: held-out accuracy of both classifiers, as percentages
print("\n✅ Accuracy Comparison for Income Prediction:")
print(f"Logistic Regression Accuracy: {lr_acc * 100:.2f}%")
print(f"Naive Bayes Accuracy: {nb_acc * 100:.2f}%")

✅ Cleaned Dataset Sample:
   age  workclass  fnlwgt     education  educational-num      marital-status  \
0   25    Private  226802          11th                7       Never-married   
1   38    Private   89814       HS-grad                9  Married-civ-spouse   
2   28  Local-gov  336951    Assoc-acdm               12  Married-civ-spouse   
3   44    Private  160323  Some-college               10  Married-civ-spouse   
5   34    Private  198693          10th                6       Never-married   

          occupation   relationship   race gender  capital-gain  capital-loss  \
0  Machine-op-inspct      Own-child  Black   Male             0             0   
1    Farming-fishing        Husband  White   Male             0             0   
2    Protective-serv        Husband  White   Male             0             0   
3  Machine-op-inspct        Husband  Black   Male          7688             0   
5      Other-service  Not-in-family  White   Male             0             0   

   hou