In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "/content/census-income .csv"
df = pd.read_csv(file_path)

# Convert target variable to binary (0: <=50K, 1: >50K)
df['annual_income'] = df['annual_income'].map({'<=50K': 0, '>50K': 1})

# Encode categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    if col != "annual_income":  # Target column already mapped
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Split dataset into features (X) and target (y)
X = df.drop(columns=["annual_income"])
y = df["annual_income"]

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Build Logistic Regression Model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions
y_pred = log_reg.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")


Logistic Regression Accuracy: 0.8279


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "/content/census-income .csv"
df = pd.read_csv(file_path)

# Convert target variable to binary (0: <=50K, 1: >50K)
df['annual_income'] = df['annual_income'].map({'<=50K': 0, '>50K': 1})

# Encode categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    if col != "annual_income":  # Target column already mapped
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Split dataset into features (X) and target (y)
X = df.drop(columns=["annual_income"])
y = df["annual_income"]

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Find the best max_depth using brute force
best_depth = 1
best_accuracy = 0

for depth in range(1, 21):  # Checking depths from 1 to 20
    dt = DecisionTreeClassifier(max_depth=depth, random_state=42)
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    if acc > best_accuracy:
        best_accuracy = acc
        best_depth = depth

# Train the best model
best_dt = DecisionTreeClassifier(max_depth=best_depth, random_state=42)
best_dt.fit(X_train, y_train)
final_accuracy = accuracy_score(y_test, best_dt.predict(X_test))

# Print the best max_depth and accuracy
print(f"Best max_depth: {best_depth}")
print(f"Decision Tree Accuracy: {final_accuracy:.4f}")


Best max_depth: 7
Decision Tree Accuracy: 0.8600


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
file_path = "/content/census-income .csv"
df = pd.read_csv(file_path)

# Convert target variable to binary (0: <=50K, 1: >50K)
df['annual_income'] = df['annual_income'].map({'<=50K': 0, '>50K': 1})

# Encode categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
label_encoders = {}
for col in categorical_cols:
    if col != "annual_income":  # Target column already mapped
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
        label_encoders[col] = le

# Split dataset into features (X) and target (y)
X = df.drop(columns=["annual_income"])
y = df["annual_income"]

# Standardize numerical features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best parameters and train the best model
best_rf = grid_search.best_estimator_
y_pred = best_rf.predict(X_test)

# Evaluate model
accuracy = accuracy_score(y_test, y_pred)

# Print best parameters and accuracy
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Random Forest Accuracy: {accuracy:.4f}")


Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Accuracy: 0.8644
