In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# Define the file path 
file_path = r'C:\Users\Hiremath\OneDrive\Desktop\raw.githubusercontent.com_dsrscientist_dataset1_master_census_income.csv'

# Load the dataset into a Pandas DataFrame
data = pd.read_csv(file_path)

# Data Preprocessing
# Considering the target variable is 'income', and  to predict whether a person makes over $50K.
X = data.drop('Income', axis=1)
y = data['Income']

# Encode categorical variables using one-hot encoding
X = pd.get_dummies(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the input features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize a Logistic Regression model
model = LogisticRegression(max_iter=1000, solver='lbfgs')

# Hyperparameter Tuning using GridSearchCV
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(model, param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model using cross-validation
cv_scores = cross_val_score(best_model, X_train_scaled, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())

# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Evaluate the model on the test set
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

# Display a classification report for more detailed evaluation
print("Classification Report:")
print(classification_report(y_test, y_pred))


Cross-Validation Scores: [0.85316699 0.85431862 0.84913628 0.8523709  0.84929929]
Mean CV Accuracy: 0.851658413442849
Test Accuracy: 0.8524262899262899
Classification Report:
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      4912
        >50K       0.73      0.63      0.68      1600

    accuracy                           0.85      6512
   macro avg       0.81      0.78      0.79      6512
weighted avg       0.85      0.85      0.85      6512

