In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Read the dataset
df = pd.read_csv('income.csv')

# Display basic info
print("URK22CS7048")
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nData information:")
print(df.info())
print("\nNull values:")
print(df.isnull().sum())

# Handle missing values
# For categorical columns, replace with mode
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])
    
# For numerical columns, replace with mean
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col] = df[col].fillna(df[col].mean())

print("\nAfter handling null values:")
print(df.isnull().sum())

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

print("\nAfter encoding categorical variables:")
print(df.head())

URK22CS7048
Dataset shape: (48842, 7)

First 5 rows:
   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   income_level  
0             0  
1             0  
2             0  
3             0  
4             0  

Data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             48842 non-null  int64
 1   fnlwgt          48842 non-null  int64
 2   education_num   48842 non-null  int64
 3   capital_gain    48842 non-nul

In [6]:
# Identify the target column
if 'income_level' in df.columns:
    target_column = 'income_level'
else:
    target_column = df.columns[-1]
print("URK22CS7048")    
print(f"\nTarget column: {target_column}")

# Split into features and target
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

URK22CS7048

Target column: income_level
Training set: (34189, 6), Testing set: (14653, 6)


In [12]:
# Add this import at the top of your cell or notebook
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

# Make sure to define these variables before using them
# Identify the target column
print("URK22CS7048")
if 'income_level' in df.columns:
    target_column = 'income_level'
else:
    target_column = df.columns[-1]

# Split into features and target
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("\n" + "="*50)
print("RANDOM FOREST CLASSIFIER")
print("="*50)

# Define different Random Forest models with different parameters
rf_models = [
    {"name": "RF Model 1", "params": {"n_estimators": 100, "random_state": 42}},
    {"name": "RF Model 2", "params": {"n_estimators": 200, "max_depth": 10, "random_state": 42}},
    {"name": "RF Model 3", "params": {"n_estimators": 100, "max_features": 'sqrt', "random_state": 42}},
    {"name": "RF Model 4", "params": {"n_estimators": 100, "min_samples_split": 5, "random_state": 42}}
]

# Function to evaluate model and print metrics
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nConfusion Matrix for {model_name}:")
    print(cm)
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return model, y_pred

# Train and evaluate each Random Forest model
for model_config in rf_models:
    print(f"\n{model_config['name']} with parameters: {model_config['params']}")
    rf = RandomForestClassifier(**model_config['params'])
    evaluate_model(rf, X_train, X_test, y_train, y_test, model_config['name'])

URK22CS7048

RANDOM FOREST CLASSIFIER

RF Model 1 with parameters: {'n_estimators': 100, 'random_state': 42}



Confusion Matrix for RF Model 1:
[[10003  1106]
 [ 1713  1831]]
Accuracy: 0.8076
Precision: 0.7981
Recall: 0.8076
F1-score: 0.8012

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.90      0.88     11109
           1       0.62      0.52      0.57      3544

    accuracy                           0.81     14653
   macro avg       0.74      0.71      0.72     14653
weighted avg       0.80      0.81      0.80     14653


RF Model 2 with parameters: {'n_estimators': 200, 'max_depth': 10, 'random_state': 42}

Confusion Matrix for RF Model 2:
[[10700   409]
 [ 2005  1539]]
Accuracy: 0.8353
Precision: 0.8296
Recall: 0.8353
F1-score: 0.8168

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.96      0.90     11109
           1       0.79      0.43      0.56      3544

    accuracy                           0.84     14653
   macro avg       0.82      0.70      0.73     14653
w

In [11]:
# Optimized grid search with reduced parameters
print("URK22CS7048")
param_grid_rf = {
    'n_estimators': [100, 200],         # Reduced from 3 to 2 values
    'max_depth': [10, 20],              # Removed None option
    'min_samples_split': [2, 5],        # Reduced from 3 to 2 values
    'max_features': ['sqrt']            # Reduced from 2 to 1 value
}

# Use RandomizedSearchCV instead for faster results
from sklearn.model_selection import RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_grid_rf,
    n_iter=8,                          # Try only 8 combinations instead of all
    cv=3,
    scoring='accuracy',
    random_state=42
)
random_search_rf.fit(X_train, y_train)

URK22CS7048


In [14]:
# Evaluate best Random Forest model
best_rf = random_search_rf.best_estimator_  # Change grid_search_rf to random_search_rf
print("URK22CS7048")
print("\nBest Random Forest Model Evaluation:")
_, y_pred_best_rf = evaluate_model(best_rf, X_train, X_test, y_train, y_test, "Best RF Model")

print("\nRandom Forest Analysis Complete!")

URK22CS7048

Best Random Forest Model Evaluation:

Confusion Matrix for Best RF Model:
[[10715   394]
 [ 2013  1531]]
Accuracy: 0.8357
Precision: 0.8306
Recall: 0.8357
F1-score: 0.8170

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.96      0.90     11109
           1       0.80      0.43      0.56      3544

    accuracy                           0.84     14653
   macro avg       0.82      0.70      0.73     14653
weighted avg       0.83      0.84      0.82     14653


Random Forest Analysis Complete!


In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
import warnings
warnings.filterwarnings('ignore')

# Read the dataset
df = pd.read_csv('income.csv')
print("URK22CS7048")
# Display basic info
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())
print("\nData information:")
print(df.info())
print("\nNull values:")
print(df.isnull().sum())

# Handle missing values
# For categorical columns, replace with mode
for col in df.select_dtypes(include=['object']).columns:
    df[col] = df[col].fillna(df[col].mode()[0])
    
# For numerical columns, replace with mean
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col] = df[col].fillna(df[col].mean())

print("\nAfter handling null values:")
print(df.isnull().sum())

# Encode categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

print("\nAfter encoding categorical variables:")
print(df.head())

URK22CS7048
Dataset shape: (48842, 7)

First 5 rows:
   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

   income_level  
0             0  
1             0  
2             0  
3             0  
4             0  

Data information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             48842 non-null  int64
 1   fnlwgt          48842 non-null  int64
 2   education_num   48842 non-null  int64
 3   capital_gain    48842 non-nul

In [21]:
# Identify the target column
print("URK22CS7048")
if 'income_level' in df.columns:
    target_column = 'income_level'
else:
    target_column = df.columns[-1]
    
print(f"\nTarget column: {target_column}")

# Split into features and target
X = df.drop(target_column, axis=1)
y = df[target_column]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(f"Training set: {X_train.shape}, Testing set: {X_test.shape}")

URK22CS7048

Target column: income_level
Training set: (34189, 6), Testing set: (14653, 6)


In [22]:
print("URK22CS7048")
print("\n" + "="*50)
print("ADABOOST CLASSIFIER")
print("="*50)

# Define different AdaBoost models with different parameters
ada_models = [
    {"name": "AdaBoost Model 1", "params": {"n_estimators": 50, "random_state": 42}},
    {"name": "AdaBoost Model 2", "params": {"n_estimators": 100, "learning_rate": 0.5, "random_state": 42}},
    {"name": "AdaBoost Model 3", "params": {"n_estimators": 100, "learning_rate": 0.1, "random_state": 42}},
    {"name": "AdaBoost Model 4", "params": {"n_estimators": 200, "learning_rate": 1.0, "random_state": 42}}
]

# Function to evaluate model and print metrics
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"\nConfusion Matrix for {model_name}:")
    print(cm)
    
    # Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    return model, y_pred

# Train and evaluate each AdaBoost model
for model_config in ada_models:
    print(f"\n{model_config['name']} with parameters: {model_config['params']}")
    ada = AdaBoostClassifier(**model_config['params'])
    evaluate_model(ada, X_train, X_test, y_train, y_test, model_config['name'])

URK22CS7048

ADABOOST CLASSIFIER

AdaBoost Model 1 with parameters: {'n_estimators': 50, 'random_state': 42}

Confusion Matrix for AdaBoost Model 1:
[[10655   454]
 [ 2048  1496]]
Accuracy: 0.8292
Precision: 0.8215
Recall: 0.8292
F1-score: 0.8102

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.96      0.89     11109
           1       0.77      0.42      0.54      3544

    accuracy                           0.83     14653
   macro avg       0.80      0.69      0.72     14653
weighted avg       0.82      0.83      0.81     14653


AdaBoost Model 2 with parameters: {'n_estimators': 100, 'learning_rate': 0.5, 'random_state': 42}

Confusion Matrix for AdaBoost Model 2:
[[10771   338]
 [ 2189  1355]]
Accuracy: 0.8275
Precision: 0.8237
Recall: 0.8275
F1-score: 0.8037

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.90     11109
           1       0.80      0.3

In [23]:
# Find best parameters for AdaBoost
print("URK22CS7048")
print("\nFinding best parameters for AdaBoost...")
param_grid_ada = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.5, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

grid_search_ada = GridSearchCV(AdaBoostClassifier(random_state=42), 
                              param_grid_ada, cv=3, scoring='accuracy')
grid_search_ada.fit(X_train, y_train)

print("\nBest Parameters for AdaBoost:")
print(grid_search_ada.best_params_)

URK22CS7048

Finding best parameters for AdaBoost...

Best Parameters for AdaBoost:
{'algorithm': 'SAMME', 'learning_rate': 1.0, 'n_estimators': 200}


In [24]:
# Evaluate best AdaBoost model
print("URK22CS7048")
best_ada = grid_search_ada.best_estimator_
print("\nBest AdaBoost Model Evaluation:")
_, y_pred_best_ada = evaluate_model(best_ada, X_train, X_test, y_train, y_test, "Best AdaBoost Model")

print("\nAdaBoost Analysis Complete!")

URK22CS7048

Best AdaBoost Model Evaluation:

Confusion Matrix for Best AdaBoost Model:
[[10658   451]
 [ 2023  1521]]
Accuracy: 0.8312
Precision: 0.8237
Recall: 0.8312
F1-score: 0.8127

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.96      0.90     11109
           1       0.77      0.43      0.55      3544

    accuracy                           0.83     14653
   macro avg       0.81      0.69      0.72     14653
weighted avg       0.82      0.83      0.81     14653


AdaBoost Analysis Complete!
