In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

# Configuration shared by every experiment in this cell.
DATA_PATH = '/content/income.csv'
RANDOM_STATE = 42   # fixed seed so every AdaBoost fit is reproducible
CV_FOLDS = 5        # number of cross-validation folds used for every score

# 1. Load the dataset
df = pd.read_csv(DATA_PATH)

# 2. Split into features and target
#    Assumes the last column is the target; adjust if your CSV is laid out differently.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# 3. One-hot encode categorical features
#    The classifier needs numeric input. get_dummies only converts
#    object/string/category columns and passes numeric columns through
#    unchanged, so this step is safe whether or not X has categorical columns.
X = pd.get_dummies(X, drop_first=True)

# 4. Baseline AdaBoost with 10 estimators
#    "Default" in the printed label means this notebook's baseline;
#    scikit-learn's own default for n_estimators is 50.
clf_default = AdaBoostClassifier(n_estimators=10, random_state=RANDOM_STATE)
scores_default = cross_val_score(clf_default, X, y, cv=CV_FOLDS)
print(f"Default AdaBoost (n_estimators=10) ⇒ Mean 5‑fold CV accuracy: {scores_default.mean():.4f}")

# 5. Tune n_estimators
n_estimators_list = [10, 50, 100, 150, 200]

# The baseline above used the same data, seed and folds, so refitting it inside
# the sweep would reproduce the exact same scores. Reuse them instead of paying
# for another full cross-validation run.
cached_scores = {clf_default.n_estimators: scores_default}

results = []
for n in n_estimators_list:
    if n in cached_scores:
        scores = cached_scores[n]
    else:
        clf = AdaBoostClassifier(n_estimators=n, random_state=RANDOM_STATE)
        scores = cross_val_score(clf, X, y, cv=CV_FOLDS)
    results.append({'n_estimators': n, 'mean_accuracy': scores.mean()})

# 6. Report tuning results
#    One row per candidate n_estimators with its mean cross-validated accuracy.
results_df = pd.DataFrame(results)
print("\nTuning results:")
print(results_df)

# idxmax returns the first row with the highest mean accuracy, so ties go to
# the smaller n_estimators because the list is in ascending order.
best = results_df.loc[results_df['mean_accuracy'].idxmax()]
print(f"\nBest performance: {best['mean_accuracy']:.4f} accuracy using {int(best['n_estimators'])} trees")


Default AdaBoost (n_estimators=10) ⇒ Mean 5‑fold CV accuracy: 0.8202

Tuning results:
   n_estimators  mean_accuracy
0            10       0.820237
1            50       0.830023
2           100       0.832050
3           150       0.832296
4           200       0.832624

Best performance: 0.8326 accuracy using 200 trees
