In [1]:
import os
os.getcwd()

'/Users/praveen/MSDS/MachineLearning/KaggleChallenges/Challenge2/Data'

In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd

# Assume df is preprocessed and cleaned
df = pd.read_csv("./training_data.csv", low_memory=False)

# Convert problematic columns
for col in ['governor_contribution_ratio', 'senate_contribution_ratio', 'us_senate_contribution_ratio']:
    df[col] = pd.to_numeric(df[col], errors='coerce')


# Drop high cardinality or non-informative
df = df.drop(columns=['zip_code'])
df = df.dropna()

# Binary label — you can tune this threshold
df['won'] = (df['winner_ratio'] > 0.0).astype(int)

df.head()

Unnamed: 0,general_sector,city,specific_sector,state,contributor_type,winner_ratio,candidacy_count,candidacy_democratic_count,candidacy_republican_count,contribution_count,...,house_and_assembly_contribution_ratio,politician_challenger_ratio,politician_democratic_ratio,politician_incumbency_ratio,politician_open_pos_ratio,politician_republican_ratio,senate_contribution_ratio,us_house_contribution_ratio,us_senate_contribution_ratio,won
0,Retired,MISSOULA,Retired,MT,Individual,1.0,4,4,0,6,...,0.0,0.0,1.0,0.5,0.5,0.0,0.0,0.0,0.0,1
1,Retired,CLEVELAND HTS,Retired,OH,Individual,0.5,2,2,0,2,...,0.216202,0.5,1.0,0.5,0.0,0.0,0.0,0.0,0.0,1
2,Public Sector Unions,NEWARK,Teachers unions,NJ,Non-Individual,1.0,4,4,0,4,...,3.257082,0.0,1.0,1.0,0.0,0.0,-2.257082,0.0,0.0,1
3,Retired,BOZEMAN,Retired,MT,Individual,1.0,4,4,0,4,...,0.2,0.0,1.0,0.5,0.5,0.0,0.2,0.0,0.0,1
4,No Employer Listed or Found,ORANGE PARK,No employer listed or discovered,FL,Individual,0.0,4,4,0,6,...,0.0,0.5,1.0,0.0,0.5,0.0,0.0,0.0,0.0,0


In [9]:
df.columns

Index(['general_sector', 'city', 'specific_sector', 'state',
       'contributor_type', 'winner_ratio', 'candidacy_count',
       'candidacy_democratic_count', 'candidacy_republican_count',
       'contribution_count', 'contribution_democratic_count',
       'contribution_republican_count', 'politician_challenger_count',
       'politician_count', 'politician_democratic_count',
       'politician_incumbency_count', 'politician_open_pos_count',
       'politician_republican_count', 'contribution_democratic_sum_2010_usd',
       'contribution_republican_sum_2010_usd', 'contribution_sum_2010_usd',
       'governor_contributions_sum_2010_usd',
       'house_and_assembly_contributions_sum_2010_usd',
       'senate_contributions_sum_2010_usd',
       'us_house_contributions_sum_2010_usd',
       'us_senate_contributions_sum_2010_usd', 'candidacy_democratic_ratio',
       'candidacy_republican_ratio', 'contribution_democratic_count_ratio',
       'contribution_republican_count_ratio', 'govern

In [1]:



# One-hot encode categorical vars
cat_cols = ['general_sector', 'city', 'specific_sector', 'state', 'contributor_type']
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Split
X = df_encoded.drop(columns=['winner_ratio', 'won'])
y_binary = df_encoded['won']
y_continuous = df_encoded['winner_ratio']

X_train, X_test, y_train_bin, y_test_bin, y_train_ratio, y_test_ratio = train_test_split(
    X, y_binary, y_continuous, test_size=0.2, random_state=42
)

In [3]:
df_encoded.shape

(171192, 9654)

In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from tqdm import tqdm
import joblib

# Initialize model with warm_start
clf = RandomForestClassifier(n_estimators=1, warm_start=True, random_state=42)

best_mse = float("inf")
best_model = None

mse_progress = []

# Build one tree per iteration
for i in tqdm(range(1, 101), desc="Training Progress"):
    clf.n_estimators = i
    clf.fit(X_train, y_train_bin)

    # Predict probabilities for class 1
    y_proba = clf.predict_proba(X_test)[:, 1]

    # Compare with actual winner_ratio
    mse = mean_squared_error(y_test_ratio, y_proba)
    mse_progress.append(mse)

    # Print progress (optional)
    print(f"Trees: {i}, MSE: {mse:.5f}")

    # Save best model
    if mse < best_mse:
        best_mse = mse
        best_model = clf
        joblib.dump(best_model, f"best_rf_model_{i}_trees.pkl")

print(f"\n✅ Best MSE: {best_mse:.5f}")


Training Progress:   1%|          | 1/100 [00:14<24:02, 14.58s/it]

Trees: 1, MSE: 0.21215


Training Progress:   2%|▏         | 2/100 [00:23<17:54, 10.96s/it]

Trees: 2, MSE: 0.16650


Training Progress:   3%|▎         | 3/100 [00:31<15:38,  9.68s/it]

Trees: 3, MSE: 0.15338


Training Progress:   4%|▍         | 4/100 [00:38<14:06,  8.82s/it]

Trees: 4, MSE: 0.14663


Training Progress:   5%|▌         | 5/100 [00:47<13:48,  8.72s/it]

Trees: 5, MSE: 0.14200


Training Progress:   6%|▌         | 6/100 [00:58<14:56,  9.54s/it]

Trees: 6, MSE: 0.13940


Training Progress:   7%|▋         | 7/100 [01:09<15:44, 10.15s/it]

Trees: 7, MSE: 0.13764


Training Progress:   8%|▊         | 8/100 [01:21<16:16, 10.62s/it]

Trees: 8, MSE: 0.13612


Training Progress:   9%|▉         | 9/100 [01:39<19:34, 12.91s/it]

Trees: 9, MSE: 0.13503


Training Progress:  10%|█         | 10/100 [02:02<24:03, 16.04s/it]

Trees: 10, MSE: 0.13422


Training Progress:  11%|█         | 11/100 [02:22<25:36, 17.27s/it]

Trees: 11, MSE: 0.13316


Training Progress:  12%|█▏        | 12/100 [02:41<26:18, 17.94s/it]

Trees: 12, MSE: 0.13250


Training Progress:  13%|█▎        | 13/100 [02:57<25:08, 17.34s/it]

Trees: 13, MSE: 0.13195


Training Progress:  14%|█▍        | 14/100 [03:14<24:20, 16.98s/it]

Trees: 14, MSE: 0.13134


Training Progress:  15%|█▌        | 15/100 [03:28<22:59, 16.22s/it]

Trees: 15, MSE: 0.13106


Training Progress:  16%|█▌        | 16/100 [03:45<23:07, 16.52s/it]

Trees: 16, MSE: 0.13069


Training Progress:  17%|█▋        | 17/100 [04:00<22:19, 16.14s/it]

Trees: 17, MSE: 0.13037


Training Progress:  18%|█▊        | 18/100 [04:16<21:59, 16.09s/it]

Trees: 18, MSE: 0.13019


Training Progress:  19%|█▉        | 19/100 [04:30<20:54, 15.48s/it]

Trees: 19, MSE: 0.13011


Training Progress:  20%|██        | 20/100 [04:44<19:51, 14.89s/it]

Trees: 20, MSE: 0.12990


Training Progress:  21%|██        | 21/100 [04:58<19:26, 14.77s/it]

Trees: 21, MSE: 0.12985


Training Progress:  22%|██▏       | 22/100 [05:12<18:47, 14.46s/it]

Trees: 22, MSE: 0.12966


Training Progress:  23%|██▎       | 23/100 [05:30<19:47, 15.42s/it]

Trees: 23, MSE: 0.12946


Training Progress:  24%|██▍       | 24/100 [05:47<20:15, 15.99s/it]

Trees: 24, MSE: 0.12921


Training Progress:  25%|██▌       | 25/100 [06:08<21:53, 17.52s/it]

Trees: 25, MSE: 0.12907


Training Progress:  26%|██▌       | 26/100 [06:27<22:11, 17.99s/it]

Trees: 26, MSE: 0.12884


Training Progress:  27%|██▋       | 27/100 [06:45<21:39, 17.81s/it]

Trees: 27, MSE: 0.12875


Training Progress:  28%|██▊       | 28/100 [07:09<23:33, 19.63s/it]

Trees: 28, MSE: 0.12865


Training Progress:  29%|██▉       | 29/100 [07:35<25:32, 21.58s/it]

Trees: 29, MSE: 0.12863


Training Progress:  30%|███       | 30/100 [07:48<22:20, 19.15s/it]

Trees: 30, MSE: 0.12851


Training Progress:  31%|███       | 31/100 [08:02<20:06, 17.48s/it]

Trees: 31, MSE: 0.12848


Training Progress:  32%|███▏      | 32/100 [08:16<18:37, 16.43s/it]

Trees: 32, MSE: 0.12836


Training Progress:  33%|███▎      | 33/100 [08:34<18:45, 16.80s/it]

Trees: 33, MSE: 0.12834


Training Progress:  34%|███▍      | 34/100 [08:55<20:02, 18.22s/it]

Trees: 34, MSE: 0.12826


Training Progress:  35%|███▌      | 35/100 [09:22<22:29, 20.76s/it]

Trees: 35, MSE: 0.12820


Training Progress:  36%|███▌      | 36/100 [09:32<18:53, 17.70s/it]

Trees: 36, MSE: 0.12808


Training Progress:  37%|███▋      | 37/100 [09:38<14:48, 14.11s/it]

Trees: 37, MSE: 0.12804


Training Progress:  38%|███▊      | 38/100 [09:44<12:00, 11.62s/it]

Trees: 38, MSE: 0.12794


Training Progress:  39%|███▉      | 39/100 [09:50<10:01,  9.86s/it]

Trees: 39, MSE: 0.12785


Training Progress:  40%|████      | 40/100 [09:55<08:40,  8.68s/it]

Trees: 40, MSE: 0.12778


Training Progress:  41%|████      | 41/100 [10:02<07:55,  8.06s/it]

Trees: 41, MSE: 0.12767


Training Progress:  42%|████▏     | 42/100 [10:09<07:35,  7.85s/it]

Trees: 42, MSE: 0.12760


Training Progress:  43%|████▎     | 43/100 [10:16<07:09,  7.54s/it]

Trees: 43, MSE: 0.12750


Training Progress:  44%|████▍     | 44/100 [10:23<06:48,  7.30s/it]

Trees: 44, MSE: 0.12747


Training Progress:  45%|████▌     | 45/100 [10:29<06:26,  7.03s/it]

Trees: 45, MSE: 0.12745


Training Progress:  46%|████▌     | 46/100 [10:36<06:13,  6.92s/it]

Trees: 46, MSE: 0.12741


Training Progress:  47%|████▋     | 47/100 [10:43<06:11,  7.01s/it]

Trees: 47, MSE: 0.12736


Training Progress:  48%|████▊     | 48/100 [10:51<06:14,  7.21s/it]

Trees: 48, MSE: 0.12737


Training Progress:  49%|████▉     | 49/100 [10:57<05:54,  6.94s/it]

Trees: 49, MSE: 0.12729


Training Progress:  50%|█████     | 50/100 [11:04<05:45,  6.92s/it]

Trees: 50, MSE: 0.12725


Training Progress:  51%|█████     | 51/100 [11:12<05:52,  7.19s/it]

Trees: 51, MSE: 0.12720


Training Progress:  52%|█████▏    | 52/100 [11:21<06:16,  7.85s/it]

Trees: 52, MSE: 0.12712


Training Progress:  53%|█████▎    | 53/100 [11:30<06:21,  8.12s/it]

Trees: 53, MSE: 0.12711


Training Progress:  54%|█████▍    | 54/100 [11:38<06:04,  7.91s/it]

Trees: 54, MSE: 0.12710


Training Progress:  55%|█████▌    | 55/100 [11:45<05:53,  7.86s/it]

Trees: 55, MSE: 0.12703


Training Progress:  56%|█████▌    | 56/100 [11:53<05:45,  7.85s/it]

Trees: 56, MSE: 0.12694


Training Progress:  57%|█████▋    | 57/100 [12:01<05:32,  7.73s/it]

Trees: 57, MSE: 0.12697


Training Progress:  58%|█████▊    | 58/100 [12:07<05:09,  7.36s/it]

Trees: 58, MSE: 0.12703


Training Progress:  59%|█████▉    | 59/100 [12:16<05:14,  7.68s/it]

Trees: 59, MSE: 0.12696


Training Progress:  60%|██████    | 60/100 [12:30<06:24,  9.62s/it]

Trees: 60, MSE: 0.12692


Training Progress:  61%|██████    | 61/100 [12:48<07:54, 12.16s/it]

Trees: 61, MSE: 0.12692


Training Progress:  62%|██████▏   | 62/100 [12:59<07:26, 11.75s/it]

Trees: 62, MSE: 0.12690


Training Progress:  63%|██████▎   | 63/100 [13:07<06:37, 10.75s/it]

Trees: 63, MSE: 0.12690


Training Progress:  64%|██████▍   | 64/100 [13:16<06:06, 10.18s/it]

Trees: 64, MSE: 0.12685


Training Progress:  65%|██████▌   | 65/100 [13:25<05:47,  9.94s/it]

Trees: 65, MSE: 0.12683


Training Progress:  66%|██████▌   | 66/100 [13:33<05:21,  9.45s/it]

Trees: 66, MSE: 0.12681


Training Progress:  67%|██████▋   | 67/100 [13:41<04:55,  8.96s/it]

Trees: 67, MSE: 0.12678


Training Progress:  68%|██████▊   | 68/100 [13:49<04:31,  8.48s/it]

Trees: 68, MSE: 0.12679


Training Progress:  69%|██████▉   | 69/100 [13:56<04:08,  8.03s/it]

Trees: 69, MSE: 0.12675


Training Progress:  70%|███████   | 70/100 [14:03<03:57,  7.93s/it]

Trees: 70, MSE: 0.12671


Training Progress:  71%|███████   | 71/100 [14:11<03:50,  7.94s/it]

Trees: 71, MSE: 0.12666


Training Progress:  72%|███████▏  | 72/100 [14:18<03:30,  7.50s/it]

Trees: 72, MSE: 0.12668


Training Progress:  73%|███████▎  | 73/100 [14:24<03:12,  7.11s/it]

Trees: 73, MSE: 0.12664


Training Progress:  74%|███████▍  | 74/100 [14:30<02:57,  6.82s/it]

Trees: 74, MSE: 0.12664


Training Progress:  75%|███████▌  | 75/100 [14:36<02:46,  6.65s/it]

Trees: 75, MSE: 0.12659


Training Progress:  76%|███████▌  | 76/100 [14:43<02:36,  6.51s/it]

Trees: 76, MSE: 0.12658


Training Progress:  77%|███████▋  | 77/100 [14:50<02:35,  6.77s/it]

Trees: 77, MSE: 0.12659


Training Progress:  78%|███████▊  | 78/100 [15:00<02:52,  7.84s/it]

Trees: 78, MSE: 0.12656


Training Progress:  79%|███████▉  | 79/100 [15:18<03:48, 10.89s/it]

Trees: 79, MSE: 0.12658


Training Progress:  80%|████████  | 80/100 [15:28<03:30, 10.53s/it]

Trees: 80, MSE: 0.12658


Training Progress:  81%|████████  | 81/100 [15:35<03:01,  9.56s/it]

Trees: 81, MSE: 0.12655


Training Progress:  82%|████████▏ | 82/100 [15:42<02:37,  8.74s/it]

Trees: 82, MSE: 0.12651


Training Progress:  83%|████████▎ | 83/100 [15:48<02:16,  8.03s/it]

Trees: 83, MSE: 0.12648


Training Progress:  84%|████████▍ | 84/100 [15:55<01:59,  7.49s/it]

Trees: 84, MSE: 0.12650


Training Progress:  85%|████████▌ | 85/100 [16:01<01:47,  7.15s/it]

Trees: 85, MSE: 0.12649


Training Progress:  86%|████████▌ | 86/100 [16:07<01:36,  6.89s/it]

Trees: 86, MSE: 0.12649


Training Progress:  87%|████████▋ | 87/100 [16:14<01:27,  6.77s/it]

Trees: 87, MSE: 0.12646


Training Progress:  88%|████████▊ | 88/100 [16:20<01:20,  6.74s/it]

Trees: 88, MSE: 0.12646


Training Progress:  89%|████████▉ | 89/100 [16:29<01:19,  7.26s/it]

Trees: 89, MSE: 0.12647


Training Progress:  90%|█████████ | 90/100 [16:37<01:15,  7.54s/it]

Trees: 90, MSE: 0.12647


Training Progress:  91%|█████████ | 91/100 [16:45<01:08,  7.65s/it]

Trees: 91, MSE: 0.12646


Training Progress:  92%|█████████▏| 92/100 [16:52<01:00,  7.51s/it]

Trees: 92, MSE: 0.12647


Training Progress:  93%|█████████▎| 93/100 [16:59<00:50,  7.15s/it]

Trees: 93, MSE: 0.12648


Training Progress:  94%|█████████▍| 94/100 [17:05<00:41,  6.86s/it]

Trees: 94, MSE: 0.12648


Training Progress:  95%|█████████▌| 95/100 [17:11<00:33,  6.77s/it]

Trees: 95, MSE: 0.12648


Training Progress:  96%|█████████▌| 96/100 [17:18<00:26,  6.66s/it]

Trees: 96, MSE: 0.12646


Training Progress:  97%|█████████▋| 97/100 [17:24<00:19,  6.60s/it]

Trees: 97, MSE: 0.12644


Training Progress:  98%|█████████▊| 98/100 [17:31<00:13,  6.52s/it]

Trees: 98, MSE: 0.12646


Training Progress:  99%|█████████▉| 99/100 [17:37<00:06,  6.48s/it]

Trees: 99, MSE: 0.12643


Training Progress: 100%|██████████| 100/100 [17:43<00:00, 10.64s/it]

Trees: 100, MSE: 0.12644

✅ Best MSE: 0.12643





In [None]:
import matplotlib.pyplot as plt

plt.plot(range(1, 101), mse_progress, marker='o')
plt.xlabel('Number of Trees')
plt.ylabel('MSE')
plt.title('MSE vs Number of Trees')
plt.grid(True)
plt.show()
