In [4]:
"""
Kaggle campaign contribution lab – combined script
Created from photographed notebook cells.
"""

# ------------------------------------------------------------------
# 0. Imports & settings
# ------------------------------------------------------------------
import re
import warnings
import numpy as np
import pandas as pd
import networkx as nx
from networkx.algorithms import bipartite

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# ------------------------------------------------------------------
# 1. Load raw CSV files
# ------------------------------------------------------------------
# Bipartite state‑level contribution networks
all_candidates_state_bi = pd.read_csv("all_candidates_state_bipartite_weighted_network.csv")
winning_candidates_state_bi = pd.read_csv("winning_candidates_state_bipartite_weighted_network.csv")

for df in (all_candidates_state_bi, winning_candidates_state_bi):
    df.index = df["Unnamed: 0"]
    df.drop(columns=["Unnamed: 0", "Unnamed: 1"], inplace=True)

# Top‑100 contributor networks
federal_net = pd.read_csv("federal_contributor_top100_contributors_network.csv")
state_net   = pd.read_csv("state_contributor_top100_contributors_network.csv")
for df in (federal_net, state_net):
    df.index = df["Unnamed: 0"]
    df.drop(columns=["Unnamed: 0"], inplace=True)

# Tabular train/test sets
train_raw = pd.read_csv("training_data.csv")
test_raw  = pd.read_csv("test_data.csv")

# ------------------------------------------------------------------
# 2. Build bipartite graphs & projected state graphs
# ------------------------------------------------------------------
def build_projected_state_graph(df):
    """Return projected homogeneous state graph with custom weights."""
    B = nx.Graph()
    states     = list(df.columns)
    candidates = list(df.index)

    B.add_nodes_from(candidates, bipartite=0)
    B.add_nodes_from(states,     bipartite=1)

    for s in states:
        for c in df[df[s] > 0].index:
            B.add_edge(s, c, weight=df.loc[c, s])

    def my_weight(G, u, v, weight="weight"):
        w = 0
        for nbr in set(G[u]) & set(G[v]):
            w += G[u][nbr].get(weight, 1) + G[v][nbr].get(weight, 1)
        return w

    bottom_nodes = {n for n, d in B.nodes(data=True) if d["bipartite"] == 1}
    return bipartite.generic_weighted_projected_graph(B, bottom_nodes,
                                                      weight_function=my_weight)

G_all = build_projected_state_graph(all_candidates_state_bi)
G_win = build_projected_state_graph(winning_candidates_state_bi)

# ------------------------------------------------------------------
# 3. Helper to parse “top‑100” contributor column names
# ------------------------------------------------------------------
def parse_top100_cols(df_net):
    meta = {}
    for col in df_net.columns:
        name = " ".join(col.split(" ")[:-2])
        contr_type = "Non-Individual" if re.search(",", name) is None else "Individual"
        zip_code = col.split(" ")[-2] or 0
        state = col.split(" ")[-1]
        meta[col] = {"name": name, "zip_code": zip_code, "state": state,
                     "contributor_type": contr_type}
    return meta

federal_meta = parse_top100_cols(federal_net)
state_meta   = parse_top100_cols(state_net)

# ------------------------------------------------------------------
# 4. Merge raw train/test into a single “all_data” frame
# ------------------------------------------------------------------
train_raw["train_label"] = 1
test_raw["train_label"]  = 0
train_raw["index"]       = train_raw.index
test_raw["index"]        = test_raw.index

all_data = pd.concat([train_raw, test_raw], axis=0)

# ------------------------------------------------------------------
# 5. Add graph‑based centrality & community features
# ------------------------------------------------------------------
def add_degree(df, graph, col, weight=None):
    deg = pd.DataFrame(nx.degree(graph, weight=weight), columns=["state", col])
    return df.merge(deg, how="left")

all_data = (all_data
            .pipe(add_degree, G_all, "d_all")
            .pipe(add_degree, G_win, "d_win")
            .pipe(add_degree, G_all, "d_weight_all", weight="weight")
            .pipe(add_degree, G_win, "d_weight_win", weight="weight"))

# Community clustering with greedy modularity
def add_communities(df, graph, prefix):
    groups = nx.community.greedy_modularity_communities(graph, weight="weight")
    lbl = np.full(len(df), -1, dtype=int)
    num = np.full(len(df), -1, dtype=int)
    for cid, nodes in enumerate(groups):
        m = df["state"].isin(nodes)
        lbl[m] = cid
        num[m] = len(nodes)
    df[f"{prefix}"] = lbl
    df[f"{prefix}_num"] = num
    return df

all_data = add_communities(all_data, G_all, "greedy_modularity")
all_data = add_communities(all_data, G_win, "greedy_modularity_win")

# Louvain clustering
def add_louvain(df, graph, prefix):
    groups = nx.community.louvain_communities(graph)
    lbl = np.full(len(df), -1, dtype=int)
    num = np.full(len(df), -1, dtype=int)
    for cid, nodes in enumerate(groups):
        m = df["state"].isin(nodes)
        lbl[m] = cid
        num[m] = len(nodes)
    df[f"{prefix}"] = lbl
    df[f"{prefix}_num"] = num
    return df

all_data = add_louvain(all_data, G_all, "louvain_communities")
all_data = add_louvain(all_data, G_win, "louvain_communities_win")

# ------------------------------------------------------------------
# 6. Top‑100 contributor degree‑based features
# ------------------------------------------------------------------
def contributor_features(net, meta, prefix):
    d = pd.DataFrame(nx.degree(nx.from_pandas_adjacency(net), weight="weight"),
                     columns=["name", "d"])

    # Build individual lists for state, type, zip_code
    state_list, type_list, zip_list = [], [], []
    for name in d["name"]:
        meta_row = meta.get(name, {"state": None, "contributor_type": None, "zip_code": None})
        state_list.append(meta_row["state"])
        type_list.append(meta_row["contributor_type"])
        zip_list.append(meta_row["zip_code"])

    d[f"state_{prefix}"] = state_list
    d[f"type_{prefix}"] = type_list
    d[f"zip_{prefix}"] = zip_list

    # Group means — only on 'd' (degree)
    state = (
        d.groupby(f"state_{prefix}")["d"]
        .mean().reset_index().rename(columns={"d": f"state_{prefix}"})
    )
    type_ = (
        d.groupby(f"type_{prefix}")["d"]
        .mean().reset_index().rename(columns={"d": f"type_{prefix}"})
    )
    zip_ = (
        d.groupby(f"zip_{prefix}")["d"]
        .mean().reset_index().rename(columns={"d": f"zip_{prefix}"})
    )
    zip_[f"zip_{prefix}"] = zip_[f"zip_{prefix}"].astype(float)

    # Group by all three categorical keys
    state_type_zip = (
        d.groupby([f"state_{prefix}", f"type_{prefix}", f"zip_{prefix}"])["d"]
        .mean().reset_index().rename(columns={"d": f"mean_money_{prefix}"})
    )
    state_type_zip["zip_code"] = state_type_zip[f"zip_{prefix}"].astype(float)

    return state, type_, zip_, state_type_zip

# State‑level
s_state, s_type, s_zip, s_stz = contributor_features(state_net, state_meta, "100_s_d")
# Federal‑level
f_state, f_type, f_zip, f_stz = contributor_features(federal_net, federal_meta, "100_f_d")

all_data = all_data.merge(s_state, how="left", on="state")
all_data = all_data.merge(s_type, how="left", on="contributor_type")
all_data = all_data.merge(s_zip, how="left", on="zip_code")
all_data = all_data.merge(s_stz, how="left", on=["state", "contributor_type", "zip_code"])

all_data = all_data.merge(f_state, how="left", on="state")
all_data = all_data.merge(f_type, how="left", on="contributor_type")
all_data = all_data.merge(f_zip, how="left", on="zip_code")
all_data = all_data.merge(f_stz, how="left", on=["state", "contributor_type", "zip_code"])


# ------------------------------------------------------------------
# 7. Encode categorical columns, clean NA/inf
# ------------------------------------------------------------------
def ordinalize(df, col):
    labels = df[col].astype(str).unique().tolist()
    df[col] = df[col].astype(str).apply(lambda x: labels.index(x))
    return df

for col in ("general_sector", "city", "zip_code",
            "specific_sector", "state", "contributor_type"):
    if col in all_data.columns:
        all_data = ordinalize(all_data, col)

all_data = all_data.replace("#NAME?", 0)
all_data[np.isinf(all_data)] = 0
all_data = all_data.fillna(0)

# One‑hot encode selected discrete vars
all_data = pd.get_dummies(
    all_data,
    columns=["specific_sector", "state", "contributor_type",
             "greedy_modularity", "greedy_modularity_win",
             "louvain_communities", "louvain_communities_win"]
)

  train_raw = pd.read_csv("training_data.csv")


KeyError: 'state'

In [None]:
# ------------------------------------------------------------------
# 8. Train/test/validation split for modeling
# ------------------------------------------------------------------
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# First separate the training data from the final test data
train_df = all_data[all_data["train_label"] == 1].drop(columns=["index"])
test_df = all_data[all_data["train_label"] == 0]

# Now split the training data into train and validation sets (80% train, 20% validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=42)

# Prepare X and y for each set
y_train = train_df["winner_ratio"]
X_train = train_df.drop(columns=["winner_ratio", "train_label"])

y_val = val_df["winner_ratio"]
X_val = val_df.drop(columns=["winner_ratio", "train_label"])

test_idx = test_df["index"]
X_test = test_df.drop(columns=["winner_ratio", "train_label", "index"])

# ------------------------------------------------------------------
# 9. Train Random Forest model directly (no hyperparameter tuning)
# ------------------------------------------------------------------
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import numpy as np

# Create and train a Random Forest model with default parameters
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on train and validation sets
train_pred = rf.predict(X_train)
val_pred = rf.predict(X_val)

# Calculate RMSE for both sets
train_rmse = np.sqrt(metrics.mean_squared_error(y_train, train_pred))
val_rmse = np.sqrt(metrics.mean_squared_error(y_val, val_pred))

print(f"Training RMSE: {train_rmse:,.4f}")
print(f"Validation RMSE: {val_rmse:,.4f}")

# ------------------------------------------------------------------
# Plot actual vs predicted values for train and validation data
# ------------------------------------------------------------------
# Create a figure with two subplots side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

# Training data plot
ax1.scatter(y_train, train_pred, alpha=0.5)
ax1.plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--')
ax1.set_xlabel('Actual Values')
ax1.set_ylabel('Predicted Values')
ax1.set_title(f'Training Data: Actual vs Predicted\nRMSE: {train_rmse:.4f}')

# Validation data plot
ax2.scatter(y_val, val_pred, alpha=0.5)
ax2.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--')
ax2.set_xlabel('Actual Values')
ax2.set_ylabel('Predicted Values')
ax2.set_title(f'Validation Data: Actual vs Predicted\nRMSE: {val_rmse:.4f}')

plt.tight_layout()
plt.savefig('actual_vs_predicted.png')
plt.show()

# ------------------------------------------------------------------
# 10. Predict test & save submission
# ------------------------------------------------------------------
test_pred = rf.predict(X_test)
submission = pd.DataFrame({"index": test_idx, "winner_ratio": test_pred})
submission.to_csv("sample_submission_combined_RF.csv", index=False)
print("Submission saved → sample_submission_combined_RF.csv")
print("Plot saved → actual_vs_predicted.png")

In [None]:
ratan

In [None]:
# ------------------------------------------------------------------
# 8. Train/test split for modeling
# ------------------------------------------------------------------
train_df = all_data[all_data["train_label"] == 1].drop(columns=["index"])
test_df  = all_data[all_data["train_label"] == 0]

y_train = train_df["winner_ratio"]
X_train = train_df.drop(columns=["winner_ratio", "train_label"])
test_idx = test_df["index"]
X_test  = test_df.drop(columns=["winner_ratio", "train_label", "index"])

# ------------------------------------------------------------------
# 9. Random Forest + RandomizedSearchCV
# ------------------------------------------------------------------
param_dists = {
    "n_estimators": range(1, 50, 5),
    "max_depth": range(1, 30, 5),
    "min_samples_split": [2, 3, 4],
    "min_samples_leaf": [1, 2, 3, 4, 5],
}

rf = RandomForestRegressor(random_state=42)
rf_cv = RandomizedSearchCV(
    rf, param_dists, n_iter=300, cv=3,
    scoring="neg_mean_squared_error", n_jobs=-1, verbose=1
)
rf_cv.fit(X_train, y_train)

train_rmse = np.sqrt(metrics.mean_squared_error(y_train, rf_cv.predict(X_train)))
print(f"Training RMSE: {train_rmse:,.4f}")

# ------------------------------------------------------------------
# 10. Predict test & save submission
# ------------------------------------------------------------------
test_pred = rf_cv.predict(X_test)
submission = pd.DataFrame({"index": test_idx, "winner_ratio": test_pred})
submission.to_csv("sample_submission_combined_RF.csv", index=False)
print("Submission saved → sample_submission_combined_RF.csv")
