### First Competition Model: Random Forest

For the first competition, we implemented a **Random Forest Classifier** to predict whether an individual's annual income exceeds $50,000. Random forests were chosen because the algorithm handles both classification and regression tasks effectively and, by averaging many decision trees, is robust to overfitting. Performance was evaluated with **balanced accuracy** on a held-out validation split, and the hyperparameters were tuned with a cross-validated grid search.


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score

In [None]:
# Step 1: read the preprocessed training and test tables from disk.
train_df, test_df = map(
    pd.read_csv,
    ("../data/train_processed.csv", "../data/test_processed.csv"),
)

In [None]:
# Step 2: pull the "income" label out of the training table; every other
# column is used as a feature.
y = train_df["income"]
X = train_df.drop(columns=["income"])

In [None]:
# Step 3: hold out 20% of the rows for validation. The split is stratified
# on y and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Step 4: build a seeded 100-tree forest and fit it on the training split.
# fit() returns the estimator itself, so construction and training are
# chained; the bare name on the last line keeps the notebook's estimator
# display.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(
    X_train, y_train
)
rf_model

In [None]:
# Step 5: score the fitted forest on the held-out validation rows using
# balanced accuracy.
y_val_pred = rf_model.predict(X_val)
balanced_acc = balanced_accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"✅ Model Trained! Balanced Accuracy: {balanced_acc:.4f}")

In [None]:
# Step 6: label the test rows with the fitted forest.
# NOTE(review): test_df is passed as loaded; presumably it has exactly the
# feature columns the model was trained on -- confirm.
test_predictions = rf_model.predict(X=test_df)

In [None]:
# Step 7: write the test-set predictions to the submission file, formatted
# as integers.
submission_file = "../submissions/census_income_mustafozoda_sharif_1_random.csv"
np.savetxt(fname=submission_file, X=test_predictions, fmt="%d")

In [None]:
# Report the path the submission was written to.
print(f"✅ Predictions saved to: {submission_file}")

In [None]:
import numpy as np

# Read the saved submission back and sanity-check its contents.
submission_file = "../submissions/census_income_mustafozoda_sharif_1_random.csv"
predictions = np.loadtxt(submission_file, dtype=int)

print("✅ Submission file loaded successfully!")
label_values = np.unique(predictions)  # expected to be [0, 1]
print("Unique values in submission file:", label_values)
print("Total predictions:", len(predictions))

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the Random Forest grid search.
param_grid = dict(
    n_estimators=[100, 200, 300],   # number of trees in the forest
    max_depth=[10, 20, None],       # maximum tree depth (None = no limit)
    min_samples_split=[2, 5, 10],   # minimum samples needed to split a node
    min_samples_leaf=[1, 2, 4],     # minimum samples required at a leaf
)



In [None]:

# Initialize a fresh Random Forest with only the seed fixed; it serves as the
# base estimator handed to the grid search. Note that this rebinds rf_model,
# replacing the forest fitted earlier in the notebook.
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, ranked by
# balanced accuracy and parallelised with n_jobs=-1. fit() returns the
# search object, so the bare name on the last line keeps the notebook's
# estimator display.
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='balanced_accuracy',
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)
grid_search

In [None]:
# Keep the estimator the search exposes as best_estimator_ (with
# GridSearchCV's default refit behaviour this is the winning parameter
# combination refit on the data passed to fit()).
best_rf = grid_search.best_estimator_

In [None]:
# Score the tuned forest on the validation split with balanced accuracy.
y_val_pred = best_rf.predict(X_val)
balanced_acc = balanced_accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"✅ Best Random Forest Model Trained! Balanced Accuracy: {balanced_acc:.4f}")

In [None]:
# Count missing values per feature column and list them, largest first.
print("Missing values in X before splitting:")
missing_per_column = X.isnull().sum()
print(missing_per_column.sort_values(ascending=False))

In [None]:
# Fill missing values in 'sex' with the most frequent value (mode) of the
# TRAINING split. The validation split is filled with that same
# training-derived value rather than its own mode, so both splits are imputed
# consistently and no statistic is computed from held-out data.
sex_mode = X_train["sex"].mode()[0]
X_train["sex"] = X_train["sex"].fillna(sex_mode)
X_val["sex"] = X_val["sex"].fillna(sex_mode)

In [None]:
# Show only the training columns that still contain missing values.
print("Missing values in X_train:")
train_missing = X_train.isnull().sum()
print(train_missing[train_missing > 0])

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Logistic regression baseline on standardized features.
# The scaler's statistics are learned from the training split only and then
# applied unchanged to the validation split.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

# Score on the scaled validation features.
y_val_pred = log_reg.predict(X_val_scaled)
balanced_acc = balanced_accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"✅ Scaled Logistic Regression Balanced Accuracy: {balanced_acc:.4f}")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting comparison model: 200 boosting stages, learning rate 0.1,
# fixed seed. Fitted on the (unscaled) training split.
gb_model = GradientBoostingClassifier(
    random_state=42,
    learning_rate=0.1,
    n_estimators=200,
)
gb_model.fit(X_train, y_train)

# Score on the validation split with balanced accuracy.
y_val_pred = gb_model.predict(X_val)
balanced_acc = balanced_accuracy_score(y_true=y_val, y_pred=y_val_pred)
print(f"✅ Gradient Boosting Balanced Accuracy: {balanced_acc:.4f}")

In [None]:
# Show only the test columns that contain missing values.
print("Missing values in test_df:")
test_missing = test_df.isnull().sum()
print(test_missing[test_missing > 0])

In [None]:
# Fill missing values in 'sex' in the test set with the most frequent value
# (mode) of the TRAINING features. Using the training-derived value keeps the
# test set imputed the same way as the data the models are fitted on, instead
# of with a statistic computed from the test set itself.
test_df["sex"] = test_df["sex"].fillna(X_train["sex"].mode()[0])
# Total missing cells across all columns; 0 if 'sex' was the only column
# with gaps.
print("Final missing values in test_df:",
      test_df.isnull().sum().sum())

In [None]:
# Predict test labels with the gradient boosting model and write them out as
# integers.
# NOTE(review): this is the same path the Random Forest cells write to, so
# whichever cell runs last determines the file's contents.
test_predictions = gb_model.predict(test_df)
np.savetxt(
    "../submissions/census_income_mustafozoda_sharif_1_random.csv",
    test_predictions,
    fmt="%d",
)

print("✅ Gradient Boosting Predictions Saved!")

In [None]:
# Train Random Forest again for the final submission.
# GridSearchCV fits clones of the estimator it is given, so rf_model itself
# still carries its constructor defaults here. Apply the best parameter
# combination found by the search so the tuning result is actually used.
rf_model.set_params(**grid_search.best_params_)
rf_model.fit(X_train, y_train)

# Now make predictions on the test set
test_predictions = rf_model.predict(test_df)

# Save predictions as integers to the submission file
np.savetxt("../submissions/census_income_mustafozoda_sharif_1_random.csv",
           test_predictions, fmt="%d")
print("✅ Random Forest Predictions Saved!")

In [None]:
import numpy as np

# Reload the final submission file and print a quick summary of it.
submission_file = "../submissions/census_income_mustafozoda_sharif_1_random.csv"
predictions = np.loadtxt(submission_file, dtype=int)

print("Submission file loaded successfully!")
label_values = np.unique(predictions)
print("Unique values in submission file:", label_values)
print("Total predictions:", len(predictions))

In [None]:
import subprocess

# Export preprocessing.ipynb and modeling.ipynb to HTML with nbconvert.
# subprocess.run does not raise on a non-zero exit status by default, so the
# return codes are inspected and success is only reported when every export
# actually succeeded.
failed_exports = []
for notebook_path in ("../notebooks/preprocessing.ipynb",
                      "../notebooks/modeling.ipynb"):
    completed = subprocess.run(
        ["jupyter", "nbconvert", "--to", "html", notebook_path])
    if completed.returncode != 0:
        failed_exports.append(notebook_path)

if failed_exports:
    print("Notebook export failed for:", ", ".join(failed_exports))
else:
    print("Notebooks exported successfully!")

In [None]:
# Export the same two notebooks to PDF via nbconvert.
# NOTE(review): relies on `subprocess` having been imported by an earlier
# cell. The final call is left as a bare expression so the notebook still
# displays its CompletedProcess result.
pdf_command = ["jupyter", "nbconvert", "--to", "pdf"]
subprocess.run(pdf_command + ["../notebooks/preprocessing.ipynb"])
subprocess.run(pdf_command + ["../notebooks/modeling.ipynb"])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the "age" column (30 bins) with a KDE curve overlaid.
plt.figure(figsize=(8, 5))
age_ax = sns.histplot(train_df["age"], color="blue", kde=True, bins=30)
age_ax.set_title("Age Distribution")
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of the
# training table.
plt.figure(figsize=(12, 8))
correlations = train_df.corr()
sns.heatmap(correlations, cmap="coolwarm", fmt=".2f", annot=True)
plt.title("Feature Correlation Heatmap")
plt.show()

In [None]:
# Row counts per "sex" value, with bars split by the "income" label.
plt.figure(figsize=(8, 5))
sns.countplot(data=train_df, x="sex", hue="income", palette="viridis")
plt.title("Income Comparison by Sex")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of rf_model's predictions on the validation split.
y_val_pred = rf_model.predict(X_val)
cm = confusion_matrix(y_true=y_val, y_pred=y_val_pred)

# Wrap the matrix in a display object and draw it with a blue colour map.
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap="Blues")
plt.title("Random Forest Confusion Matrix")
plt.show()