# Tackling Overfitting in Machine Learning using Random Forest and XGBoost

### Objective
Train Random Forest and XGBoost models on a dataset, identify overfitting, and apply techniques to reduce it.

### Step 1: Set Up the Environment
Install the required libraries:
```bash
pip install pandas numpy scikit-learn xgboost matplotlib openpyxl
```

In [None]:
# Step 2: Load and Prepare the Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Fetch the real-estate valuation spreadsheet and give its columns short names
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00477/Real%20estate%20valuation%20data%20set.xlsx'
data = pd.read_excel(url)
data.columns = [
    'Index',
    'TransactionDate',
    'HouseAge',
    'DistanceToMRT',
    'NumberOfStores',
    'Latitude',
    'Longitude',
    'Price',
]
# The row index column is dropped so it is not used as a feature
data = data.drop(columns='Index')

# Everything except 'Price' is a feature; 'Price' is the regression target
X = data.drop(columns='Price')
y = data['Price']

# Hold out 20% of the rows for testing (seeded so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 3: Train a Random Forest Model
from sklearn.ensemble import RandomForestRegressor

# Train a baseline Random Forest model (only the tree count and seed are set)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate on both splits; a train RMSE far below the test RMSE signals overfitting
train_preds = rf.predict(X_train)
test_preds = rf.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecated and then removed.
rf_train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
rf_test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

print('Random Forest:')
print(f'Train RMSE: {rf_train_rmse:.2f}')
print(f'Test RMSE: {rf_test_rmse:.2f}')

In [None]:
# Step 5: Mitigate Overfitting in Random Forest
# Constrain tree growth (depth, split/leaf sizes) and the share of features
# considered per split so individual trees fit the training data less tightly.
rf_tuned = RandomForestRegressor(
    n_estimators=100,
    max_depth=10,
    max_features=0.8,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=42,
)
rf_tuned.fit(X_train, y_train)

# Evaluate on both splits to compare the train/test gap with the baseline
train_preds_tuned = rf_tuned.predict(X_train)
test_preds_tuned = rf_tuned.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecated and then removed.
rf_tuned_train_rmse = np.sqrt(mean_squared_error(y_train, train_preds_tuned))
rf_tuned_test_rmse = np.sqrt(mean_squared_error(y_test, test_preds_tuned))

print('Tuned Random Forest:')
print(f'Train RMSE: {rf_tuned_train_rmse:.2f}')
print(f'Test RMSE: {rf_tuned_test_rmse:.2f}')

In [None]:
# Step 6: Train an XGBoost Model
import xgboost as xgb

# Baseline XGBoost regressor (only the number of boosting rounds and seed are set)
xgb_model = xgb.XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Evaluate on both splits; a train RMSE far below the test RMSE signals overfitting
train_preds_xgb = xgb_model.predict(X_train)
test_preds_xgb = xgb_model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecated and then removed.
xgb_train_rmse = np.sqrt(mean_squared_error(y_train, train_preds_xgb))
xgb_test_rmse = np.sqrt(mean_squared_error(y_test, test_preds_xgb))

print('XGBoost:')
print(f'Train RMSE: {xgb_train_rmse:.2f}')
print(f'Test RMSE: {xgb_test_rmse:.2f}')

In [None]:
# Step 7: Mitigate Overfitting in XGBoost
# Add split/leaf constraints (min_child_weight, gamma) and L1/L2 penalties
# (reg_alpha, reg_lambda) on top of the baseline configuration.
xgb_tuned = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=6,
    min_child_weight=3,
    gamma=0.1,
    reg_alpha=0.5,
    reg_lambda=1.0,
    random_state=42,
)
xgb_tuned.fit(X_train, y_train)

# Evaluate on both splits to compare the train/test gap with the baseline
train_preds_xgb_tuned = xgb_tuned.predict(X_train)
test_preds_xgb_tuned = xgb_tuned.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`, which
# newer scikit-learn releases deprecated and then removed.
xgb_tuned_train_rmse = np.sqrt(mean_squared_error(y_train, train_preds_xgb_tuned))
# The test score must use the test-set predictions; scoring the training
# predictions against y_test compares arrays of different lengths.
xgb_tuned_test_rmse = np.sqrt(mean_squared_error(y_test, test_preds_xgb_tuned))

print('Tuned XGBoost:')
print(f'Train RMSE: {xgb_tuned_train_rmse:.2f}')
print(f'Test RMSE: {xgb_tuned_test_rmse:.2f}')

In [None]:
# Step 8: Visualize Feature Importance
import matplotlib.pyplot as plt

# Tuned Random Forest: horizontal bars, one per feature column
feature_importance_rf = rf_tuned.feature_importances_
plt.barh(y=X.columns, width=feature_importance_rf)
plt.title('Random Forest Feature Importance')
plt.show()

# Tuned XGBoost: use the library's built-in importance plot
xgb.plot_importance(xgb_tuned, title='XGBoost Feature Importance')
plt.show()