### 1. Import Required Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

### 2. Load and Prepare Data

In [9]:
# Load your dataset (replace with your data)
data = pd.read_csv("C:/Users/Lenovo/Desktop/R_Software/Customer_churn_raw.csv")

In [13]:
# Split into features (X) and target (y)
X = data.drop('Customer Value', axis=1)
y = data['Customer Value']

In [15]:
X

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Tariff Plan,Status,Age,Churn
0,8,0,38.0,0,4370,71.0,5.0,17.0,1,1,5,0
1,0,0,39.0,0,,5.0,7.0,4.0,1,1,6,0
2,10,0,37.0,0,2453,60.0,359.0,24.0,1,1,2,0
3,10,0,38.0,0,4198,66.0,1.0,35.0,1,1,10,0
4,3,0,,0,2393,58.0,2.0,33.0,1,1,9,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19.0,2,6697,147.0,92.0,44.0,2,1,68,0
3146,17,0,17.0,1,9237,177.0,80.0,42.0,1,1,65,0
3147,13,0,18.0,4,3157,-51.0,38.0,21.0,1,1,64,0
3148,7,0,11.0,2,4695,46.0,222.0,12.0,1,1,62,0


In [17]:
y

0        197.640
1         46.035
2       1536.520
3        240.020
4        145.805
          ...   
3145     721.980
3146     261.210
3147     280.320
3148    1077.640
3149     100.680
Name: Customer Value, Length: 3150, dtype: float64

In [19]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 3. Initialize the Gradient Boosting Regressor

In [22]:
model = GradientBoostingRegressor(
    n_estimators=100,  # Number of boosting stages
    learning_rate=0.1,  # Step size shrinkage to prevent overfitting
    max_depth=3,  # Maximum depth of individual trees
    random_state=42
)

### 4. Train the Model

In [25]:
model.fit(X_train, y_train)

ValueError: could not convert string to float: 'O'

### 5. Make Predictions

In [None]:
y_pred = model.predict(X_test)

### 6. Evaluate Model Performance

In [None]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²): {r2:.2f}")

### 7. Hyperparameter Tuning (Optional)

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best hyperparameters: {best_params}")

### 8. Visualize Results

In [None]:
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
plt.xlabel('True Values')
plt.ylabel('Predicted Values')
plt.title('True vs. Predicted Values')
plt.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=2)  # Diagonal line
plt.show()

Key Hyperparameters to Tune:

    n_estimators: Number of boosting stages (higher = better performance but risk of overfitting).

    learning_rate: Reduces the contribution of each tree (lower values require more trees).

    max_depth: Controls the complexity of individual trees.

    subsample: Fraction of samples used for fitting trees (stochastic gradient boosting).

Advantages of Gradient Boosting:

    Handles non-linear relationships and interactions.

    Robust to outliers and missing data (with appropriate preprocessing).

    Often achieves state-of-the-art performance on structured data.

Notes:

    For large datasets, consider XGBoost, LightGBM, or CatBoost for faster training.

    Use early stopping (n_iter_no_change) to prevent overfitting.

    Check feature importance with model.feature_importances_.
