In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('Customer_Data.csv')

  # Clean data
data = data[data['Monthly_Charge'] >= 0]
data.fillna({'Gender': 'Unknown', 'Contract': 'Unknown', 'Internet_Type': 'None'}, inplace=True)

  # Feature engineering
data['Average_Monthly_Charge'] = data['Total_Charges'] / (data['Tenure_in_Months'] + 1)
data['Referral_Rate'] = data['Number_of_Referrals'] / (data['Tenure_in_Months'] + 1)
data['Purchase_Frequency'] = np.where(data['Tenure_in_Months'] > 0, 1 / data['Tenure_in_Months'], 0)
data['Engagement_Score'] = (
    data['Number_of_Referrals'] / (data['Number_of_Referrals'].max() + 1) +
    data['Multiple_Lines'].apply(lambda x: 0.3 if x == 'Yes' else 0) +
    data['Internet_Service'].apply(lambda x: 0.3 if x == 'Yes' else 0) +
    data['Streaming_TV'].apply(lambda x: 0.2 if x == 'Yes' else 0) +
    data['Streaming_Movies'].apply(lambda x: 0.2 if x == 'Yes' else 0)
)

  # Normalize numerical features
scaler = StandardScaler()
numerical_cols = ['Age', 'Tenure_in_Months', 'Monthly_Charge', 'Average_Monthly_Charge', 'Referral_Rate', 'Purchase_Frequency', 'Engagement_Score']
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

  # Clustering with K-Means
X_cluster = data[['Average_Monthly_Charge', 'Tenure_in_Months', 'Engagement_Score']].fillna(0)
kmeans = KMeans(n_clusters=3, random_state=42)
data['Cluster'] = kmeans.fit_predict(X_cluster)

  # Anomaly detection with DBSCAN
X_anomaly = data[['Average_Monthly_Charge', 'Tenure_in_Months', 'Engagement_Score']].fillna(0)
dbscan = DBSCAN(eps=0.5, min_samples=5)
data['Anomaly'] = dbscan.fit_predict(X_anomaly)

  # Save enhanced dataset
data.to_csv('Customer_Data_Enhanced.csv', index=False)

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

  # Load data
data = pd.read_csv('Customer_Data_Enhanced.csv')
data['Churn_Label'] = data['Customer_Status'].map({'Stayed': 0, 'Churned': 1, 'Joined': 0})  # Treat Joined as non-churned

  # Features and target
features = ['Age', 'Tenure_in_Months', 'Monthly_Charge', 'Number_of_Referrals', 'Average_Monthly_Charge', 'Engagement_Score']
X = data[features].fillna(0)
y = data['Churn_Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

  # Initialize models
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(random_state=42)
}

  # Train and evaluate models
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred)
    })
    if name == 'Random Forest':
        data['Churn_Probability'] = model.predict_proba(X)[:, 1]
        importance = pd.DataFrame({'Feature': features, 'Importance': model.feature_importances_})
        importance.to_csv('Feature_Importance.csv', index=False)

  # Save results
pd.DataFrame(results).to_csv('Model_Performance.csv', index=False)
data.to_csv('Customer_Data_Predictions.csv', index=False)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[LightGBM] [Info] Number of positive: 1220, number of negative: 3197
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 721
[LightGBM] [Info] Number of data points in the train set: 4417, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.276206 -> initscore=-0.963362
[LightGBM] [Info] Start training from score -0.963362


In [7]:
from sklearn.neural_network import MLPClassifier
def train_ann(data):
    X = data[['Age', 'Tenure_in_Months', 'Monthly_Charge', 'Number_of_Referrals', 'Average_Monthly_Charge', 'Engagement_Score']].fillna(0)
    y = data['Customer_Status'].map({'Stayed': 0, 'Churned': 1, 'Joined': 0})
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    ann = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
    ann.fit(X_train, y_train)
    data['ANN_Churn_Probability'] = ann.predict_proba(X)[:, 1]
    data.to_csv('Customer_Data_ANN.csv', index=False)
train_ann(data)

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
def train_rnn(data):
    # Aggregate by Tenure_in_Months
    ts_data = data.groupby('Tenure_in_Months').agg({'Churn_Label': 'mean', 'Total_Charges': 'sum'}).reset_index()
    X = ts_data[['Total_Charges']].values
    y = ts_data['Churn_Label'].values
    X = X.reshape((X.shape[0], 1, X.shape[1]))  # Reshape for LSTM
    model = Sequential()
    model.add(LSTM(50, input_shape=(1, X.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy')
    model.fit(X, y, epochs=50, verbose=0)
    ts_data['RNN_Churn_Probability'] = model.predict(X)
    ts_data.to_csv('RNN_Forecast.csv', index=False)
train_rnn(data)

  super().__init__(**kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step
