In [51]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report

In [7]:
df = pd.read_csv(r"C:\Users\DELL\Documents\DATA_SCIENCE\Hamoye_Internship\LESSON 3\WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [8]:
# Preprocessing
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

In [9]:
# Split the data into train and test sets
x = df.drop('Churn', axis=1)
y = df['Churn']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [10]:
# feature selection
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
               'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
               'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [47]:
# Feature engineering
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train[numerical])
x_test_scaled = scaler.transform(x_test[numerical])

x_train_scaled_df = pd.DataFrame(x_train_scaled, columns=x_train[numerical].columns)
x_test_scaled_df = pd.DataFrame(x_test_scaled, columns=x_test[numerical].columns)

encoder = OneHotEncoder(sparse=False)
x_train_encoded = pd.DataFrame(encoder.fit_transform(x_train[categorical]))
x_test_encoded = pd.DataFrame(encoder.transform(x_test[categorical]))

x_train_encoded.columns = encoder.get_feature_names_out(categorical)
x_test_encoded.columns = encoder.get_feature_names_out(categorical)

x_train_final = pd.concat([x_train_encoded, x_train_scaled_df], axis=1)
x_test_final = pd.concat([x_test_encoded, x_test_scaled_df], axis=1)



In [61]:
# since RandomForestClassifier does not handle missing values (NaN) natively

from sklearn.impute import SimpleImputer

# Initialize the imputer
imputer = SimpleImputer(strategy='mean')

# Fit and transform the imputer on your training data
x_train_final_imputed = imputer.fit_transform(x_train_final)

In [52]:
# Training models
from sklearn.metrics import accuracy_score, classification_report

# Set random_state for reproducibility
random_state = 1

# Initialize and train Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=random_state)
rf_classifier.fit(x_train_final_imputed, y_train)

# Initialize and train Extra Trees Classifier
et_classifier = ExtraTreesClassifier(random_state=random_state)
et_classifier.fit(x_train_final_imputed, y_train)

# Initialize and train XGBoost Classifier
xgb_classifier = XGBClassifier(random_state=random_state)
xgb_classifier.fit(x_train_final_imputed, y_train)

# Initialize and train LightGBM Classifier
lgbm_classifier = LGBMClassifier(random_state=random_state)
lgbm_classifier.fit(x_train_final_imputed, y_train)

[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.037390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785


In [54]:
# Make predictions
rf_predictions = rf_classifier.predict(x_test_final)
et_predictions = et_classifier.predict(x_test_final)
xgb_predictions = xgb_classifier.predict(x_test_final)
lgbm_predictions = lgbm_classifier.predict(x_test_final)



In [55]:
# Evaluate the models
print("Random Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

print("\nExtra Trees Classifier:")
print("Accuracy:", accuracy_score(y_test, et_predictions))
print("Classification Report:\n", classification_report(y_test, et_predictions))

print("\nXGBoost Classifier:")
print("Accuracy:", accuracy_score(y_test, xgb_predictions))
print("Classification Report:\n", classification_report(y_test, xgb_predictions))

print("\nLightGBM Classifier:")
print("Accuracy:", accuracy_score(y_test, lgbm_predictions))
print("Classification Report:\n", classification_report(y_test, lgbm_predictions))


Random Forest Classifier:
Accuracy: 0.794180269694819
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.88      0.87      1061
           1       0.59      0.53      0.56       348

    accuracy                           0.79      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.79      0.79      0.79      1409


Extra Trees Classifier:
Accuracy: 0.7686302342086586
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.86      0.85      1061
           1       0.54      0.48      0.50       348

    accuracy                           0.77      1409
   macro avg       0.68      0.67      0.68      1409
weighted avg       0.76      0.77      0.76      1409


XGBoost Classifier:
Accuracy: 0.7934705464868701
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.87      0.86      1061
       

In [56]:
from sklearn.model_selection import RandomizedSearchCV

# Define the hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
max_features = ['auto', 'sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Create the Extra Trees Classifier
et_classifier = ExtraTreesClassifier(random_state=1)

# Run RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=et_classifier,
    param_distributions=hyperparameter_grid,
    cv=5,
    n_iter=10,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=1
)

# Fit the data to find the best hyperparameters
random_search.fit(x_train_final_imputed, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [57]:
# Access the best hyperparameters
best_hyperparameters = random_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:")
for param, value in best_hyperparameters.items():
    print(f"{param}: {value}")

Best Hyperparameters:
n_estimators: 1000
min_samples_split: 9
min_samples_leaf: 8
max_features: sqrt


In [59]:
model = ExtraTreesClassifier(n_estimators=1000, min_samples_leaf=8, min_samples_split=9, max_features='sqrt', random_state=1)

In [63]:
model.fit(x_train_final_imputed, y_train)   
y_pred = model.predict(x_test_final)   
accuracy = accuracy_score(y_test, y_pred)



In [64]:
accuracy

0.8026969481902059