In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


#### Perform initial data preparation by converting the 'TotalCharges' column to numeric values and filling missing values with 0.
#### Convert the 'Churn' column to binary values, where 'No' is mapped to 0 and 'Yes' is mapped to 1

In [4]:
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

#### Split the data into an 80-20 train-test split with a random state of “1”

In [5]:
X = df.drop('Churn', axis=1)
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


#### Select these features:  
#### categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService','OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies','Contract', 'PaperlessBilling', 'PaymentMethod']
#### numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

In [6]:
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',

               'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',

               'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

#### The numerical features should be scaled using StandardScaler, convert the output back to a dataframe and put back the column names.
#### The categorical features are one-hot encoded using OneHotEncoder(set sparse_output to false), convert the output back to a dataframe and put back the column names.
#### Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes (use pd.concat)

In [7]:
scaler = StandardScaler()
ohe = OneHotEncoder(sparse_output=False)

X_train_num = scaler.fit_transform(X_train[numerical])
X_test_num = scaler.transform(X_test[numerical])

# Convert scaled features back to DataFrame and put back column names
X_train_num = pd.DataFrame(X_train_num, columns=numerical)
X_test_num = pd.DataFrame(X_test_num, columns=numerical)

# One-hot encode categorical features
X_train_cat = ohe.fit_transform(X_train[categorical])
X_test_cat = ohe.transform(X_test[categorical])

# Get feature names after one-hot encoding
feature_names = ohe.get_feature_names_out(categorical)

# Convert one-hot encoded features back to DataFrame and put back column names
X_train_cat = pd.DataFrame(X_train_cat, columns=feature_names)
X_test_cat = pd.DataFrame(X_test_cat, columns=feature_names)

# Combine scaled numerical and one-hot encoded categorical features
X_train = pd.concat([X_train_num, X_train_cat], axis=1)
X_test = pd.concat([X_test_num, X_test_cat], axis=1)



#### Use scikit learn to train a random forest and extra trees classifier, and use xgboost and lightgbm to train an extreme boosting model and a light gradient boosting model. Use random_state = 1 for training all models and evaluate on the test set. Answer from question 14

In [8]:
models = [
    ("Random Forest", RandomForestClassifier(random_state=1)),
    ("Extra Trees", ExtraTreesClassifier(random_state=1)),
    ("XGBoost", XGBClassifier(random_state=1)),
    ("LightGBM", LGBMClassifier(random_state=1))
]

# Iterate over the models and train them
for name, model in models:
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"{name} Accuracy: {acc}")

Random Forest Accuracy: 0.7913413768630234
Extra Trees Accuracy: 0.7672107877927609
XGBoost Accuracy: 0.7934705464868701
[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000398 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM Accuracy: 0.8034066713981547


#### To improve the Extra Trees Classifier, you will use the following parameters (number of estimators, minimum number of samples, minimum number of samples for leaf node and the number of features to consider when looking for the best split) for the hyperparameter grid needed to run a Randomized Cross Validation Search (RandomizedSearchCV). 

#### n_estimators = [50, 100, 300, 500, 1000]

#### min_samples_split = [2, 3, 5, 7, 9]

#### min_samples_leaf = [1, 2, 4, 6, 8]

#### max_features = ['auto', 'sqrt', 'log2', None] 

#### hyperparameter_grid = {'n_estimators': n_estimators,

####                       'min_samples_leaf': min_samples_leaf,

####                       'min_samples_split': min_samples_split,

####                       'max_features': max_features}

#### Using the ExtraTreesClassifier as your estimator with cv=5, n_iter=10, scoring = 'accuracy', n_jobs = -1, verbose = 1 and random_state = 1. What are the best hyperparameters from the randomized search CV?


In [9]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier

n_estimators = [50, 100, 300, 500, 1000]

min_samples_split = [2, 3, 5, 7, 9]

min_samples_leaf = [1, 2, 4, 6, 8]

max_features = ['auto', 'sqrt', 'log2', None] 

hyperparameter_grid = {'n_estimators': n_estimators,

                       'min_samples_leaf': min_samples_leaf,

                       'min_samples_split': min_samples_split,

                       'max_features': max_features}

extra_trees = ExtraTreesClassifier(random_state=1)

# Initialize the RandomizedSearchCV
random_cv = RandomizedSearchCV(estimator=extra_trees,
                               param_distributions=hyperparameter_grid,
                               cv=5, n_iter=10, scoring = 'accuracy',
                               n_jobs = -1, verbose = 1, 
                               random_state = 1)
random_cv.fit(X_train, y_train)

best_params = random_cv.best_params_
print(f"Best parameters: {best_params}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
19 fits failed with the following error:
Traceback (most recent call last):
  File "/home/chukwuneku/anaconda3/envs/hamoye/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/chukwuneku/anaconda3/envs/hamoye/lib/python3.12/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/home/chukwuneku/anaconda3/envs/hamoye/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/home/chukwuneku/anaconda3/envs/hamoye/lib/python3.12/site-packages/sklear

Best parameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


#### Train a new ExtraTreesClassifier Model with the new Hyperparameters from the RandomizedSearchCV (with random_state = 1). Is the accuracy of the new optimal model higher or lower than the initial ExtraTreesClassifier model with no hyperparameter tuning?


In [10]:
new_model = ExtraTreesClassifier(n_estimators=best_params['n_estimators'],
                                 min_samples_split=best_params['min_samples_split'],
                                 min_samples_leaf=best_params['min_samples_leaf'],
                                 max_features=best_params['max_features'],
                                 random_state=1)

new_model.fit(X_train, y_train)
new_model_accuracy = new_model.score(X_test, y_test)

old_model = ExtraTreesClassifier(random_state=1)
old_model.fit(X_train, y_train)
old_model_accuracy = old_model.score(X_test, y_test)

print(f"Old model accuracy: {old_model_accuracy}")
print(f"New model accuracy: {new_model_accuracy}")

Old model accuracy: 0.7672107877927609
New model accuracy: 0.8041163946061036


#### Find the feature importance using the optimal ExtraTreesClassifier model. Which features are the two most important respectively?

In [11]:
best_model = random_cv.best_estimator_

best_model.fit(X_train, y_train)

feature_importances = best_model.feature_importances_

importance_df = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importances
})

importance_df = importance_df.sort_values(by='Importance', ascending=False)

print(importance_df.iloc[:2])

                    Feature  Importance
37  Contract_Month-to-month    0.152237
0                    tenure    0.092800
