<a href="https://colab.research.google.com/github/medicadex/Hamoye_Project/blob/main/Adebowale_qazeem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
# Load the Telco customer-churn dataset from the working directory.
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Show every column name as a plain array so the schema is visible at a glance.
column_names = np.array(df.columns)
print(column_names)

['customerID' 'gender' 'SeniorCitizen' 'Partner' 'Dependents' 'tenure'
 'PhoneService' 'MultipleLines' 'InternetService' 'OnlineSecurity'
 'OnlineBackup' 'DeviceProtection' 'TechSupport' 'StreamingTV'
 'StreamingMovies' 'Contract' 'PaperlessBilling' 'PaymentMethod'
 'MonthlyCharges' 'TotalCharges' 'Churn']


In [None]:
from sklearn.model_selection import train_test_split

# Convert 'TotalCharges' to numeric. Entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce').fillna(0)

# Convert 'Churn' column to binary.
# The mapping also accepts the already-encoded 0/1 values so that re-running
# this cell is a no-op; with only the 'No'/'Yes' keys a second run would turn
# every label into NaN because the column no longer contains those strings.
df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1, 0: 0, 1: 1})

# Define features: columns to one-hot encode vs. columns to scale.
categorical = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
               'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
               'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
numerical = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Split data into train and test sets (80/20, fixed seed for repeatability).
# 'customerID' is deliberately left out of X: it is in neither feature list.
X = df[categorical + numerical]
y = df['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Define preprocessing steps: standardise the numeric columns and one-hot
# encode the categorical ones.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# The encoder is left at its default (sparse) output. The keyword that used to
# force dense output here, `sparse=False`, was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing either spelling breaks on
# some versions. Dense output is requested on the ColumnTransformer instead.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder())
])

# sparse_threshold=0 makes the ColumnTransformer always return a dense array,
# which is what the DataFrame construction below expects.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)
    ],
    sparse_threshold=0)

# Fit the preprocessing pipeline on the training data only, then apply the
# already-fitted transformation to the test data.
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Get column names for one-hot encoded features
ohe_feature_names = preprocessor.named_transformers_['cat']['onehot'].get_feature_names_out(categorical)

# Combine scaled numerical and one-hot encoded categorical features into train and test set dataframes.
# Column order follows the transformer order above: 'num' first, then 'cat'.
# NOTE: these frames get a fresh 0..n-1 index, so they line up with
# y_train / y_test by position only, not by index label.
X_train_prep_df = pd.DataFrame(X_train_prep, columns=numerical + list(ohe_feature_names))
X_test_prep_df = pd.DataFrame(X_test_prep, columns=numerical + list(ohe_feature_names))




In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Build the four tree-ensemble classifiers, all seeded identically so that
# repeated runs give the same models.
rf = RandomForestClassifier(random_state=1)
et = ExtraTreesClassifier(random_state=1)
xgb = XGBClassifier(random_state=1)
lgbm = LGBMClassifier(random_state=1)

# Fit every model on the same preprocessed training frame, in declaration order.
for estimator in (rf, et, xgb, lgbm):
    estimator.fit(X_train_prep_df, y_train)

# Score each fitted model on the held-out test frame.
rf_score, et_score, xgb_score, lgbm_score = (
    estimator.score(X_test_prep_df, y_test) for estimator in (rf, et, xgb, lgbm)
)

# Report the scores side by side.
for label, accuracy in (
    ("Random Forest Accuracy:", rf_score),
    ("Extra Trees Accuracy:", et_score),
    ("XGBoost Accuracy:", xgb_score),
    ("LightGBM Accuracy:", lgbm_score),
):
    print(label, accuracy)


[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
Random Forest Accuracy: 0.7913413768630234
Extra Trees Accuracy: 0.7672107877927609
XGBoost Accuracy: 0.7934705464868701
LightGBM Accuracy: 0.8034066713981547


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import ExtraTreesClassifier

# Define hyperparameter grid
n_estimators = [50, 100, 300, 500, 1000]
min_samples_split = [2, 3, 5, 7, 9]
min_samples_leaf = [1, 2, 4, 6, 8]
# 'auto' is intentionally not offered: for classifiers it was only an alias of
# 'sqrt', it was deprecated in scikit-learn 1.1 and removed in 1.3, and on
# current versions every candidate that draws it fails to fit.
max_features = ['sqrt', 'log2', None]

hyperparameter_grid = {
    'n_estimators': n_estimators,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split,
    'max_features': max_features
}

# Instantiate ExtraTreesClassifier
etc = ExtraTreesClassifier(random_state=1)

# Set up RandomizedSearchCV: 10 sampled candidates, each scored by 5-fold
# cross-validated accuracy; random_state fixes which candidates are drawn.
random_search = RandomizedSearchCV(etc, hyperparameter_grid, cv=5, n_iter=10, scoring='accuracy', n_jobs=-1, verbose=1, random_state=1)

# Fit RandomizedSearchCV on training data (the test set is not touched here)
random_search.fit(X_train_prep_df, y_train)

# Retrieve the best hyperparameters
best_params = random_search.best_params_

# Print the best hyperparameters
print("Best Hyperparameters:", best_params)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Hyperparameters: {'n_estimators': 1000, 'min_samples_split': 9, 'min_samples_leaf': 8, 'max_features': 'sqrt'}


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Instantiate a new ExtraTreesClassifier with the best hyperparameters.
# They are taken from `best_params` (the search result) rather than typed in
# by hand, so this cell cannot drift out of sync when the search is re-run.
best_etc = ExtraTreesClassifier(random_state=1, **best_params)

# Fit the new model on the training data
best_etc.fit(X_train_prep_df, y_train)

# Evaluate the accuracy of the new model on the test set
best_etc_score = best_etc.score(X_test_prep_df, y_test)

# Compare the accuracy of the new model with the initial model
# (`et_score` is the untuned ExtraTreesClassifier's test accuracy).
print("Initial ExtraTreesClassifier Accuracy:", et_score)
print("Optimal ExtraTreesClassifier Accuracy:", best_etc_score)

# Check if the accuracy of the new optimal model is higher or lower than the initial model
if best_etc_score > et_score:
    print("The accuracy of the new optimal model is higher.")
elif best_etc_score < et_score:
    print("The accuracy of the new optimal model is lower.")
else:
    print("The accuracy of the new optimal model is the same as the initial model.")


Initial ExtraTreesClassifier Accuracy: 0.7672107877927609
Optimal ExtraTreesClassifier Accuracy: 0.8041163946061036
The accuracy of the new optimal model is higher.


In [None]:
# Pull the per-feature importance scores out of the tuned ExtraTreesClassifier.
feature_importances = best_etc.feature_importances_

# Pair each score with its column name and rank from most to least important.
feature_importance_df = (
    pd.DataFrame({
        'Feature': X_train_prep_df.columns,
        'Importance': feature_importances,
    })
    .sort_values(by='Importance', ascending=False)
)

# Report the top two rows of the ranking.
top_two_features = feature_importance_df.head(2)
print("Two most important features:")
print(top_two_features)


Two most important features:
                    Feature  Importance
37  Contract_Month-to-month    0.152237
0                    tenure    0.092800
