In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load your data
data = pd.read_csv('/content/WA_Fn-UseC_-Telco-Customer-Churn.csv')
# Data Preprocessing
# Convert 'TotalCharges' column to numeric values and fill missing values with 0
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce').fillna(0)

# Convert the 'Churn' column to binary values
data['Churn'] = (data['Churn'] == 'Yes').astype(int)

# Split the data into an 80-20 train-test split with a random state of 1
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Define feature lists
categorical_features = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                        'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']

numerical_features = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Create transformers for preprocessing
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(sparse=False))
])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Use ColumnTransformer to apply transformers to specific feature groups
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Combine numerical and one-hot encoded categorical features into train and test dataframes
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(random_state=1))])

# Fit the classifier with the training data
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"RandomForestClassifier Accuracy: {accuracy:.2f}")




RandomForestClassifier Accuracy: 0.79


In [2]:
from xgboost import XGBClassifier

# Create and fit the XGBoost classifier
xgb_clf = Pipeline(steps=[('preprocessor', preprocessor),
                         ('classifier', XGBClassifier(random_state=1))])

xgb_clf.fit(X_train, y_train)

# Make predictions using XGBoost classifier
xgb_y_pred = xgb_clf.predict(X_test)

# Calculate accuracy
xgb_accuracy = accuracy_score(y_test, xgb_y_pred)
print(f"XGBoost Classifier Accuracy: {xgb_accuracy:.2f}")




XGBoost Classifier Accuracy: 0.79


In [3]:
from lightgbm import LGBMClassifier

# Create and fit the LightGBM classifier
lgbm_clf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', LGBMClassifier(random_state=1))])

lgbm_clf.fit(X_train, y_train)

# Make predictions using LightGBM classifier
lgbm_y_pred = lgbm_clf.predict(X_test)

# Calculate accuracy
lgbm_accuracy = accuracy_score(y_test, lgbm_y_pred)
print(f"LightGBM Classifier Accuracy: {lgbm_accuracy:.2f}")




[LightGBM] [Info] Number of positive: 1521, number of negative: 4113
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001039 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 670
[LightGBM] [Info] Number of data points in the train set: 5634, number of used features: 46
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.269968 -> initscore=-0.994785
[LightGBM] [Info] Start training from score -0.994785
LightGBM Classifier Accuracy: 0.80


In [5]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score

# Define the hyperparameters obtained from RandomizedSearchCV
new_hyperparameters = {
    'n_estimators': 500,  # Replace with the optimal value you obtained
    'max_depth': 20,     # Replace with the optimal value you obtained
    # Include other hyperparameters with their optimal values
}

# Initialize and train the new ExtraTreesClassifier model with the optimized hyperparameters
new_et_model = ExtraTreesClassifier(random_state=1, **new_hyperparameters)
new_et_model.fit(X_train, y_train)

# Make predictions using the new model
new_et_y_pred = new_et_model.predict(X_test)

# Calculate accuracy for the new model
new_et_accuracy = accuracy_score(y_test, new_et_y_pred)

# Calculate accuracy for the initial ExtraTreesClassifier model (assuming you've trained it earlier)
initial_et_y_pred = initial_et_model.predict(X_test)
initial_et_accuracy = accuracy_score(y_test, initial_et_y_pred)

# Compare the accuracies
if new_et_accuracy > initial_et_accuracy:
    print("The accuracy of the new optimal model is higher than the initial model.")
elif new_et_accuracy < initial_et_accuracy:
    print("The accuracy of the new optimal model is lower than the initial model.")
else:
    print("The accuracy of the new optimal model is the same as the initial model.")


ValueError: ignored