In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

In [2]:
data = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn (1).csv')
data.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
print("\nMissing values:")
print(data.isnull().sum())


Missing values:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [15]:
data['TotalCharges'] = data['TotalCharges'].replace(' ', np.nan)
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
data['TotalCharges'].fillna(data['TotalCharges'].mean(), inplace=True)

for column in data.select_dtypes(include=['object']):
    if column != 'customerID':
        data[column] = LabelEncoder().fit_transform(data[column])

X = data.drop(['customerID', 'Churn'], axis=1)
y = data['Churn']

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [50]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)


y_pred = logreg.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')


rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)


accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

print(f'Random Forest - Accuracy: {accuracy_rf:.2f}')
print(f'Random Forest - Precision: {precision_rf:.2f}')
print(f'Random Forest - Recall: {recall_rf:.2f}')
print(f'Random Forest - F1 Score: {f1_rf:.2f}')


num_features_to_select = 5  
rfe = RFE(estimator=rf_classifier, n_features_to_select=num_features_to_select)
rfe.fit(X_train, y_train)


selected_features = X_train.columns[rfe.support_]
print(f'Selected Features: {selected_features}')




Accuracy: 0.80
Precision: 0.65
Recall: 0.54
F1 Score: 0.59
Random Forest - Accuracy: 0.79
Random Forest - Precision: 0.63
Random Forest - Recall: 0.49
Random Forest - F1 Score: 0.55


AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [42]:
param_grid = {
    'n_estimators': [5, 10, 50, 100, 200],
    'max_depth': [None, 5, 10 ,20, 30]
}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_


In [44]:
rfe = RFE(RandomForestClassifier(**best_params), n_features_to_select=10)
X_train_rfe = rfe.fit_transform(X_train, y_train)
X_test_rfe = rfe.transform(X_test)

rf_optimized = RandomForestClassifier(**best_params)
rf_optimized.fit(X_train_rfe, y_train)
rf_optimized_preds = rf_optimized.predict(X_test_rfe)

rf_optimized_accuracy = accuracy_score(y_test, rf_optimized_preds)
rf_optimized_precision = precision_score(y_test, rf_optimized_preds)
rf_optimized_recall = recall_score(y_test, rf_optimized_preds)
rf_optimized_f1 = f1_score(y_test, rf_optimized_preds)

print(f"Optimized Random Forest - Accuracy: {rf_optimized_accuracy}, Precision: {rf_optimized_precision}, Recall: {rf_optimized_recall}, F1: {rf_optimized_f1}")


Optimized Random Forest - Accuracy: 0.7881516587677725, Precision: 0.6266666666666667, Recall: 0.5026737967914439, F1: 0.5578635014836796


In [45]:
feature_importances = rf_optimized.feature_importances_

important_features = pd.Series(feature_importances, index=X.columns[rfe.support_])

important_features = important_features.sort_values(ascending=False)

print("\nImportant Features:")
print(important_features)


Important Features:
MonthlyCharges     0.214088
TotalCharges       0.189448
tenure             0.177838
Contract           0.162182
OnlineSecurity     0.065510
TechSupport        0.057757
PaymentMethod      0.048357
InternetService    0.035057
OnlineBackup       0.026355
MultipleLines      0.023409
dtype: float64


In [46]:
print("\nBased on the evaluation metrics, the Optimized Random Forest model is the best-performing model.")
print("The top features contributing to customer churn prediction are:")
print(important_features.head(5))


Based on the evaluation metrics, the Optimized Random Forest model is the best-performing model.
The top features contributing to customer churn prediction are:
MonthlyCharges    0.214088
TotalCharges      0.189448
tenure            0.177838
Contract          0.162182
OnlineSecurity    0.065510
dtype: float64
