<a href="https://colab.research.google.com/github/nguyenhson03/Telco-Customer-churn/blob/main/Telco_Customer_Churn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Download the Telco churn dataset and keep the value returned by kagglehub.
# NOTE(review): presumably this is the local directory holding the downloaded
# files - confirm against the kagglehub documentation.
blastchar_telco_customer_churn_path = kagglehub.dataset_download('blastchar/telco-customer-churn')

# Signal that the download step finished without raising.
print('Data source import complete.')


<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Description of columns 📞:</h1>
</div>


* customerID: Unique customer identifier.

* gender: Customer's gender.

* SeniorCitizen: Indicates senior citizen status.

* Partner: Indicates if customer has a partner.

* Dependents: Indicates if customer has dependents.

* tenure: Number of months with the company.

* PhoneService: Indicates phone service.

* MultipleLines: Indicates multiple lines.

* InternetService: Type of internet service.

* OnlineSecurity: Online security service.

* OnlineBackup: Online backup service.

* DeviceProtection: Device protection service.

* TechSupport: Tech support service.

* StreamingTV: Streaming TV service.

* StreamingMovies: Streaming movies service.

* Contract: Contract type.

* PaperlessBilling: Paperless billing.

* PaymentMethod: Payment method.

* MonthlyCharges: Monthly charges.

* TotalCharges: Total charges.

* Churn: Indicates if customer churned.


<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Import some important libraries:</h1>
</div>


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Remove warnings:</h1>
</div>


In [None]:
import warnings
warnings.simplefilter("ignore")

<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Display all columns and rows in the output :</h1>
</div>


In [None]:
# None removes the truncation limit, so every column and every row is rendered.
pd.options.display.max_columns = None
pd.options.display.max_rows = None


<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Load our data :</h1>
</div>


In [None]:
# Locate the CSV instead of assuming Kaggle's mounted input directory:
# prefer the directory returned by kagglehub (Colab / local runs) and fall
# back to /kaggle/input when the notebook runs on Kaggle itself.
from pathlib import Path

csv_name = 'WA_Fn-UseC_-Telco-Customer-Churn.csv'
candidate_dirs = [Path('/kaggle/input/telco-customer-churn')]
if 'blastchar_telco_customer_churn_path' in globals():
    # The download cell was executed, so try its location first.
    candidate_dirs.insert(0, Path(blastchar_telco_customer_churn_path))

csv_path = next(
    (folder / csv_name for folder in candidate_dirs if (folder / csv_name).is_file()),
    None,
)
if csv_path is None:
    # Fail with an actionable message rather than a bare pandas error.
    raise FileNotFoundError(
        f"{csv_name} not found in any of: {[str(folder) for folder in candidate_dirs]}"
    )

df = pd.read_csv(csv_path)

<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Let's get some information about the dataset:</h1>
</div>


In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# customerID is only a unique identifier and carries no predictive signal,
# so it is removed before analysis.
# errors='ignore' plus reassignment keeps this cell idempotent: re-running it
# after the column is already gone is a no-op instead of a KeyError.
df = df.drop(columns=['customerID'], errors='ignore')


In [None]:
# Show the number of rows and columns.
df.shape

In [None]:
# Show each column's dtype and its count of non-null values.
df.info()

* After inspecting the data types of the features, some columns need their type modified.

In [None]:
# Parse 'TotalCharges' as a number; errors='coerce' turns blanks and any other
# non-numeric text into NaN instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df = df.assign(TotalCharges=total_charges_numeric)


In [None]:
# Columns that hold a small set of labels and are therefore stored as categories
categorical_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn'
]

# Cast all of them in a single call via a column -> dtype mapping
df = df.astype({name: 'category' for name in categorical_cols})

In [None]:
# Re-inspect dtypes and non-null counts after the type conversions.
df.info()

In [None]:
# Count rows that are exact copies of an earlier row across all columns.
df.duplicated().sum()

* There are duplicate rows, so I will drop them.

In [None]:
# Keep only the first occurrence of each fully identical row.
# NOTE(review): customerID was removed earlier, so identical rows may be
# distinct customers with the same attributes - confirm dropping is intended.
df = df.drop_duplicates()


In [None]:
# Missing values per column (largest first), as a count and as a percentage of rows
nans = df.isna().sum().sort_values(ascending=False)
pct = 100 * nans / len(df)
nan_stats = pd.DataFrame({'num_of_nans': nans, 'percentage_of_nans': pct})
nan_stats


* The TotalCharges column has 11 NaNs; this is a small number, so I will drop those rows.

In [None]:
# Discard the rows whose TotalCharges could not be parsed as a number.
df = df.dropna(subset=['TotalCharges'])


In [None]:
# Summary statistics of the data
df.describe()

In [None]:
# Summary of the category-typed columns only
df.describe(include="category")

<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>EDA☎️:</h1>
</div>


In [None]:
# Pastel colour palette shared by the charts below (blue, pink, green, purple)
palette = ['#ADD8E6', '#FFB6C1', '#BDFCC9', '#DAC4F7']

# Named handles for the individual colours
light_blue_color, light_pink_color, light_green_color, light_purple_color = palette


In [None]:
# Distribution of Churn: class shares as a pie (left) and raw counts as bars (right)
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
churn_colors = [light_blue_color, light_pink_color]

status_counts = df['Churn'].value_counts()
axes[0].pie(status_counts, labels=status_counts.index, autopct='%1.1f%%', startangle=90, colors=churn_colors)
axes[0].set_title('Distribution of Churn')

# seaborn has deprecated passing `palette` without `hue`, so the hue is mapped
# to the same column as x. dodge=False keeps one full-width bar per class, and
# the legend (which would only repeat the x tick labels) is removed if present.
sns.countplot(x='Churn', hue='Churn', data=df, palette=churn_colors, dodge=False, ax=axes[1])
churn_legend = axes[1].get_legend()
if churn_legend is not None:
    churn_legend.remove()
axes[1].set_title('Count Plot of Churn')

plt.tight_layout()
plt.show()

* Based on the visualization of churn distribution, we observe that the percentage of customers who churned is approximately 26.5%, while the percentage of customers who did not churn is approximately 73.5%.

* The data is imbalanced, so I will address this during modeling (by oversampling with SMOTE).


In [None]:
# Divide the categorical columns into themed lists to make plotting easier
customer_info = ["gender", "SeniorCitizen", "Partner", "Dependents"]

services = [
    "PhoneService",
    "MultipleLines",
    "InternetService",
    "OnlineSecurity",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "StreamingTV",
    "StreamingMovies",
]

billing_info = ["Contract", "PaperlessBilling", "PaymentMethod"]

In [None]:
# One pie chart per customer-info column, laid out in a single row
fig = plt.figure(figsize=(15, 5))
for position, col in enumerate(customer_info, start=1):
    ax = fig.add_subplot(1, 4, position)
    shares = df[col].value_counts(normalize=True)  # fraction of rows per category
    ax.pie(shares, labels=shares.index, autopct='%1.1f%%', startangle=90, colors=palette)
    ax.set_title(f'Customer Info: {col}')
    ax.axis('equal')  # keep the pie circular
fig.tight_layout()
plt.show()

In [None]:
# One pie chart per service column, placed slot by slot on a 2 x 5 grid
fig = plt.figure(figsize=(15, 5))
for position, col in enumerate(services, start=1):
    ax = fig.add_subplot(2, 5, position)
    shares = df[col].value_counts(normalize=True)  # fraction of rows per category
    ax.pie(shares, labels=shares.index, autopct='%1.1f%%', startangle=90, colors=palette)
    ax.set_title(f'Services: {col}')
    ax.axis('equal')  # keep the pie circular
fig.tight_layout()
plt.show()

In [None]:
# Plot billing info columns.
# billing_info contains only categorical columns, so every entry is drawn as a
# pie chart; the former histogram branch for numeric columns could never run
# and has been removed. The grid is sized to the number of columns instead of
# a fixed 2 x 4 layout that left most slots empty.
fig, axes = plt.subplots(1, len(billing_info), figsize=(15, 5), squeeze=False)
for ax, col in zip(axes.ravel(), billing_info):
    counts = df[col].value_counts(normalize=True)  # fraction of rows per category
    ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90, colors=palette)
    ax.set_title(f'Billing Info: {col}')
    ax.axis('equal')  # keep the pie circular
plt.tight_layout()
plt.show()

In [None]:
# Numeric columns to inspect: one row per column, box plot left, KDE right
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

num_rows = len(numeric_cols)
num_cols = 2

# squeeze=False always yields a 2-D axes array, even for a single row
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 5*num_rows), squeeze=False)

for (box_ax, kde_ax), column in zip(axes, numeric_cols):
    # Box plot
    sns.boxplot(x=df[column], ax=box_ax, color=light_blue_color)
    box_ax.set_title(f'Boxplot of {column}')
    box_ax.set_xlabel(column)

    # KDE plot
    sns.kdeplot(data=df[column], ax=kde_ax, color=light_pink_color, fill=True)
    kde_ax.set_title(f'KDE Plot of {column}')
    kde_ax.set_xlabel(column)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise relationships between the plotted columns, coloured by churn status
sns.pairplot(data=df, hue='Churn', palette='husl')
plt.show()


<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Preprocessing:</h1>
</div>

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
# Encode the target as plain integers: 'No' -> 0, 'Yes' -> 1.
# 'Churn' was cast to a categorical earlier; Series.replace on a categorical
# is deprecated in recent pandas and leaves the column as a category dtype.
# Going through strings and an explicit mapping yields a real integer column.
# The '0'/'1' entries make the mapping tolerant of an already-encoded
# categorical column, and the dtype guard makes a re-run a no-op.
churn_codes = {'No': 0, 'Yes': 1, '0': 0, '1': 1}
if not pd.api.types.is_integer_dtype(df['Churn']):
    # An unexpected label maps to NaN, so astype(int) fails loudly instead of
    # silently mis-encoding the target.
    df['Churn'] = df['Churn'].astype(str).map(churn_codes).astype(int)


In [None]:
# Feature names routed to the numeric and the categorical preprocessing branches
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_features = [
    # customer profile
    'gender', 'SeniorCitizen', 'Partner', 'Dependents',
    # subscribed services
    'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
    'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
    'StreamingMovies',
    # contract and billing
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

In [None]:
# Numeric branch: fill gaps with the median, then standardise
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill gaps with the most frequent label, then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Route each group of columns to its own branch
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

<div style="background-color:lightblue; padding:20px; text-align:center; font-weight: bold;">
    <h1>Modeling:</h1>
</div>

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score

In [None]:
# Define classifiers/models.
# The tree ensembles are stochastic; fixing random_state (the same seed the
# train/test split uses) makes the search and the reported scores
# reproducible across runs.
classifiers = [
    ('logreg', LogisticRegression(max_iter=1000)),
    ('rf', RandomForestClassifier(random_state=42)),
    ('gbc', GradientBoostingClassifier(random_state=42))
]

In [None]:
# Soft-voting ensemble over the classifiers defined above
# (soft voting combines the predicted class probabilities of the members)
voting_clf = VotingClassifier(classifiers, voting='soft')


In [None]:
# Target (y) is the churn flag; features (X) are every remaining column
y = df['Churn']
X = df.drop(columns='Churn')


In [None]:
# Split data into train (80%) and test (20%) sets.
# The target is imbalanced, so stratify on y to keep the same churn ratio in
# both partitions; otherwise the test-set class mix depends on the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Pipeline with SMOTE: preprocess -> oversample the minority class -> classify.
# The imblearn pipeline applies the sampler only while fitting, so
# cross-validation and test data are never resampled.
# SMOTE draws synthetic samples at random; a fixed random_state keeps the
# resampled training data, and therefore the results, reproducible.
pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('voting_clf', voting_clf)
])


In [None]:
# Hyper-parameter grid; keys follow <pipeline step>__<ensemble member>__<parameter>
param_grid = dict(
    voting_clf__logreg__C=[0.1, 1.0, 10],
    voting_clf__rf__n_estimators=[50, 100, 200],
    voting_clf__gbc__learning_rate=[0.01, 0.1, 0.2],
)


In [None]:
# Exhaustive 5-fold cross-validated search over param_grid, ranked by ROC AUC
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Keep the refitted winning pipeline and report the parameters that produced it
best_estimator = grid_search.best_estimator_
print("Best parameters found: ", grid_search.best_params_)

In [None]:

# Predict class labels for the held-out test set with the best pipeline
y_pred = best_estimator.predict(X_test)


In [None]:
# Evaluation
# ROC AUC measures how well the model ranks positives above negatives, so it
# must be computed from the predicted probability of the positive class.
# Feeding it hard 0/1 labels collapses the curve to a single operating point
# and misreports the score. Column 1 of predict_proba is the probability of
# class 1 (churn) because the classes are ordered [0, 1].
y_proba = best_estimator.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_proba)

# F1 and accuracy are threshold metrics and correctly use the hard labels.
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print(f"ROC AUC: {roc_auc:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy: {accuracy:.2f}")