### Importing libraries

In [141]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### Importing the dataframe

In [142]:
# Location of the raw customer-churn CSV file.
CHURN_CSV_PATH = '/content/IT_customer_churn.csv'

# Read the dataset and preview its first rows.
df = pd.read_csv(CHURN_CSV_PATH)
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [92]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(7043, 20)

In [93]:
# Per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 


All dtypes are correct except for `TotalCharges`, which should be a float; we will fix this in the modeling stage.

In [94]:
# Number of missing (NaN/None) entries in each column.
df.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

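No column contains missing values, so no imputation is needed at this point. Note that `isnull()` only detects true NaN values: because `TotalCharges` is still stored as text, any blank or non-numeric entries in it would not be counted here.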

### Data Analysis

In [96]:
import plotly.express as px

def stacked_plot(df, column, target):
    """
    Draw a stacked bar chart of how `target` is distributed within each
    category of `column`, using Plotly.

    There is one bar per value of `column`, split (and coloured) by the values
    of `target`. Each segment's height is the fraction of that bar's rows that
    take the given `target` value, so every bar sums to 1. The y-axis ticks are
    rendered as percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both `column` and `target`.
    column : str
        Name of the column shown on the x-axis.
    target : str
        Name of the column used to split each bar.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    share_col = 'share'
    # Label the axis after the actual target, so calls with a target other
    # than 'Churn' are not mislabelled ('Churn' still yields 'Churn Percentage').
    y_title = f'{target} Percentage'

    # Row count for every observed (column, target) pair.
    pair_counts = df.groupby([column, target]).size()
    # Total per `column` value, broadcast back onto the same index so the
    # division below needs no cross-index alignment.
    group_totals = pair_counts.groupby(level=0).transform('sum')
    shares = (pair_counts / group_totals).reset_index(name=share_col)

    # Create a stacked bar chart
    fig = px.bar(shares, x=column, y=share_col, color=target, barmode='stack',
                 labels={share_col: y_title})

    # Customize the plot layout
    fig.update_layout(title=f"Stacked Bar Chart: {column} vs. {target}",
                      autosize=False,
                      width=400,
                      height=400,
                      xaxis_title=column,
                      yaxis_title=y_title,
                      # Shares are fractions in [0, 1]; show ticks as percentages.
                      yaxis_tickformat='.0%')
    # Treat x values as discrete labels so integer-coded columns
    # (e.g. 0/1 flags) get one tick per category instead of a numeric axis.
    fig.update_xaxes(type='category')

    # Show the plot
    fig.show()

# Example usage:
# stacked_plot(df, 'category', 'churn')



In [97]:
# Churn split by gender and by senior-citizen flag, one chart per feature.
for feature in ('gender', 'SeniorCitizen'):
    stacked_plot(df, feature, 'Churn')

From the plots above, we can say:

`gender` alone cannot predict customer churn.

`SeniorCitizen` can be an effective factor in predicting customer churn.

In [98]:
# Churn split by partner status and by dependents status.
for feature in ('Partner', 'Dependents'):
    stacked_plot(df, feature, 'Churn')

As you can see:

Unlike `gender`, both `Partner` and `Dependents` are useful for predicting customer churn (on closer inspection, `Dependents` is more informative than `Partner`).



### Tenure

Tenure, in the context of customer churn analysis, refers to the length of time a customer has been using a product, service, or website. It is a crucial metric because it sheds light on customer loyalty and behavior over time. Even without detailed data analysis, it is reasonable to assume that the longer a customer's tenure, the less likely they are to stop using the service.

In this dataframe, the tenure unit is the month.

In [99]:
# Summary statistics for tenure, with the statistic names moved into a column.
df['tenure'].describe().reset_index()

Unnamed: 0,index,tenure
0,count,7043.0
1,mean,32.371149
2,std,24.559481
3,min,0.0
4,25%,9.0
5,50%,29.0
6,75%,55.0
7,max,72.0


In [102]:
# Five most frequent tenure values (value_counts sorts by descending count).
df['tenure'].value_counts().head(5).reset_index()

Unnamed: 0,tenure,count
0,1,613
1,72,362
2,2,238
3,3,200
4,4,176


In [103]:
# Five least frequent tenure values.
df['tenure'].value_counts().tail(5).reset_index()

Unnamed: 0,tenure,count
0,28,57
1,39,56
2,44,51
3,36,50
4,0,11


In [100]:
# Relative frequency of every tenure value.
df['tenure'].value_counts(normalize = True).reset_index()

Unnamed: 0,tenure,proportion
0,1,0.087037
1,72,0.051399
2,2,0.033792
3,3,0.028397
4,4,0.024989
...,...,...
68,28,0.008093
69,39,0.007951
70,44,0.007241
71,36,0.007099


In [110]:
# Number of customers for every (tenure, Churn) combination
churn_counts = (
    df.groupby(["tenure", "Churn"])
    .size()
    .reset_index(name="counts")
)

# Grouped bars: churned and retained customers side by side for each tenure value
fig = px.bar(
    churn_counts,
    x="tenure",
    y="counts",
    color="Churn",
    barmode="group",
    title="Churn Distribution by Tenure",
    labels={"tenure": "Tenure (months)", "counts": "Count"},
)
fig.show()


As the chart clearly shows, tenure has an inverse relationship with churn: as tenure increases, churn decreases.

In [115]:
def tenure(t):
    """
    Convert a tenure in months into a 1-based yearly bucket.

    Buckets are 12 months wide with inclusive upper bounds: anything up to 12
    (including 0) maps to 1, 13-24 to 2, and so on up to 61-72, which maps
    to 6. A value above 72, or one that compares False against every bound
    (such as NaN), yields None.
    """
    # Walk the upper bounds 12, 24, ..., 72 and return the first bucket that fits.
    for bucket, upper_bound in enumerate(range(12, 73, 12), start=1):
        if t <= upper_bound:
            return bucket
    return None

# Bucket each customer's tenure (months) into a yearly band.
df['tenure_yearly'] = df['tenure'].apply(tenure)

In [124]:
# 'tenure_categorical' is not created by any cell shown in this notebook, so an
# unconditional drop raises KeyError on a fresh Restart & Run All.
# errors='ignore' removes the column only if it exists, and reassigning instead
# of using inplace=True keeps the cell safe to re-run.
df = df.drop(columns=['tenure_categorical'], errors='ignore')
df.head()


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,...,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn,tenure_yearly
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,...,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No,1
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,...,No,No,No,One year,No,Mailed check,56.95,1889.5,No,3
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,...,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,...,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No,4
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,...,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1


In [118]:
# Number of customers for every (tenure_yearly, Churn) combination
churn_counts = (
    df.groupby(["tenure_yearly", "Churn"])
    .size()
    .reset_index(name="counts")
)

# Grouped bars: churned and retained customers side by side for each yearly band
fig = px.bar(
    churn_counts,
    x="tenure_yearly",
    y="counts",
    color="Churn",
    barmode="group",
    title="Churn Distribution by tenure_yearly",
    labels={"tenure_yearly": "tenure(yearly)", "counts": "Count"},
)
fig.show()

In [126]:
# Column labels currently present in the frame.
df.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges', 'Churn', 'tenure_yearly'],
      dtype='object')

In [127]:
# Churn split by phone service and by multiple-lines status.
for feature in ('PhoneService', 'MultipleLines'):
    stacked_plot(df, feature, 'Churn')


As we can see:

The two factors `PhoneService` and `MultipleLines` have very little impact on churn.

In [129]:
# Churn split by online-security and online-backup subscription.
for feature in ('OnlineSecurity', 'OnlineBackup'):
    stacked_plot(df, feature, 'Churn')

In [130]:
# Churn split by the two streaming subscriptions.
for feature in ('StreamingTV', 'StreamingMovies'):
    stacked_plot(df, feature, 'Churn')

The four factors of StreamingTV, StreamingMovies, OnlineSecurity and OnlineBackup have the same effect on churn

Entertainment Services (StreamingTV and StreamingMovies):

These features relate to the availability of streaming content (TV shows, movies, etc.).

Effect on Churn:

Positive: Offering high-quality streaming services can enhance customer satisfaction and retention.

Negative: If the streaming experience is poor (e.g., buffering, limited content), customers may churn.


Security Services (OnlineSecurity and OnlineBackup):

These features focus on protecting customers’ data and digital assets.
Effect on Churn:

Positive: Robust security features can instill trust and reduce churn.

Negative: Lack of security (e.g., vulnerability to cyber threats) may lead to churn.

In [128]:
# Churn split by type of internet service.
stacked_plot(df, column='InternetService', target='Churn')

Fiber optic customers have the highest churn rate, likely due to cost and service expectations.
DSL customers show moderate churn, possibly influenced by affordability and competition.
No internet service customers have the lowest churn, but there’s still room for potential growth.

Note: customers without internet service might not churn simply because they have no internet subscription to cancel.

In [131]:
# Churn split by device-protection and tech-support subscription.
for feature in ('DeviceProtection', 'TechSupport'):
    stacked_plot(df, feature, 'Churn')

In [132]:
# Churn split by paperless-billing flag.
stacked_plot(df, column='PaperlessBilling', target='Churn')

Therefore, `PaperlessBilling` has an important relationship with customer churn.


In [133]:
# Churn split by contract type.
stacked_plot(df, column='Contract', target='Churn')

From the diagram we can see:

The longer the contract, the lower the customer churn rate; contract length is an important factor in churn.

In [134]:
# Churn split by payment method.
stacked_plot(df, column='PaymentMethod', target='Churn')

In [135]:
# Summary statistics for the monthly charge.
df['MonthlyCharges'].describe()

count    7043.000000
mean       64.761692
std        30.090047
min        18.250000
25%        35.500000
50%        70.350000
75%        89.850000
max       118.750000
Name: MonthlyCharges, dtype: float64

In [138]:
# Ten most frequent monthly-charge amounts.
df['MonthlyCharges'].value_counts().head(10).reset_index()

Unnamed: 0,MonthlyCharges,count
0,20.05,61
1,19.85,45
2,19.95,44
3,19.9,44
4,20.0,43
5,19.7,43
6,19.65,43
7,19.55,40
8,20.15,40
9,19.75,39


In [139]:
# Payment-method mix within each contract type.
stacked_plot(df, column='Contract', target='PaymentMethod')

In [144]:
# Internet-service mix within each contract type.
stacked_plot(df, column='Contract', target='InternetService')

The longer the contract, the lower the share of fiber optic customers.

In [145]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes
