# Data Loading and preprocessing

# Handling missing Values & Data Cleaning

# Descriptive Statistics and Summary

# Visualization (Univariate, biviriate, multivariate)

# Insight on Chrun Behavior

> Add blockquote



In [1]:
# load important libraries for EDA Project

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# lets load the data into  df variable

In [5]:
df = pd.read_csv("/content/WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [6]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [7]:
# As we see , the data is huge . so all columns no showing
# So lets check the total columns name

In [8]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [9]:
# lets check the shape of data

In [10]:
df.shape

(7043, 21)

In [11]:
# The data set contains 7043 rows indicate custmores with 21 features
# Each row represent a unique customer with various attributes
# related to demographic, services , and billing
# the dataset size is sufficient for meanigful analysis and prediction
# With 21 features , we have a good mix of categorical and numerical
# to explore custmmer behavior and churn pattern

In [13]:
# lets check data type of each feature

In [14]:
df.dtypes

Unnamed: 0,0
customerID,object
gender,object
SeniorCitizen,int64
Partner,object
Dependents,object
tenure,int64
PhoneService,object
MultipleLines,object
InternetService,object
OnlineSecurity,object


In [17]:
# most of the columns are categorical (object type)
# Seniorcitizen is stored as int64 , but since its binary variable
# 0 & 1 it can be treated as categorical
# Total charges is stores as an objects , which seems incorrect
# since it represent a numerical values
# It may cantain missing values or be formetted incorrect
# We need to convert it to float for proper analysis

In [20]:
df['TotalCharges'] = pd.to_numeric(['TotalCharges'], errors='coerce')

ValueError: Length of values (1) does not match length of index (7043)

In [21]:
# Cehck the conversion

In [22]:
df['TotalCharges'].dtype

dtype('O')

In [23]:
#check for missing values in each columns

In [26]:
df.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [27]:
# Only one columns Total Charges has missing values
# 11 missing values
# since Total charges is numerical , these missing values were
# likely empty spaces in original dataset before conversion
# posible ways to handle missing values :-
# 1-Fill them with median or mean of TotalCharges
# fill them with zero , but total charges not be zero
# Drop these 11 rows, but if we do 11 custmors also drop
# from 7043 . we may loss data
# So better to fill the missing values
# fill them with median because meadian is good if we any may contain
# outliers

In [30]:
df['TotalCharges'].median()

TypeError: Cannot convert ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5'] to numeric

In [31]:
# so lets check the missing values agian

In [32]:
df ['TotalCharges'].isnull().sum()

np.int64(0)

In [33]:
# lets check for the duplicate records

In [35]:
df.duplicated().sum()

np.int64(0)