# Task 1: Telco Customer Churn — Data Cleaning & Leakage Notes
Goal: Clean the dataset and document all decisions.


In [2]:
import pandas as pd
import numpy as np


In [5]:
df = pd.read_csv("Telco-Customer-Churn.csv")


In [6]:
df = pd.read_csv("Telco-Customer-Churn.csv")


In [7]:
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [9]:
df.shape


(7043, 21)

In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


## Initial Inspection
- Loaded the Telco Customer Churn dataset into a pandas data frame.
- Reviewed the first few rows.
- Checked the number of rows and columns.
- Checked the data types using info() and checked for null values.
- Although it seems to show no null values TotalCharges is said to be an object instead of numeric. This may mean some values are empty strings.


In [12]:
df.isnull().sum()



customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [13]:
df["TotalCharges"].head(10)


0      29.85
1     1889.5
2     108.15
3    1840.75
4     151.65
5      820.5
6     1949.4
7      301.9
8    3046.05
9    3487.95
Name: TotalCharges, dtype: object

In [14]:
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")


In [15]:
df.isnull().sum()


customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

# Hidden Missing Values in TotalCharges
Although initial inspection showed no null values, the TotalCharges
column was stored as an object due to empty string entries.
After converting it to a numeric type, these invalid entries were
correctly identified as missing values.


In [16]:
df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["TotalCharges"].fillna(df["TotalCharges"].median(), inplace=True)


In [17]:
df.isnull().sum()



customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

# Missing Values Handling
Missing values in the `TotalCharges` column were assigned using the median.
Median was chosen because it is less sensitive to extreme values and
more stable for such numeric values.


In [18]:
df.duplicated().sum()


np.int64(0)

In [19]:
df = df.drop_duplicates()


# Duplicate Records
The dataset was checked for duplicate rows using `duplicated()`.
No duplicates were found.

In [20]:
df.describe()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7043.0,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692,2281.916928
std,0.368612,24.559481,30.090047,2265.270398
min,0.0,0.0,18.25,18.8
25%,0.0,9.0,35.5,402.225
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.85,3786.6
max,1.0,72.0,118.75,8684.8


# Outlier Check
Basic statistics of numerical columns were reviewed using `describe()`.
Some values appeared larger than the majority of the data, especially in
charge-related columns. These values were not changed since they represent
normal customer behavior and not data errors. IQR method also shows no outliers detected.


In [26]:
numeric_cols = ["tenure", "MonthlyCharges", "TotalCharges"]


In [27]:
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"{col}: {outliers.shape[0]} outliers detected")


tenure: 0 outliers detected
MonthlyCharges: 0 outliers detected
TotalCharges: 0 outliers detected


In [22]:
yes_no_cols = [
    "Partner", "Dependents", "PhoneService",
    "PaperlessBilling", "Churn"
]

for col in yes_no_cols:
    df[col] = df[col].map({"Yes": 1, "No": 0})


In [23]:
df[yes_no_cols].head()


Unnamed: 0,Partner,Dependents,PhoneService,PaperlessBilling,Churn
0,1,0,0,1,0
1,0,0,1,0,0
2,0,0,1,1,1
3,0,0,0,0,0
4,0,0,1,1,1


# Categorical Encoding
Categorical columns containing "Yes" and "No" values were encoded
as 1 and 0 respectively. This makes the data easier to work with in
large models.


# Data Leakage Consideration
This task focuses only on data cleaning and preprocessing, and no model
training is performed. Therefore, data leakage does not directly occur
at this stage.

In [25]:
df.to_csv("Telco_Customer_Churn_Cleaned.csv", index=False)


# Final Output
The cleaned dataset is saved as `Telco_Customer_Churn_Cleaned.csv` for
further analysis.