<a id="setup"></a>
# <p style="background-color: blue; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Cleaning & Transformation</p>

<a id="libraries"></a>
# <b><span style='color:lightblue'> Importing Necessary Libraries</span></b>

In [99]:
import pandas as pd
import numpy as np
import os
import sys

import seaborn as sns
import matplotlib.pyplot as plt

<a id="load_dataset"></a>
# <b><span style='color:lightblue'> Load Dataset</span></b>

In [100]:
# Load the raw data via get_dataframe from src/load_data.py.
# The project root (the parent of the notebook's working directory) must be
# importable for `src` to resolve. It is added as an absolute path, and only
# once, so re-running this cell does not keep growing sys.path and a later
# change of working directory cannot break the import.

project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from src.load_data import get_dataframe 

df = get_dataframe()

df.head()

Looking for file at: /Users/maxsmith/2025_data_science_projects/telco_customer_churn/data/raw_original_data.csv


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


<a id="naming_conventions"></a>
# <p style="background-color: #4893D7; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Step 3 | Handling Naming Conventions</p>

In [101]:
# 'No phone service' / 'No internet service' carry no information beyond what other features
# already hold, so they will be changed to 'No'. First, locate the columns that contain them.

search_values = ['No internet service', 'No phone service']

# Keep every column in which at least one cell (compared as a string) equals a search value
matching_columns = [
    col
    for col in df.columns
    if df[col].astype(str).isin(search_values).any()
]

print("Columns containing service flags:", matching_columns)

Columns containing service flags: ['MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']


In [102]:
# Creating function

def replace_service_values(df, columns, values_to_replace=None, replacement_value='No'):
    """Replace service-flag values in the given columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place and also returned.
    columns : iterable of str
        Names of the columns in which values are replaced.
    values_to_replace : list, optional
        Values to look for. When omitted, defaults to
        ``['No internet service', 'No phone service']``.
    replacement_value : object, default 'No'
        Value written in place of every match.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    targets = (
        values_to_replace
        if values_to_replace is not None
        else ['No internet service', 'No phone service']
    )

    for name in columns:
        cleaned = df[name].replace(targets, replacement_value)
        df.loc[:, name] = cleaned

    return df


In [103]:
# Change the service flags to 'No' in every column that contains them

matching_columns = [
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
    'StreamingTV', 'StreamingMovies', 'MultipleLines',
]

df = replace_service_values(df, columns=matching_columns)


In [104]:
# Check the flags have been replaced by repeating the earlier column scan; the list should now be empty
search_values = ['No internet service', 'No phone service']

# Keep every column in which at least one cell (compared as a string) equals a search value
matching_columns = [
    col
    for col in df.columns
    if df[col].astype(str).isin(search_values).any()
]

print("Columns containing service flags:", matching_columns)

Columns containing service flags: []


<a id="missing_values"></a>
# <p style="background-color: #4893D7; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Step 4 | Handling Missing Values</p>

In [110]:
# Need to investigate the TotalCharges column, which we found to be a) non-numeric, and b) to include 11 null-like missing values

custom_null_values = [
    '', ' ', '   ', 'NA', 'na', 'Na', 'NaN', 'nan', 'Nan',
    'null', 'NULL', 'Null', 'None', 'none'
]

# Compare every cell as a string. Surrounding whitespace is stripped first so that
# blanks of any width (two spaces, tabs, ...) and padded tokens such as ' NA '
# are caught as well, not only the exact spellings listed above.
mask = df.astype(str).apply(lambda column: column.str.strip()).isin(custom_null_values)

# Keep the rows in which at least one column holds a null-like value
rows_with_custom_nulls = df[mask.any(axis=1)]
rows_with_custom_nulls



Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No,...,No,No,No,No,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No,...,No,No,No,No,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No,...,No,No,No,No,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No,...,No,No,No,No,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No,...,No,No,No,No,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No,...,No,No,No,No,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [111]:
# Create function to convert to numeric column 

def convert_column_to_numeric(df, column_name):
    """Convert one column of ``df`` to a numeric dtype.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. It is modified in place and also returned.
    column_name : str
        Name of the column to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numeric_values = pd.to_numeric(df[column_name], errors='coerce')
    df[column_name] = numeric_values
    return df


In [112]:
# Convert TotalCharges to numeric; bind the returned frame explicitly, then display it

df = convert_column_to_numeric(df, column_name='TotalCharges')
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.50,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.50,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.90,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.60,Yes


In [113]:
# Confirm the conversion has worked: inspect the dtype and non-null count of TotalCharges

df['TotalCharges'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 7043 entries, 0 to 7042
Series name: TotalCharges
Non-Null Count  Dtype  
--------------  -----  
7032 non-null   float64
dtypes: float64(1)
memory usage: 55.1 KB


In [114]:
# Pull out the rows in which TotalCharges is null

empty_rows = df.loc[df['TotalCharges'].isna()]
empty_rows


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
488,4472-LVYGI,Female,0,Yes,Yes,0,No,No,DSL,Yes,...,Yes,Yes,Yes,No,Two year,Yes,Bank transfer (automatic),52.55,,No
753,3115-CZMZD,Male,0,No,Yes,0,Yes,No,No,No,...,No,No,No,No,Two year,No,Mailed check,20.25,,No
936,5709-LVOEQ,Female,0,Yes,Yes,0,Yes,No,DSL,Yes,...,Yes,No,Yes,Yes,Two year,No,Mailed check,80.85,,No
1082,4367-NUYAO,Male,0,Yes,Yes,0,Yes,Yes,No,No,...,No,No,No,No,Two year,No,Mailed check,25.75,,No
1340,1371-DWPAZ,Female,0,Yes,Yes,0,No,No,DSL,Yes,...,Yes,Yes,Yes,No,Two year,No,Credit card (automatic),56.05,,No
3331,7644-OMVMY,Male,0,Yes,Yes,0,Yes,No,No,No,...,No,No,No,No,Two year,No,Mailed check,19.85,,No
3826,3213-VVOLG,Male,0,Yes,Yes,0,Yes,Yes,No,No,...,No,No,No,No,Two year,No,Mailed check,25.35,,No
4380,2520-SGTTA,Female,0,Yes,Yes,0,Yes,No,No,No,...,No,No,No,No,Two year,No,Mailed check,20.0,,No
5218,2923-ARZLG,Male,0,Yes,Yes,0,Yes,No,No,No,...,No,No,No,No,One year,Yes,Mailed check,19.7,,No
6670,4075-WKNIU,Female,0,Yes,Yes,0,Yes,Yes,DSL,No,...,Yes,Yes,Yes,No,Two year,No,Mailed check,73.35,,No


In [115]:
# Create function to remove missing values

def remove_missing_values(df, column_name):
    """Return the rows of ``df`` whose ``column_name`` value is not missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    column_name : str
        Column checked for missing (NaN/None) values.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows where ``column_name`` is present.
        The original index labels are kept.
    """
    present = df[column_name].notna()
    # Return an explicit copy rather than a filtered slice, so that assigning
    # columns on the result later does not raise SettingWithCopyWarning.
    return df[present].copy()

In [116]:
# Drop the rows in which TotalCharges is null
df = remove_missing_values(df, column_name='TotalCharges')

In [120]:
# Confirm there are still no true missing values anywhere in the frame:
# the per-column null counts are summed into a single grand total

df.isna().sum().sum()

0

In [118]:
# Confirm there are still no null-like values

custom_null_values = [
    '', ' ', '   ', 'NA', 'na', 'Na', 'NaN', 'nan', 'Nan',
    'null', 'NULL', 'Null', 'None', 'none'
]

# Compare every cell as a string. Surrounding whitespace is stripped first so that
# blanks of any width (two spaces, tabs, ...) and padded tokens such as ' NA '
# are caught as well, not only the exact spellings listed above.
mask = df.astype(str).apply(lambda column: column.str.strip()).isin(custom_null_values)

# Keep the rows in which at least one column holds a null-like value (expected: none)
rows_with_custom_nulls = df[mask.any(axis=1)]
rows_with_custom_nulls

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


<a id="duplicates"></a>
# <p style="background-color: #4893D7; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Step 5 | Handling Duplicates</p>

In [121]:
# Collect every fully duplicated row (all occurrences are kept), ordered by customerID
duplicate_rows = df.loc[df.duplicated(keep=False)]

duplicate_rows_sorted = duplicate_rows.sort_values('customerID')

duplicate_rows_sorted.head(10)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn


In [122]:
# There are no fully duplicated rows (unusual for real-world data, again), so no action is needed here

In [123]:
# Report, column by column, how many values repeat an earlier value in that column
for col in df.columns:
    num_duplicates = df[col].duplicated().sum()
    message = (
        f"Column '{col}' has {num_duplicates} duplicate values."
        if num_duplicates > 0
        else f"Column '{col}' has all unique values."
    )
    print(message)


Column 'customerID' has all unique values.
Column 'gender' has 7030 duplicate values.
Column 'SeniorCitizen' has 7030 duplicate values.
Column 'Partner' has 7030 duplicate values.
Column 'Dependents' has 7030 duplicate values.
Column 'tenure' has 6960 duplicate values.
Column 'PhoneService' has 7030 duplicate values.
Column 'MultipleLines' has 7030 duplicate values.
Column 'InternetService' has 7029 duplicate values.
Column 'OnlineSecurity' has 7030 duplicate values.
Column 'OnlineBackup' has 7030 duplicate values.
Column 'DeviceProtection' has 7030 duplicate values.
Column 'TechSupport' has 7030 duplicate values.
Column 'StreamingTV' has 7030 duplicate values.
Column 'StreamingMovies' has 7030 duplicate values.
Column 'Contract' has 7029 duplicate values.
Column 'PaperlessBilling' has 7030 duplicate values.
Column 'PaymentMethod' has 7028 duplicate values.
Column 'MonthlyCharges' has 5448 duplicate values.
Column 'TotalCharges' has 502 duplicate values.
Column 'Churn' has 7030 duplica

In [124]:
# Importantly, customerID has no duplicate values; repeated values in all other columns are expected

<a id="numerical_values"></a>
# <p style="background-color: #4893D7; font-family:calibri; color:white; font-size:140%; font-family:Verdana; text-align:center; border-radius:15px 50px;">Step 6 | Handling Numerical Values</p>

In [140]:
# As identified in EDA, some tenure values are 0, so they need to be investigated. The other numerical columns are checked alongside.

numeric_columns = df.select_dtypes(include=['number']).columns.tolist()

# Print the smallest and largest value of each numeric column
for col in numeric_columns:
    column_values = df[col]
    print(f"{col} min: {column_values.min()}")
    print(f"{col} max: {column_values.max()}")
    

SeniorCitizen min: 0
SeniorCitizen max: 1
tenure min: 1
tenure max: 72
MonthlyCharges min: 18.25
MonthlyCharges max: 118.75
TotalCharges min: 18.8
TotalCharges max: 8684.8


In [141]:
# The tenure == 0 rows are gone: they were the same rows that were dropped above for missing TotalCharges.
# No other unwanted 0 values remain (0 is expected in SeniorCitizen, which is binary).