Load the CSV files

In [7]:
from pathlib import Path
import pandas as pd

# Find the folder where this notebook lives
NOTEBOOK_DIR = Path.cwd()

# Go from this notebook folder to the raw data folder
#    "../data/raw" means:
#    - ".."  -> go up one folder (out of 'Pedro' and back to 'Sandbox')
#    - "data/raw" -> go into 'data', then into 'raw'
DATA_RAW = (NOTEBOOK_DIR / "../data/raw").resolve()

print("Notebook dir:", NOTEBOOK_DIR)
print("DATA_RAW:", DATA_RAW)
print("Exists?", DATA_RAW.exists())
print("CSV files:", [p.name for p in DATA_RAW.glob("*.csv")])

# Load all CSV files into Pandas DataFrames
demographics = pd.read_csv(DATA_RAW / "Telco_customer_churn_demographics.csv")
location     = pd.read_csv(DATA_RAW / "Telco_customer_churn_location.csv")
population   = pd.read_csv(DATA_RAW / "Telco_customer_churn_population.csv")
services     = pd.read_csv(DATA_RAW / "Telco_customer_churn_services.csv")
status       = pd.read_csv(DATA_RAW / "Telco_customer_churn_status.csv")

# Print shapes just to confirm
for name, df in {
    "Demographics": demographics,
    "Location": location,
    "Population": population,
    "Services": services,
    "Status": status,
}.items():
    print(f"{name:12s} -> {df.shape}")


Notebook dir: /Users/pedro.cabeco/Project_EDSB-1/notebooks
DATA_RAW: /Users/pedro.cabeco/Project_EDSB-1/data/raw
Exists? True
CSV files: ['Telco_customer_churn_status.csv', 'Telco_customer_churn_services.csv', 'Telco_customer_churn_demographics.csv', 'Telco_customer_churn_location.csv', 'Telco_customer_churn_population.csv']
Demographics -> (7043, 9)
Location     -> (7043, 9)
Population   -> (1671, 3)
Services     -> (7043, 30)
Status       -> (7043, 11)


Make “clean” copies and remove useless columns

Some columns have the same value for every row (like Count = 1), or don’t add anything new (Quarter = "Q3" for everyone, or Country = "United States" for everyone). These don’t help the model and just add noise.

We will:

- Make copies of the original tables (so we don’t ruin the raw data).
- Drop columns that are constant or clearly redundant.

In [8]:
# Make copies, so we leave the original DataFrames untouched
demo_clean = demographics.copy()
loc_clean  = location.copy()
pop_clean  = population.copy()
serv_clean = services.copy()
stat_clean = status.copy()

# ---- Demographics: drop 'Count' (always 1)
demo_clean = demo_clean.drop(columns=["Count"])

# ---- Location:
# - 'Count' is always 1
# - 'Country' is always "United States"
# - 'State' is always "California"
# - 'Lat Long' is just text version of (Latitude, Longitude)
loc_clean = loc_clean.drop(columns=["Count", "Country", "State", "Lat Long"])

# ---- Population:
# - 'ID' is just an internal index, doesn't link to customers
pop_clean = pop_clean.drop(columns=["ID"])

# ---- Services:
# - 'Count' is always 1
# - 'Quarter' is always "Q3"
serv_clean = serv_clean.drop(columns=["Count", "Quarter"])

# ---- Status:
# - same reasoning as Services
stat_clean = stat_clean.drop(columns=["Count", "Quarter"])

# Check new shapes
for name, df in {
    "Demographics (clean)": demo_clean,
    "Location (clean)": loc_clean,
    "Population (clean)": pop_clean,
    "Services (clean)": serv_clean,
    "Status (clean)": stat_clean,
}.items():
    print(f"{name:20s} -> {df.shape}")


Demographics (clean) -> (7043, 8)
Location (clean)     -> (7043, 5)
Population (clean)   -> (1671, 2)
Services (clean)     -> (7043, 28)
Status (clean)       -> (7043, 9)


Fix missing values where the meaning is obvious

Two columns in services have missing values:

- Offer – if it’s empty, it likely means no promo offer was given.
- Internet Type – missing when Internet Service = "No" (they simply don’t have internet).

We translate these into explicit labels so the model doesn’t see them as “mystery gaps”.

In [9]:
# ---- Fix 'Offer'
# If Offer is NaN (missing), we treat it as "No Offer"
serv_clean["Offer"] = serv_clean["Offer"].fillna("No Offer")

# If Internet Service = "No" → Internet Type = "None" (no condition on NaN)
serv_clean.loc[serv_clean["Internet Service"].eq("No"), "Internet Type"] = "No Internet"

# Any remaining NaN (weird cases) → "Unknown"
serv_clean["Internet Type"] = serv_clean["Internet Type"].fillna("Unknown")

# Sanity check
serv_clean["Internet Type"].value_counts(dropna=False)

# Quick peek
serv_clean[["Internet Service", "Internet Type", "Offer"]].head()

Unnamed: 0,Internet Service,Internet Type,Offer
0,Yes,DSL,No Offer
1,Yes,Fiber Optic,Offer E
2,Yes,Fiber Optic,Offer D
3,Yes,Fiber Optic,Offer C
4,Yes,Fiber Optic,Offer C


Merge all customer-level tables by Customer ID

Now we want to build one big table where each row is:

- one customer, with demographics + location + services + status.

Customer ID is the key that appears in all four of those tables.

We will do a sequence of left joins:

- Start from demo_clean
- Add loc_clean
- Add serv_clean
- Add stat_clean

“Left join” means: keep all customers from the left table even if the right table is missing something.

In [10]:
# Merge 4 customer-level tables on 'Customer ID'
cust_merged = (
    demo_clean
    .merge(loc_clean,  on="Customer ID", how="left")
    .merge(serv_clean, on="Customer ID", how="left")
    .merge(stat_clean, on="Customer ID", how="left")
)

print("Customer-level merged shape:", cust_merged.shape)

# Check that each Customer ID appears only once
dup_ids = cust_merged["Customer ID"].duplicated().sum()
print("Number of duplicate Customer IDs:", dup_ids)

# Basic sanity check: see the churn label distribution
print("\nChurn Label counts:")
print(cust_merged["Churn Label"].value_counts(dropna=False))


Customer-level merged shape: (7043, 47)
Number of duplicate Customer IDs: 0

Churn Label counts:
Churn Label
No     5174
Yes    1869
Name: count, dtype: int64


Add ZIP-code-level population via Zip Code

The population table is different: it doesn’t have one row per customer.
It has one row per ZIP code.

So we merge it using Zip Code (which lives inside loc_clean and is already inside cust_merged now).

In [11]:
# Merge ZIP-level population info using 'Zip Code'
full_df = cust_merged.merge(pop_clean, on="Zip Code", how="left")

print("Full dataset shape:", full_df.shape)

# How many customers don't have a matching population row?
missing_pop = full_df["Population"].isna().sum()
print("Customers without population info:", missing_pop)

# Peek at some important columns
key_cols = [
    "Customer ID",
    "Gender",
    "Age",
    "City",
    "Zip Code",
    "Population",
    "Tenure in Months",
    "Monthly Charge",
    "Total Revenue",
    "Satisfaction Score",
    "Customer Status",
    "Churn Label",
    "Churn Value",
    "Churn Score",
    "CLTV",
]

full_df[key_cols].head()

Full dataset shape: (7043, 48)
Customers without population info: 0


Unnamed: 0,Customer ID,Gender,Age,City,Zip Code,Population,Tenure in Months,Monthly Charge,Total Revenue,Satisfaction Score,Customer Status,Churn Label,Churn Value,Churn Score,CLTV
0,8779-QRDMV,Male,78,Los Angeles,90022,68701,1,39.65,59.65,3,Churned,Yes,1,91,5433
1,7495-OOKFY,Female,74,Los Angeles,90063,55668,8,80.65,1024.1,3,Churned,Yes,1,69,5302
2,1658-BYGOY,Male,71,Los Angeles,90065,47534,18,95.45,1910.88,2,Churned,Yes,1,81,3179
3,4598-XLKNJ,Female,78,Inglewood,90303,27778,25,98.5,2995.07,2,Churned,Yes,1,88,5337
4,4846-WHAFZ,Female,80,Whittier,90602,26265,37,76.5,3102.36,2,Churned,Yes,1,67,2793


We’ll create a data/processed folder next to data/raw and save the final table there.

In [12]:
# Create processed data folder (if it doesn't exist yet)
DATA_PROCESSED = (NOTEBOOK_DIR / "../data/processed").resolve()
DATA_PROCESSED.mkdir(parents=True, exist_ok=True)

# Save as CSV
output_csv = DATA_PROCESSED / "telco_churn_master.csv"
full_df.to_csv(output_csv, index=False)

print("Saved files:")
print(" -", output_csv)

Saved files:
 - /Users/pedro.cabeco/Project_EDSB-1/data/processed/telco_churn_master.csv
