## Step 1: Load CSV

Load CSV

In [1]:
# --- Minimal loader for Telco datasets ---
from pathlib import Path
import pandas as pd

# 1) Point to your repo root (adjust only if your path is different)
ROOT = Path(r"C:\Users\miga\Documents\GitHub\Project_EDSB")
DATA_RAW = ROOT / "data" / "raw"
print("DATA_RAW:", DATA_RAW)

# 2) Files to load
files = {
    "demographics": "Telco_customer_churn_demographics.csv",
    "location":     "Telco_customer_churn_location.csv",
    "population":   "Telco_customer_churn_population.csv",
    "services":     "Telco_customer_churn_services.csv",
    "status":       "Telco_customer_churn_status.csv",
}

# 3) Read (auto-detects comma vs semicolon) + sanity checks
dfs = {}
for name, fname in files.items():
    path = DATA_RAW / fname
    if not path.exists():
        raise FileNotFoundError(f"Missing file: {path}")
    df = pd.read_csv(path, sep=None, engine="python", encoding="utf-8-sig")
    dfs[name] = df

# 4) Show shapes + full column names (short, no truncation)
for name, df in dfs.items():
    print(f"\n{name}: {df.shape}")
    print(df.columns.tolist())

# Optional: keep as variables for later steps
demographics = dfs["demographics"]
location     = dfs["location"]
population   = dfs["population"]
services     = dfs["services"]
status       = dfs["status"]





DATA_RAW: C:\Users\miga\Documents\GitHub\Project_EDSB\data\raw

demographics: (7043, 9)
['Customer ID', 'Count', 'Gender', 'Age', 'Under 30', 'Senior Citizen', 'Married', 'Dependents', 'Number of Dependents']

location: (7043, 9)
['Customer ID', 'Count', 'Country', 'State', 'City', 'Zip Code', 'Lat Long', 'Latitude', 'Longitude']

population: (1671, 3)
['ID', 'Zip Code', 'Population']

services: (7043, 30)
['Customer ID', 'Count', 'Quarter', 'Referred a Friend', 'Number of Referrals', 'Tenure in Months', 'Offer', 'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines', 'Internet Service', 'Internet Type', 'Avg Monthly GB Download', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing', 'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds', 'Total Extra Data Charges', 'Total Long Distance Charges', 'Total Revenue']

status

## Step 2: Initial Data Exploration and Light Cleaning
- Let's explore each dataset individually first using pandas profiling 

In [3]:
%pip install ydata-profiling

Collecting ydata-profiling
  Downloading ydata_profiling-4.18.0-py2.py3-none-any.whl.metadata (22 kB)
Collecting PyYAML<6.1,>=6.0.3 (from ydata-profiling)
  Downloading pyyaml-6.0.3-cp312-cp312-win_amd64.whl.metadata (2.4 kB)
Collecting jinja2<3.2,>=3.1.6 (from ydata-profiling)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting visions<0.8.2,>=0.7.5 (from visions[type_image_path]<0.8.2,>=0.7.5->ydata-profiling)
  Downloading visions-0.8.1-py3-none-any.whl.metadata (11 kB)
Collecting minify-html>=0.15.0 (from ydata-profiling)
  Downloading minify_html-0.18.1-cp312-cp312-win_amd64.whl.metadata (18 kB)
Collecting filetype>=1.0.0 (from ydata-profiling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting phik<0.13,>=0.12.5 (from ydata-profiling)
  Downloading phik-0.12.5-cp312-cp312-win_amd64.whl.metadata (5.6 kB)
Collecting multimethod<2,>=1.4 (from ydata-profiling)
  Downloading multimethod-1.12-py3-none-any.whl.metadata (9.6 kB)
Collecting

In [None]:
# Set up reports folder

from pathlib import Path

# ROOT should already exist from your previous cell.
# If not, uncomment and set it:
# ROOT = Path(r"C:/Users/Miguel/Documents/GitHub/Project_EDS")

REPORTS_DIR = ROOT / "reports" / "profiling_raw_tables"
REPORTS_DIR.mkdir(parents=True, exist_ok=True)

print("Reports will be saved to:", REPORTS_DIR)


Reports will be saved to: C:\Users\miga\Documents\GitHub\Project_EDSB\reports\profiling_raw_tables


In [5]:
# Generate YData Profiling reports for each raw table

from ydata_profiling import ProfileReport

def create_profile(df, name: str, output_dir: Path = REPORTS_DIR):
    """
    Generate a YData Profiling HTML report for a dataframe.
    """
    print(f"üîç Creating profile for: {name} ...")
    profile = ProfileReport(
        df,
        title=f"Telco Customer Churn ‚Äì {name} table",
        explorative=True,     # richer report
        minimal=False
    )
    output_path = output_dir / f"{name}_profiling.html"
    profile.to_file(output_path)
    print(f"‚úÖ Saved: {output_path}\n")

# If dfs was created in Step 1, it should contain:
# 'demographics', 'location', 'population', 'services', 'status'
for name, df in dfs.items():
    create_profile(df, name)


üîç Creating profile for: demographics ...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:00<00:00, 177.06it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Saved: C:\Users\miga\Documents\GitHub\Project_EDSB\reports\profiling_raw_tables\demographics_profiling.html

üîç Creating profile for: location ...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 9/9 [00:00<00:00, 76.54it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Saved: C:\Users\miga\Documents\GitHub\Project_EDSB\reports\profiling_raw_tables\location_profiling.html

üîç Creating profile for: population ...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 300.98it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Saved: C:\Users\miga\Documents\GitHub\Project_EDSB\reports\profiling_raw_tables\population_profiling.html

üîç Creating profile for: services ...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 30/30 [00:00<00:00, 485.49it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Saved: C:\Users\miga\Documents\GitHub\Project_EDSB\reports\profiling_raw_tables\services_profiling.html

üîç Creating profile for: status ...


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11/11 [00:00<00:00, 315.30it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Saved: C:\Users\miga\Documents\GitHub\Project_EDSB\reports\profiling_raw_tables\status_profiling.html



2.1 Demographics table insights:
- No missing values, no duplicates, no constant columns.
- No pre-merge cleaning required.


2.2 Location table insights:
- Country and Count columns are constant.
- Lat Long is redundant because we already have Latitude & Longitude.
- We will convert Zip Code to string to ensure "1234" becomes "01234" if needed.
- We will drop columns "Lat Long", "Country" and "Count".




2.2.1 Location table pre-merge cleaninhg

In [9]:
# Start from the raw location df in dfs
location = dfs["location"].copy()

# 1) Standardize column names
location = standardize_columns(location)
print("Location columns after standardize:", location.columns.tolist())

# 2) Drop unneeded columns IF they exist
cols_to_drop = [c for c in ["count", "lat_long", "country"] if c in location.columns]
location = location.drop(columns=cols_to_drop)
print("Dropped columns:", cols_to_drop)

# 3) Convert zip_code to 5-char string
if "zip_code" in location.columns:
    location["zip_code"] = (
        location["zip_code"]
        .astype(str)
        .str.zfill(5)
    )

location.head()


Location columns after standardize: ['customer_id', 'count', 'country', 'state', 'city', 'zip_code', 'lat_long', 'latitude', 'longitude']
Dropped columns: ['count', 'lat_long', 'country']


Unnamed: 0,customer_id,state,city,zip_code,latitude,longitude
0,8779-QRDMV,California,Los Angeles,90022,34.02381,-118.156582
1,7495-OOKFY,California,Los Angeles,90063,34.044271,-118.185237
2,1658-BYGOY,California,Los Angeles,90065,34.108833,-118.229715
3,4598-XLKNJ,California,Inglewood,90303,33.936291,-118.332639
4,4846-WHAFZ,California,Whittier,90602,33.972119,-118.020188


2.3 Population Table insights: 
- No missing values, no duplicate rows, no obvious data quality alerts.
- We'll drop unneeded ID column as it is just an internal index and we have Zip Code to merge to the main table.
- We'll make sure Zip Code has the same format as in the location table (string, 5-digit, left-paded)
- Standardise column names

2.3.1 Population table pre-merging cleaning

In [10]:
# 2.2.2 Population table pre-merge cleaning

# 1) Inspect current structure (optional sanity check)
print(population.head())
print(population.info())

# 2) Drop unneeded technical ID column
if "ID" in population.columns:
    population = population.drop(columns=["ID"])

# 3) Ensure Zip Code is a 5-char string aligned with `location`
if "Zip Code" in population.columns:
    population["Zip Code"] = (
        population["Zip Code"]
        .astype(str)
        .str.zfill(5)
    )

# 4) Standardise column names
population = population.rename(
    columns={
        "Zip Code": "zip_code",
        "Population": "population"
    }
)

# 5) Quick check after cleaning
print(population.head())
print(population.describe(include="all"))


   zip_code  population
0     90001       54492
1     90002       44586
2     90003       58198
3     90004       67852
4     90005       43019
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1671 entries, 0 to 1670
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   zip_code    1671 non-null   int64
 1   population  1671 non-null   int64
dtypes: int64(2)
memory usage: 26.2 KB
None
   zip_code  population
0     90001       54492
1     90002       44586
2     90003       58198
3     90004       67852
4     90005       43019
           zip_code     population
count   1671.000000    1671.000000
mean   93678.992220   20276.384201
std     1817.763591   20689.117300
min    90001.000000      11.000000
25%    92269.000000    1789.000000
50%    93664.000000   14239.000000
75%    95408.000000   32942.500000
max    96161.000000  105285.000000
