# Data Setup

## Imports

In [8]:
import pandas as pd
import numpy as np
import shutil
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import kagglehub

## Dataset Download & Setup

In [9]:
# --- Directories ---
# raw_data/ holds the untouched download; processed_data/ holds cleaned output.
raw_data_dir = Path("raw_data")
processed_data_dir = Path("processed_data")

# Create directories if they do not exist yet
raw_data_dir.mkdir(exist_ok=True)
processed_data_dir.mkdir(exist_ok=True)

# File path for raw dataset
raw_dataset = raw_data_dir / "Mall_Customers.csv"

# Check and download dataset if not already available
if raw_dataset.exists():
    print("✔️ Dataset is already downloaded.")
else:
    dataset_path = Path(kagglehub.dataset_download("vjchoudhary7/customer-segmentation-tutorial-in-python"))

    if not dataset_path.exists():
        raise FileNotFoundError("⚠ Dataset not found.")

    # Search recursively so CSV files sitting in a subfolder of the download
    # are picked up as well (a top-level-only scan would silently miss them).
    csv_files = sorted(p for p in dataset_path.rglob("*.csv") if p.is_file())

    # Copy CSV file(s) into raw_data folder
    for item in csv_files:
        shutil.copy2(item, raw_data_dir / item.name)

    # Fail here, with a clear message, rather than later in pd.read_csv
    # if the expected file was not part of the download.
    if not raw_dataset.exists():
        raise FileNotFoundError(
            f"⚠ '{raw_dataset.name}' was not found in the downloaded dataset at: {dataset_path}"
        )

    print("✔️ Dataset successfully downloaded to:", raw_data_dir)

✔️ Dataset is already downloaded.


## Load Dataset

In [10]:
# Parse the raw CSV located at `raw_dataset` into the working DataFrame.
df = pd.read_csv(raw_dataset)

# Report success together with the shape of the loaded frame.
dataset_shape = df.shape
print("✔️ Dataset loaded successfully!")
print("Shape of dataset:", dataset_shape)

# Preview the first rows as the cell's rich output.
df.head()


✔️ Dataset loaded successfully!
Shape of dataset: (200, 5)


Unnamed: 0,CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


## Basic Info

In [11]:
# Summary of columns, non-null counts, dtypes and memory usage.
df.info()

# --- Check for missing values ---
# Count the null entries in each column, then print the per-column totals.
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB

Missing values per column:
CustomerID                0
Gender                    0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64


## Drop Unnecessary Columns

In [12]:
# --- Drop CustomerID (identifier, not useful for clustering) ---
# The guard keeps the cell safe to re-run; the message reflects what actually
# happened instead of always claiming the column was dropped.
if "CustomerID" in df.columns:
    df = df.drop(columns=["CustomerID"])
    print("✔️ Dropped 'CustomerID'. New shape:", df.shape)
else:
    print("ℹ️ 'CustomerID' column not present; nothing to drop. Shape:", df.shape)

df.head()


✔️ Dropped 'CustomerID'. New shape: (200, 4)


Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,Male,19,15,39
1,Male,21,15,81
2,Female,20,16,6
3,Female,23,16,77
4,Female,31,17,40


## Encode Categorical Columns

In [13]:
# --- Encode Gender (Male=0, Female=1) ---
GENDER_CODES = {"Male": 0, "Female": 1}

if "Gender" in df.columns:
    # Re-running the mapping on already-encoded 0/1 values would turn every
    # entry into NaN (Series.map yields NaN for keys missing from the dict),
    # so skip the step when the non-null values are already codes.
    already_encoded = df["Gender"].dropna().isin(list(GENDER_CODES.values())).all()

    if already_encoded:
        print("ℹ️ 'Gender' column is already encoded; skipping.")
    else:
        encoded = df["Gender"].map(GENDER_CODES)

        # Labels that were present but have no code would silently become NaN;
        # surface them instead of corrupting the column.
        unmapped = df.loc[encoded.isna() & df["Gender"].notna(), "Gender"].unique()
        if len(unmapped) > 0:
            raise ValueError(f"Unexpected values in 'Gender' column: {list(unmapped)}")

        df["Gender"] = encoded
        print("✔️ Encoded 'Gender' column (Male=0, Female=1).")

df.head()


✔️ Encoded 'Gender' column (Male=0, Female=1).


Unnamed: 0,Gender,Age,Annual Income (k$),Spending Score (1-100)
0,0,19,15,39
1,0,21,15,81
2,1,20,16,6
3,1,23,16,77
4,1,31,17,40


## Save Processed Dataset

In [14]:
# Destination file for the cleaned data inside the processed-data directory.
processed_dataset = processed_data_dir.joinpath("customers_clean.csv")

# Write the frame without the index column.
df.to_csv(processed_dataset, index=False)

print("✔️ Processed dataset saved to:", processed_dataset)


✔️ Processed dataset saved to: processed_data\customers_clean.csv
