In [4]:
#Data Loading and project setup - day 1
"""
Objectives: Understand business problem ,
Load dataset correctly,
Inspect structure & data quality
"""

#To avoid FileNotFoundError in Jupyter, I explicitly set the project root and dynamically detect dataset files using os.listdir() instead of hardcoding paths. This makes the pipeline robust across environments.


# import os - lets Python work with folders and files: check file paths, move between directories, and list the files inside a folder. Without os, Python cannot safely interact with your computer's file system.
# pandas is the main library for data analysis; pd is its short alias. It is used to read CSV/Excel files, clean data, analyze data, and prepare features for ML.
import os
import pandas as pd

# 1. Project root: the main folder that holds data, notebooks and src.
#    The original local path stays the default; set the
#    CUSTOMER_SEGMENTATION_ROOT environment variable to run this notebook on
#    another machine without editing the cell. The r"" raw string keeps the
#    backslashes literal instead of treating them as escape characters.
PROJECT_ROOT = (
    os.environ.get("CUSTOMER_SEGMENTATION_ROOT")
    or r"C:\Users\CORE\Desktop\customer_segmentation"
)

# 2. Build the data folder path with os.path.join (correct separator per OS).
DATA_DIR = os.path.join(PROJECT_ROOT, "data")

# 3. List the data folder once. os.listdir() returns names in arbitrary order,
#    so sort them to make "the first matching file" the same on every run.
#    Note: this raises FileNotFoundError if DATA_DIR itself does not exist.
data_files = sorted(os.listdir(DATA_DIR))
print("Files inside data folder:", data_files)

# 4. Load the first CSV or Excel file found (one file only, never several).
file_path = None

for file in data_files:
    candidate = os.path.join(DATA_DIR, file)
    if not os.path.isfile(candidate):
        continue  # skip sub-folders, even if their name ends in .csv/.xlsx

    lowered = file.lower()  # case-insensitive extension check
    if lowered.endswith(".csv"):
        # Read with the ISO-8859-1 (Latin-1) encoding.
        df = pd.read_csv(candidate, encoding="ISO-8859-1")
    elif lowered.endswith((".xls", ".xlsx")):  # both Excel formats
        df = pd.read_excel(candidate)
    else:
        continue  # not a supported dataset file; keep looking

    # Record the path only after the read succeeded, then stop searching.
    file_path = candidate
    break

if file_path is None:
    # Fail loudly instead of leaving `df` undefined for later cells.
    raise FileNotFoundError("No CSV or Excel file found in data folder")

print("Loaded file:", file_path)  # confirms exactly which file was loaded

# I explicitly defined the project root, build paths using os.path.join, inspect available files using os.listdir, and dynamically load CSV or Excel files. This avoids path-related errors and makes the data pipeline robust.


Files inside data folder: ['Online Retail.csv']
Loaded file: C:\Users\CORE\Desktop\customer_segmentation\data\Online Retail.csv


In [5]:
df.head(n=5)  # preview the first five rows of df (five is also the default)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [5]:
# Only the last expression of a cell is displayed, so a leading df.head()
# here would be computed and thrown away; the preview lives in its own cell.
df.shape  # tuple of (number_of_rows, number_of_columns)


(541909, 8)