In [11]:

import pandas as pd
import numpy as np

# 2. Load the Dataset
# Location of the raw customer sales CSV; read it into the working DataFrame.
file_path = "/kaggle/input/customers-sales/customer_sales_data.csv"
df = pd.read_csv(filepath_or_buffer=file_path)


# 3. Inspect the Data
# Preview the first rows to sanity-check how the columns were parsed.
print("First 5 rows:\n", df.head(), "\n")
print("Info:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must be called directly; wrapping it in print() emits a stray "None".
df.info()
print()
# Descriptive statistics for the numeric columns.
print("Summary statistics:\n", df.describe(), "\n")

# 4. Handle Missing Data
# Report the per-column null counts before imputing anything.
null_counts = df.isnull().sum()
print("Missing values per column:\n", null_counts, "\n")

# Imputation policy: numeric columns receive their own column mean,
# object (string) columns receive the placeholder "Unknown".
numeric_columns = df.select_dtypes(include=[np.number]).columns
for numeric_name in numeric_columns:
    column_mean = df[numeric_name].mean()
    df[numeric_name] = df[numeric_name].fillna(column_mean)

text_columns = df.select_dtypes(include=["object"]).columns
for text_name in text_columns:
    df[text_name] = df[text_name].fillna("Unknown")

# 5. Correct Data Types
# Parse the signup date; values that cannot be parsed become NaT
# (errors="coerce") instead of raising.
df["Customer Since"] = pd.to_datetime(df["Customer Since"], errors="coerce")
# Round before casting: a bare astype(int) truncates toward zero, which would
# turn a fractional (e.g. mean-imputed) age such as 35.9 into 35 instead of 36.
df["Age"] = df["Age"].round().astype(int)

# 6. Standardize Categorical Values
# Strip surrounding whitespace BEFORE changing case: capitalize() only
# upper-cases the very first character, so " male" would stay " male" and end
# up as "male" (not "Male") if the strip happened afterwards.
df["Gender"] = df["Gender"].str.strip().str.capitalize()
# Same order for Country so both columns are normalized consistently.
df["Country"] = df["Country"].str.strip().str.title()

# 7. Handle Duplicates
# Count fully identical rows, drop the repeats, then count again to confirm.
duplicates_before = df.duplicated().sum()
print("Duplicate rows before:", duplicates_before)

df = df.drop_duplicates()

duplicates_after = df.duplicated().sum()
print("Duplicate rows after:", duplicates_after, "\n")

# 8. Feature Engineering - Create Age Group
# Left-closed bins (right=False): [0, 18), [18, 30), [30, 45), [45, 60), [60, inf).
# The last edge is open-ended so that ages of 100 and above still land in
# "Senior"; with a finite upper edge of 100 they would silently become NaN.
bins = [0, 18, 30, 45, 60, np.inf]
labels = ["Teen", "Young Adult", "Adult", "Middle Aged", "Senior"]
df["Age Group"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)

# 9. Rename Columns
# Map the original headers to snake_case names.
# NOTE(review): "Age Group" is not listed here, so it keeps its original name.
snake_case_names = {
    "Customer ID": "customer_id",
    "Customer Name": "customer_name",
    "Gender": "gender",
    "Age": "age",
    "Country": "country",
    "Customer Since": "customer_since",
    "Sales Amount": "sales_amount",
}
df = df.rename(columns=snake_case_names)

# 10. Export Cleaned Data
# Write the cleaned frame to the Kaggle working directory without the index
# column, then report where it was saved.
output_path = "/kaggle/working/customer_sales_data_cleaned.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f" Cleaned dataset saved to {output_path}")


First 5 rows:
    Customer ID Customer Name  Gender  Age  Country Customer Since  \
0         1001         David  Female   64  Germany     2015-01-01   
1         1002         Emily    Male   29    China     2015-01-02   
2         1003          Sara    Male   33   France     2015-01-03   
3         1004         David    Male   41    Egypt     2015-01-04   
4         1005           Doe    Male   36  Germany     2015-01-05   

   Sales Amount  
0        149.01  
1        415.29  
2        513.28  
3        669.61  
4        953.00   

Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Customer ID     1000 non-null   int64  
 1   Customer Name   1000 non-null   object 
 2   Gender          1000 non-null   object 
 3   Age             1000 non-null   int64  
 4   Country         1000 non-null   object 
 5   Customer Since  1000 non-nul