# Cleaning and Transforming Retail Sales Dataset

## Importing Data

In [35]:
import numpy as np
import pandas as pd

In [36]:
retail_df = pd.read_csv("customer_shopping_data.csv")

retail_df.info()
retail_df.head()
# retail_df.tail()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99457 entries, 0 to 99456
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   invoice_no      99457 non-null  object 
 1   customer_id     99457 non-null  object 
 2   gender          99457 non-null  object 
 3   age             99457 non-null  int64  
 4   category        99457 non-null  object 
 5   quantity        99457 non-null  int64  
 6   price           99457 non-null  float64
 7   payment_method  99457 non-null  object 
 8   invoice_date    99457 non-null  object 
 9   shopping_mall   99457 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 7.6+ MB


Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall
0,I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,5/8/2022,Kanyon
1,I317333,C111565,Male,21,Shoes,3,1800.51,Debit Card,12/12/2021,Forum Istanbul
2,I127801,C266599,Male,20,Clothing,1,300.08,Cash,9/11/2021,Metrocity
3,I173702,C988172,Female,66,Shoes,5,3000.85,Credit Card,16/05/2021,Metropol AVM
4,I337046,C189076,Female,53,Books,4,60.6,Cash,24/10/2021,Kanyon


In [37]:
# Check missing data
print(retail_df.isnull().sum())

invoice_no        0
customer_id       0
gender            0
age               0
category          0
quantity          0
price             0
payment_method    0
invoice_date      0
shopping_mall     0
dtype: int64


In [38]:
# Check duplicate data
print(retail_df.duplicated().sum())

0


In [39]:
# Check for multiple representations
print(retail_df['invoice_no'].value_counts()) # must be unique
print("----------------------------------------")
print(retail_df['customer_id'].value_counts()) # must be unique
print("----------------------------------------")
print(retail_df['gender'].value_counts()) # 'Male' or 'Female' only
print("----------------------------------------")
print(retail_df['age'].value_counts())
print("----------------------------------------")
print(retail_df['category'].value_counts()) # 'Clothing', 'Cosmetics', 'Food & Beverage', 'Toys', 'Shoes', 'Souvenir', 'Technology', and 'Books' only
print("----------------------------------------")
print(retail_df['quantity'].value_counts())
print("----------------------------------------")
print(retail_df['price'].value_counts())
print("----------------------------------------")
print(retail_df['payment_method'].value_counts()) # 'Cash', 'Credit Card', and 'Debit Card' only
print("----------------------------------------")
print(retail_df['invoice_date'].value_counts())
print("----------------------------------------")
print(retail_df['shopping_mall'].value_counts()) # 'Mall of Istanbul', 'Kanyon', 'Metrocity', 'Metropol AVM', 'Istinye Park', 'Zorlu Center', 'Cevahir AVM', 'Forum Istanbul', 
                                            # 'Viaport Outlet', and 'Emaar Square Mall' only

invoice_no
I138884    1
I291671    1
I175779    1
I226143    1
I683818    1
          ..
I319287    1
I173400    1
I828064    1
I194850    1
I232867    1
Name: count, Length: 99457, dtype: int64
----------------------------------------
customer_id
C241288    1
C116138    1
C382765    1
C285074    1
C405356    1
          ..
C220083    1
C286933    1
C301304    1
C214184    1
C273973    1
Name: count, Length: 99457, dtype: int64
----------------------------------------
gender
Female    59482
Male      39975
Name: count, dtype: int64
----------------------------------------
age
37    2057
22    2051
64    2002
43    2000
51    1993
30    1981
24    1977
40    1960
48    1955
36    1954
38    1954
28    1953
27    1950
39    1947
21    1947
61    1945
52    1945
19    1936
56    1916
33    1913
46    1911
62    1909
44    1904
53    1903
67    1901
69    1901
23    1897
26    1896
68    1893
42    1892
41    1892
32    1891
63    1886
29    1885
49    1883
34    1883
47    1880
57    1879

In [40]:
# Check for outliers

retail_df['age'].min() # 18
retail_df['age'].max() # 69

retail_df['price'].min() # 5.23
retail_df['price'].max() # 5250.0

np.float64(5250.0)

## Data Processing / Transformation

In [43]:
# Convert invoice_date into datetime
retail_df['invoice_date'] = pd.to_datetime(retail_df['invoice_date'], format='%d/%m/%Y')

# Column transformation

# Create age groups
bins = [17, 26, 42, 58, 77]
labels = ['Gen Z', 'Millennial', 'Gen X', 'Baby Boomer']
retail_df['age_group'] = pd.cut(retail_df['age'], bins=bins, labels=labels, right=True)

# Extract year, month, and day from invoice_date
retail_df['invoice_year'] = retail_df['invoice_date'].dt.year
retail_df['invoice_month'] = retail_df['invoice_date'].dt.month
retail_df['invoice_day'] = retail_df['invoice_date'].dt.day

# Convert Turkish Lira to USD based on the Mid-market exchange rate of 1 USD = 41.45 TRY as of September 26, 2025
retail_df['price_usd'] = (retail_df['price'] / 41.45).round(2)

# Get revenue using quantity and price_usd
retail_df['revenue'] = (retail_df['quantity'] * retail_df['price_usd']).round(2)

retail_df.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,invoice_date,shopping_mall,age_group,invoice_year,invoice_month,invoice_day,price_usd,revenue
0,I138884,C241288,Female,28,Clothing,5,1500.4,Credit Card,2022-08-05,Kanyon,Millennial,2022,8,5,36.2,181.0
1,I317333,C111565,Male,21,Shoes,3,1800.51,Debit Card,2021-12-12,Forum Istanbul,Gen Z,2021,12,12,43.44,130.32
2,I127801,C266599,Male,20,Clothing,1,300.08,Cash,2021-11-09,Metrocity,Gen Z,2021,11,9,7.24,7.24
3,I173702,C988172,Female,66,Shoes,5,3000.85,Credit Card,2021-05-16,Metropol AVM,Baby Boomer,2021,5,16,72.4,362.0
4,I337046,C189076,Female,53,Books,4,60.6,Cash,2021-10-24,Kanyon,Gen X,2021,10,24,1.46,5.84


In [44]:
# Save the transformed dataset to a new CSV file
retail_df.to_csv('preprocessed_customer_shopping_data.csv', index=False)