In [154]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('whitegrid')
from sklearn.pipeline import Pipeline

In [155]:
df = pd.read_csv('customer_shopping_data.csv')
df.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,Price,payment_method,invoice_date,shopping_mall
0,I337046,C189076,Female,53,Books,4,60.6,Cash,24/10/2021,Kanyon
1,I139207,C191708,Female,29,Books,1,15.15,Credit Card,28/10/2022,Emaar Square Mall
2,I294687,C300786,Male,65,Books,2,30.3,Debit Card,16/01/2021,Metrocity
3,I174250,C204553,Female,42,Books,5,75.75,Cash,16/12/2022,Metrocity
4,I117291,C134449,Male,46,Books,5,75.75,Credit Card,9/12/22,Zorlu Center


In [156]:
# Check data types, existance of Null value and issues with column name

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99457 entries, 0 to 99456
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   invoice_no      99457 non-null  object 
 1   customer_id     99457 non-null  object 
 2   gender          99457 non-null  object 
 3    age            99457 non-null  int64  
 4   category        99448 non-null  object 
 5   quantity        99457 non-null  int64  
 6   Price           99442 non-null  float64
 7   payment_method  99448 non-null  object 
 8   invoice_date    99448 non-null  object 
 9   shopping_mall   99439 non-null  object 
dtypes: float64(1), int64(2), object(7)
memory usage: 7.6+ MB


In [None]:
# Check details on the dataset

df.describe(include='all')

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,Price,payment_method,invoice_date,shopping_mall
count,99457,99457,99457,99457.0,99448,99457.0,99442.0,99448,99448,99439
unique,99457,99457,2,,14,,,3,797,10
top,I337046,C189076,Female,,Clothing,,,Cash,24/11/2021,Mall of Istanbul
freq,1,1,59482,,34418,,,44445,159,19939
mean,,,,43.427089,,3.003429,689.27616,,,
std,,,,14.990054,,1.413025,941.218853,,,
min,,,,18.0,,1.0,5.23,,,
25%,,,,30.0,,2.0,45.45,,,
50%,,,,43.0,,3.0,203.3,,,
75%,,,,56.0,,4.0,1200.32,,,


### Change name and indentation of column names, then converting column names into lower case

In [158]:
df.rename(columns={
    ' age ':'age',
    'invoice_date': 'datetime'
}, inplace = True)
df

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,Price,payment_method,datetime,shopping_mall
0,I337046,C189076,Female,53,Books,4,60.60,Cash,24/10/2021,Kanyon
1,I139207,C191708,Female,29,Books,1,15.15,Credit Card,28/10/2022,Emaar Square Mall
2,I294687,C300786,Male,65,Books,2,30.30,Debit Card,16/01/2021,Metrocity
3,I174250,C204553,Female,42,Books,5,75.75,Cash,16/12/2022,Metrocity
4,I117291,C134449,Male,46,Books,5,75.75,Credit Card,9/12/22,Zorlu Center
...,...,...,...,...,...,...,...,...,...,...
99452,I180008,C158576,Female,19,,3,121.98,Cash,28/11/2021,Metropol AVM
99453,I117736,C336781,Female,25,,1,300.08,Credit Card,17/01/2022,Mall of Istanbul
99454,I327594,C189864,Male,25,,1,11.73,Cash,6/10/22,Kanyon
99455,I229534,C274652,Female,28,,2,81.32,,7/5/21,Emaar Square Mall


In [159]:
df = df.rename(columns=str.lower)
df.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,datetime,shopping_mall
0,I337046,C189076,Female,53,Books,4,60.6,Cash,24/10/2021,Kanyon
1,I139207,C191708,Female,29,Books,1,15.15,Credit Card,28/10/2022,Emaar Square Mall
2,I294687,C300786,Male,65,Books,2,30.3,Debit Card,16/01/2021,Metrocity
3,I174250,C204553,Female,42,Books,5,75.75,Cash,16/12/2022,Metrocity
4,I117291,C134449,Male,46,Books,5,75.75,Credit Card,9/12/22,Zorlu Center


### Convert datetime column from object to Datetime format and fill null with forward fill

In [160]:
df['datetime'] = pd.to_datetime(df['datetime'], errors='raise', format='mixed')
df.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,datetime,shopping_mall
0,I337046,C189076,Female,53,Books,4,60.6,Cash,2021-10-24,Kanyon
1,I139207,C191708,Female,29,Books,1,15.15,Credit Card,2022-10-28,Emaar Square Mall
2,I294687,C300786,Male,65,Books,2,30.3,Debit Card,2021-01-16,Metrocity
3,I174250,C204553,Female,42,Books,5,75.75,Cash,2022-12-16,Metrocity
4,I117291,C134449,Male,46,Books,5,75.75,Credit Card,2022-09-12,Zorlu Center


In [161]:
df['datetime'] = df['datetime'].fillna(method='ffill')

  df['datetime'] = df['datetime'].fillna(method='ffill')


### Fix mistakes in category column

In [162]:
df['category'].unique()

array(['Books', 'Book', 'bok', 'Clothing', 'Clothin', 'Cosmetics',
       'Food & Beverage', 'Shoes', 'Shoe', 'Souvenir', 'Technology',
       'Technologi', 'Toys', 'Toy', nan], dtype=object)

In [163]:
df['category'] = df['category'].replace({
    'bok':'Books',
    'Book':'Books',
    'Toy': 'Toys',
    'Clothin': 'Clothing',
    'Shoe':'Shoes',
    'Technologi': 'Technology'    
})
df['category'].unique()

array(['Books', 'Clothing', 'Cosmetics', 'Food & Beverage', 'Shoes',
       'Souvenir', 'Technology', 'Toys', nan], dtype=object)

### Fill null value of price column with median value

In [164]:
median_value = df['price'].median()
df['price'] = df['price'].fillna(median_value)

In [165]:
df = df.dropna()
df.head()

Unnamed: 0,invoice_no,customer_id,gender,age,category,quantity,price,payment_method,datetime,shopping_mall
0,I337046,C189076,Female,53,Books,4,60.6,Cash,2021-10-24,Kanyon
1,I139207,C191708,Female,29,Books,1,15.15,Credit Card,2022-10-28,Emaar Square Mall
2,I294687,C300786,Male,65,Books,2,30.3,Debit Card,2021-01-16,Metrocity
3,I174250,C204553,Female,42,Books,5,75.75,Cash,2022-12-16,Metrocity
4,I117291,C134449,Male,46,Books,5,75.75,Credit Card,2022-09-12,Zorlu Center


### Drop invoice_no, customer_id columns as they seem to have no affect on dataset.

In [166]:
df = df.drop(labels=['invoice_no','customer_id'], axis=1)
df

Unnamed: 0,gender,age,category,quantity,price,payment_method,datetime,shopping_mall
0,Female,53,Books,4,60.60,Cash,2021-10-24,Kanyon
1,Female,29,Books,1,15.15,Credit Card,2022-10-28,Emaar Square Mall
2,Male,65,Books,2,30.30,Debit Card,2021-01-16,Metrocity
3,Female,42,Books,5,75.75,Cash,2022-12-16,Metrocity
4,Male,46,Books,5,75.75,Credit Card,2022-09-12,Zorlu Center
...,...,...,...,...,...,...,...,...
99443,Male,28,Toys,1,35.84,Credit Card,2022-02-11,Metrocity
99444,Female,47,Toys,5,179.20,Credit Card,2021-08-02,Kanyon
99445,Male,24,Toys,1,35.84,Cash,2021-11-26,Mall of Istanbul
99446,Female,37,Toys,3,107.52,Cash,2021-02-21,Metropol AVM


In [167]:
df.describe(include='all')

Unnamed: 0,gender,age,category,quantity,price,payment_method,datetime,shopping_mall
count,99422,99422.0,99422,99422.0,99422.0,99422,99422,99422
unique,2,,8,,,3,,10
top,Female,,Clothing,,,Cash,,Mall of Istanbul
freq,59459,,34476,,,44433,,19936
mean,,43.42736,,3.00348,689.229708,,2022-02-08 23:17:36.369817600,
min,,18.0,,1.0,5.23,,2021-01-01 00:00:00,
25%,,30.0,,2.0,45.45,,2021-07-19 00:00:00,
50%,,43.0,,3.0,203.3,,2022-02-05 00:00:00,
75%,,56.0,,4.0,1200.32,,2022-08-22 00:00:00,
max,,69.0,,5.0,5250.0,,2023-12-02 00:00:00,


In [168]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99422 entries, 0 to 99447
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   gender          99422 non-null  object        
 1   age             99422 non-null  int64         
 2   category        99422 non-null  object        
 3   quantity        99422 non-null  int64         
 4   price           99422 non-null  float64       
 5   payment_method  99422 non-null  object        
 6   datetime        99422 non-null  datetime64[ns]
 7   shopping_mall   99422 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(2), object(4)
memory usage: 6.8+ MB


In [172]:
dfc = df.sample(n=10000, random_state=9)
dfc.to_csv('/Users/raihannasir/Documents/DA_AI/project_shopping_mall/Dataset/Cleaned/shopping_data.csv', index=False)
dfc.shape

(10000, 8)

In [173]:
dfc = pd.read_csv('/Users/raihannasir/Documents/DA_AI/project_shopping_mall/Dataset/Cleaned/shopping_data.csv')
dfc.head()

Unnamed: 0,gender,age,category,quantity,price,payment_method,datetime,shopping_mall
0,Male,42,Clothing,4,1200.32,Credit Card,2022-01-25,Istinye Park
1,Female,56,Clothing,5,1500.4,Cash,2023-02-18,Zorlu Center
2,Female,20,Clothing,2,600.16,Credit Card,2022-11-30,Viaport Outlet
3,Male,46,Food & Beverage,4,20.92,Debit Card,2021-11-23,Istinye Park
4,Female,32,Food & Beverage,2,10.46,Credit Card,2021-02-19,Metrocity


In [174]:
dfc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   gender          10000 non-null  object 
 1   age             10000 non-null  int64  
 2   category        10000 non-null  object 
 3   quantity        10000 non-null  int64  
 4   price           10000 non-null  float64
 5   payment_method  10000 non-null  object 
 6   datetime        10000 non-null  object 
 7   shopping_mall   10000 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 625.1+ KB


### Convert to Datetime format

In [175]:
dfc['datetime'] = pd.to_datetime(dfc['datetime'])

In [177]:
dfc['year'] = dfc['datetime'].dt.year

In [184]:
dfc['month'] = dfc['datetime'].dt.month_name()

In [185]:
dfc

Unnamed: 0,gender,age,category,quantity,price,payment_method,datetime,shopping_mall,year,month
0,Male,42,Clothing,4,1200.32,Credit Card,2022-01-25,Istinye Park,2022,January
1,Female,56,Clothing,5,1500.40,Cash,2023-02-18,Zorlu Center,2023,February
2,Female,20,Clothing,2,600.16,Credit Card,2022-11-30,Viaport Outlet,2022,November
3,Male,46,Food & Beverage,4,20.92,Debit Card,2021-11-23,Istinye Park,2021,November
4,Female,32,Food & Beverage,2,10.46,Credit Card,2021-02-19,Metrocity,2021,February
...,...,...,...,...,...,...,...,...,...,...
9995,Female,51,Food & Beverage,2,10.46,Credit Card,2023-02-19,Kanyon,2023,February
9996,Male,41,Food & Beverage,1,5.23,Credit Card,2021-11-22,Viaport Outlet,2021,November
9997,Male,64,Clothing,2,600.16,Debit Card,2021-01-15,Mall of Istanbul,2021,January
9998,Female,37,Shoes,5,3000.85,Cash,2023-06-03,Kanyon,2023,June
