In [28]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

Download the data from Kaggle

In [None]:
# One-time dataset download through the Kaggle CLI, kept commented out.
# The commands expect a kaggle.json file (presumably the Kaggle API credentials)
# in the current working directory; uncomment them to fetch the archive again.
#!mkdir ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json
#!kaggle datasets download -d lakshmi25npathi/online-retail-dataset

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/online-retail-dataset
License(s): other
Downloading online-retail-dataset.zip to /content
  0% 0.00/43.3M [00:00<?, ?B/s]
100% 43.3M/43.3M [00:00<00:00, 1.17GB/s]


Extract Data

In [64]:
#!unzip online-retail-dataset.zip
# Load the workbook into `df`. The relative path means the extracted file has
# to be present in the current working directory.
df=pd.read_excel("online_retail.xlsx")

In [65]:
#df
# Print column dtypes, non-null counts and memory usage.
df.info()
# Last expression of the cell, so the summary statistics become the cell output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 525461 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      525461 non-null  object        
 1   StockCode    525461 non-null  object        
 2   Description  522533 non-null  object        
 3   Quantity     525461 non-null  int64         
 4   InvoiceDate  525461 non-null  datetime64[ns]
 5   Price        525461 non-null  float64       
 6   Customer ID  417534 non-null  float64       
 7   Country      525461 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 32.1+ MB


Unnamed: 0,Quantity,InvoiceDate,Price,Customer ID
count,525461.0,525461,525461.0,417534.0
mean,10.337667,2010-06-28 11:37:36.845017856,4.688834,15360.645478
min,-9600.0,2009-12-01 07:45:00,-53594.36,12346.0
25%,1.0,2010-03-21 12:20:00,1.25,13983.0
50%,3.0,2010-07-06 09:51:00,2.1,15311.0
75%,10.0,2010-10-15 12:45:00,4.21,16799.0
max,19152.0,2010-12-09 20:01:00,25111.09,18287.0
std,107.42411,,146.126914,1680.811316


Data type casting

In [66]:
def auto_data_type(df):
    """Downcast the numeric columns of ``df`` to narrower NumPy dtypes.

    Float columns are converted to ``float32`` as long as their largest
    finite magnitude fits in the float32 range. A column holding a finite
    value outside that range is left untouched, because casting it would
    silently replace that value with ``inf``. NaN and existing infinities
    do not prevent the downcast.

    Integer columns are converted to the narrowest type whose range covers
    the column's observed minimum and maximum: an unsigned type when the
    minimum is >= 0, a signed type otherwise. When no narrower type fits
    (or the column is empty), the widest type of the family is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns should be downcast. Non-numeric columns
        are not touched.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in. The columns are replaced
        on ``df`` itself, so the caller's frame is modified.

    Notes
    -----
    ``float32`` carries roughly 7 significant decimal digits, so values that
    fit the range may still lose precision when downcast.
    """
    float32_max = np.finfo(np.float32).max
    # Ordered narrowest to widest; the last entry of each family is the fallback.
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    signed_types = (np.int8, np.int16, np.int32, np.int64)

    for col in df.select_dtypes(include=[np.number]).columns:
        if pd.api.types.is_float_dtype(df[col]):
            magnitude = df[col].abs()
            # Infinities survive the cast unchanged, so only finite values
            # decide whether float32 is wide enough. NaN is skipped by max().
            largest_finite = magnitude[magnitude != np.inf].max()
            if pd.isna(largest_finite) or largest_finite <= float32_max:
                df[col] = df[col].astype(np.float32)
            continue

        min_value = df[col].min()
        max_value = df[col].max()

        family = unsigned_types if min_value >= 0 else signed_types
        *narrower, widest = family
        target = widest
        for candidate in narrower:
            limits = np.iinfo(candidate)
            if limits.min <= min_value and max_value <= limits.max:
                target = candidate
                break
        df[col] = df[col].astype(target)
    return df


# auto_data_type replaces the columns on `df` itself and returns that same
# frame, so `clean_df` and `df` are two names for one object.
clean_df=auto_data_type(df)
#df.info()


Drop rows with missing Customer ID

In [None]:
# Keep only the rows that carry a Customer ID.
df = df.dropna(subset=["Customer ID"])

# uint16 can only represent 0..65535; fail loudly instead of wrapping around
# if an identifier ever falls outside that range.
assert df["Customer ID"].between(0, np.iinfo(np.uint16).max).all(), \
    "Customer ID outside the uint16 range"

# With the NaN rows gone, the float column can be stored as an integer column.
# astype with a mapping returns a new frame rather than writing into the
# dropna result.
df = df.astype({"Customer ID": np.uint16})

# NOTE(review): the former `df_copy["Invoice"] = ...astype(np.uint32)` line was
# removed from this cell. `df_copy` is only created in the following cell, so
# the line raised NameError on a fresh kernel. If the cast is still wanted, do
# it after `df_copy` exists and after confirming every Invoice value is numeric.


Remove duplicate rows and rows with a negative Quantity (Quantity has no missing values, so no separate null handling is needed)

In [68]:
# Build the cleaned frame from a copy so `df` keeps its rows,
# dropping fully duplicated rows on the way.
df_copy = df.copy().drop_duplicates()
# Keep rows whose Quantity is zero or positive.
df_copy = df_copy[df_copy["Quantity"].ge(0)]
df_copy.info()


<class 'pandas.core.frame.DataFrame'>
Index: 400947 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   Invoice      400947 non-null  object        
 1   StockCode    400947 non-null  object        
 2   Description  400947 non-null  object        
 3   Quantity     400947 non-null  int16         
 4   InvoiceDate  400947 non-null  datetime64[ns]
 5   Price        400947 non-null  float32       
 6   Customer ID  400947 non-null  uint16        
 7   Country      400947 non-null  object        
dtypes: datetime64[ns](1), float32(1), int16(1), object(4), uint16(1)
memory usage: 21.4+ MB


Encode Country column to numeric labels

In [69]:
le = LabelEncoder()
# Encode the country names (cast to str first) as integer labels.
country_codes = le.fit_transform(df_copy["Country"].astype(str))
# Store the labels as uint8, which can represent at most 256 distinct codes.
df_copy["Country_Code"] = country_codes.astype(np.uint8)

In [70]:
# The text column is no longer needed once Country_Code exists.
df_copy = df_copy.drop("Country", axis="columns")

In [71]:
#df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 400947 entries, 0 to 525460
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   Invoice       400947 non-null  object        
 1   StockCode     400947 non-null  object        
 2   Description   400947 non-null  object        
 3   Quantity      400947 non-null  int16         
 4   InvoiceDate   400947 non-null  datetime64[ns]
 5   Price         400947 non-null  float32       
 6   Customer ID   400947 non-null  uint16        
 7   Country_Code  400947 non-null  uint8         
dtypes: datetime64[ns](1), float32(1), int16(1), object(3), uint16(1), uint8(1)
memory usage: 18.7+ MB
