## Dataset: amazon-purchases

In [1]:
import pandas as pd

### Import dataset

In [2]:
# Load the raw purchases CSV. The frame is named df_purchases_clean from the
# start; the cells below overwrite its columns step by step.
df_purchases_clean = pd.read_csv("data/raw/amazon_purchases.csv")

### Exploratory data analysis


In [3]:
df_purchases_clean.info()  # row count, column dtypes and (shallow) memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1850717 entries, 0 to 1850716
Data columns (total 8 columns):
 #   Column                    Dtype  
---  ------                    -----  
 0   Order Date                object 
 1   Purchase Price Per Unit   float64
 2   Quantity                  float64
 3   Shipping Address State    object 
 4   Title                     object 
 5   ASIN/ISBN (Product Code)  object 
 6   Category                  object 
 7   Survey ResponseID         object 
dtypes: float64(2), object(6)
memory usage: 113.0+ MB


In [4]:
# Preview the "Survey ResponseID" column (first and last rows)
df_purchases_clean["Survey ResponseID"]

0          R_01vNIayewjIIKMF
1          R_01vNIayewjIIKMF
2          R_01vNIayewjIIKMF
3          R_01vNIayewjIIKMF
4          R_01vNIayewjIIKMF
                 ...        
1850712    R_zfqnsBzlOAKibzb
1850713    R_zfqnsBzlOAKibzb
1850714    R_zfqnsBzlOAKibzb
1850715    R_zfqnsBzlOAKibzb
1850716    R_zfqnsBzlOAKibzb
Name: Survey ResponseID, Length: 1850717, dtype: object

In [5]:
# Sum of "Quantity" over every row
df_purchases_clean["Quantity"].sum()

2014758.0

In [6]:
df_purchases_clean[
    "Category"
].value_counts()  # number of rows per distinct category, most frequent first

Category
ABIS_BOOK                        87619
PET_FOOD                         38256
GIFT_CARD                        27734
SHIRT                            27267
NUTRITIONAL_SUPPLEMENT           26913
                                 ...  
FUEL_INJECTOR                        1
FIRE_GRATE                           1
NONACTIVATED_GIFT_CARD               1
AUTO_CHEMICAL                        1
COMPUTER_VIDEO_GAME_CONTOLLER        1
Name: count, Length: 1871, dtype: int64

In [7]:
df_purchases_clean[
    "Survey ResponseID"
].nunique()  # number of distinct "Survey ResponseID" values

5027

In [8]:
df_purchases_clean[
    df_purchases_clean["Order Date"].str.startswith("2024")
]  # rows whose 'Order Date' starts with '2024' (the column is still text at this point)

Unnamed: 0,Order Date,Purchase Price Per Unit,Quantity,Shipping Address State,Title,ASIN/ISBN (Product Code),Category,Survey ResponseID
240482,2024-08-15,1.84,1.0,,Allstate 4-Year PC Peripheral Protection Plan ...,B008I64UF4,COMPUTER,R_1d1fnT4sjZABBwe


In [9]:
df_purchases_clean[
    df_purchases_clean["Quantity"] == 55
]  # rows where 'Quantity' equals 55

Unnamed: 0,Order Date,Purchase Price Per Unit,Quantity,Shipping Address State,Title,ASIN/ISBN (Product Code),Category,Survey ResponseID
1833380,2019-04-19,2.15,55.0,,Bring | Teach | Keep: Illuminating the Biblica...,148183651X,ABIS_BOOK,R_yK4rOrk16ZOoWwp


#### Dataset memory usage (before optimization)


In [10]:
# Bytes held by the index and by each column; deep=True also counts the
# Python string objects stored in object-dtype columns.
memory = df_purchases_clean.memory_usage(deep=True)

# Overall footprint converted from bytes to GB (2**30 bytes)
total_memory_gb = memory.sum() / 2**30
print(f"Total memory: {total_memory_gb:.2f} GB")

# Share of the total footprint taken by each column, in percent
memory_percent = memory.div(memory.sum()).mul(100)
print("\nMemory by column:", memory_percent, sep="\n")

Total memory: 0.89 GB

Memory by column:
Index                        0.000014
Order Date                  13.040120
Purchase Price Per Unit      1.557029
Quantity                     1.557029
Shipping Address State      11.233755
Title                       32.049373
ASIN/ISBN (Product Code)    13.036141
Category                    13.124018
Survey ResponseID           14.402521
dtype: float64


### Data cleaning


In [11]:
# Edit "Category" column and convert to "category"
def capitalized_case(s):
    """Turn an underscore-separated code into a sentence-case label.

    The first underscore-delimited word is capitalized, the remaining words
    are lower-cased, and the words are joined with single spaces
    (e.g. "ABIS_BOOK" -> "Abis book").

    Parameters
    ----------
    s : str or missing value
        Raw code to convert.

    Returns
    -------
    str or missing value
        The converted label; missing values (anything ``pd.isna`` reports
        as NA) are returned unchanged.
    """
    if pd.isna(s):
        return s
    # str.split always yields at least one element, so `head` is always bound.
    head, *tail = s.split("_")
    pieces = [head.capitalize()]
    pieces.extend(part.lower() for part in tail)
    return " ".join(pieces)


# Relabel every raw category code with capitalized_case (missing values pass
# through unchanged), then store the result with the "category" dtype.
df_purchases_clean["Category"] = (
    df_purchases_clean["Category"].map(capitalized_case).astype("category")
)

### Reduce dataset size

In [12]:
# Convert repetitive text columns to "category": each distinct string is
# stored once and the rows hold integer codes instead of Python strings.
df_purchases_clean["Shipping Address State"] = df_purchases_clean[
    "Shipping Address State"
].astype("category")

df_purchases_clean["Survey ResponseID"] = df_purchases_clean[
    "Survey ResponseID"
].astype("category")

df_purchases_clean["ASIN/ISBN (Product Code)"] = df_purchases_clean[
    "ASIN/ISBN (Product Code)"
].astype("category")

# Convert "Order Date" from text to datetime
df_purchases_clean["Order Date"] = pd.to_datetime(df_purchases_clean["Order Date"])

# Shrink "Quantity" to the smallest integer dtype that can hold every value.
# A hard-coded astype("int8") raises if any quantity is missing and silently
# corrupts values outside the int8 range (-128..127). Missing quantities are
# therefore set to 0 first, and pandas picks the width (int8 when the data fits).
df_purchases_clean["Quantity"] = pd.to_numeric(
    df_purchases_clean["Quantity"].fillna(0), downcast="integer"
)

### Adding columns


In [13]:
# Extract the calendar year from "Order Date" into its own column
# (uses the .dt accessor, so "Order Date" must already be datetime)
df_purchases_clean["Order Date Year"] = df_purchases_clean["Order Date"].dt.year

In [14]:
# Extract the month from "Order Date" as an ordered categorical
# (Jan < Feb < ... < Dec) so sorting and grouping follow calendar order.
#
# Labels are looked up from the numeric month instead of being produced with
# strftime("%b"), whose output depends on the process locale: under a
# non-English LC_TIME the formatted names would not match the categories
# below and the affected rows would silently become NaN.
month_labels = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "May",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
]
df_purchases_clean["Order Date Month"] = pd.Categorical(
    # month number 1..12 -> fixed English abbreviation; missing dates stay NaN
    df_purchases_clean["Order Date"].dt.month.map(
        dict(enumerate(month_labels, start=1))
    ),
    categories=month_labels,
    ordered=True,
)

In [15]:
# NA values: fill missing quantities BEFORE they are used, so the derived
# column below is computed from the cleaned values (a missing quantity then
# yields a total of 0 instead of NaN). Running this fill after the
# multiplication could never affect "Purchase Total".
df_purchases_clean["Quantity"] = df_purchases_clean["Quantity"].fillna(0)

# Create "Purchase Total" column: unit price multiplied by number of units
df_purchases_clean["Purchase Total"] = (
    df_purchases_clean["Purchase Price Per Unit"] * df_purchases_clean["Quantity"]
)

#### Dataset memory usage (after optimization)


In [16]:
# Bytes held by the index and by each column after the dtype changes;
# deep=True also counts the contents of any remaining object-dtype columns.
memory = df_purchases_clean.memory_usage(deep=True)

# Overall footprint converted from bytes to GB (2**30 bytes)
total_memory_gb = memory.sum() / 2**30
print(f"Total memory: {total_memory_gb:.2f} GB")

# Share of the total footprint taken by each column, in percent
memory_percent = memory.div(memory.sum()).mul(100)
print("\nMemory by column:", memory_percent, sep="\n")

Total memory: 0.44 GB

Memory by column:
Index                        0.000028
Order Date                   3.121104
Purchase Price Per Unit      3.121104
Quantity                     0.390138
Shipping Address State       0.391228
Title                       64.243764
ASIN/ISBN (Product Code)    21.951977
Category                     0.822100
Survey ResponseID            0.886549
Order Date Year              1.560552
Order Date Month             0.390353
Purchase Total               3.121104
dtype: float64


In [17]:
# Re-check the number of distinct IDs after the dtype conversions; it should
# match the count obtained before cleaning.
df_purchases_clean["Survey ResponseID"].nunique()

5027

### Create new dataset - page 1

In [18]:
# Columns kept for the exported dataset, in output order.
# "Title" and "Shipping Address State" are intentionally not listed.
selected_columns = [
    # identifier and order date parts
    "Survey ResponseID",
    "Order Date",
    "Order Date Year",
    "Order Date Month",
    # product category and amounts
    "Category",
    "Quantity",
    "Purchase Price Per Unit",
    "Purchase Total",
    # product code
    "ASIN/ISBN (Product Code)",
]
df_purchases_clean = df_purchases_clean.loc[:, selected_columns]

### Save dataset


In [19]:
# Write the prepared frame as Parquet (a typed columnar format).
# NOTE(review): confirm the category/datetime dtypes round-trip with the installed Parquet engine.
df_purchases_clean.to_parquet("data/ready/amazon_purchases.parquet")  # parquet format