In [13]:
# import những thư viện cần thiết

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # line, bar, scatter, histogram, ...
import seaborn as sns #boxplot, heatmap, ...

# Display setup for readable output: show every column, print floats with
# three decimals, and apply the whitegrid plot style with a pastel palette.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:.3f}'.format
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('pastel')

In [14]:
# Load the raw data.
# The file was first read with the default delimiter; printing df showed that
# the values are not comma-separated (CSV) but tab-separated (TSV), so the
# separator is passed explicitly.
# NOTE(review): this is an absolute workspace path, so the notebook only runs
# where that path exists -- consider a configurable data directory.
df = pd.read_csv(
    '/workspaces/adorie-data-projects/customer-personality-analysis/data/raw/marketing_campaign.csv',
    sep='\t',
)

In [12]:
# look at some basic information about the dataframe
# number of rows, number of columns
df.shape

(2240, 29)

In [20]:
# data type of each column
df.dtypes

ID                       int64
Year_Birth               int64
Education               object
Marital_Status          object
Income                 float64
Kidhome                  int64
Teenhome                 int64
Dt_Customer             object
Recency                  int64
MntWines                 int64
MntFruits                int64
MntMeatProducts          int64
MntFishProducts          int64
MntSweetProducts         int64
MntGoldProds             int64
NumDealsPurchases        int64
NumWebPurchases          int64
NumCatalogPurchases      int64
NumStorePurchases        int64
NumWebVisitsMonth        int64
AcceptedCmp3             int64
AcceptedCmp4             int64
AcceptedCmp5             int64
AcceptedCmp1             int64
AcceptedCmp2             int64
Complain                 int64
Z_CostContact            int64
Z_Revenue                int64
Response                 int64
dtype: object

In [21]:
# preview the first rows of the dataframe
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,88,546,172,88,88,3,8,10,4,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,1,6,2,1,6,2,1,1,2,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,49,127,111,21,42,1,8,2,10,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,4,20,10,3,5,2,2,0,4,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,43,118,46,27,15,5,5,3,6,5,0,0,0,0,0,0,3,11,0


In [None]:
# The columns / attributes can be split into groups
# (Z_CostContact and Z_Revenue are also in the dataframe but are not described below):

# Customer group:
# ID: Customer's unique identifier
# Year_Birth: Customer's birth year
# Education: Customer's education level -- CATEGORICAL
# Marital_Status: Customer's marital status -- CATEGORICAL
# Income: Customer's yearly household income
# Kidhome: Number of children in customer's household
# Teenhome: Number of teenagers in customer's household
# Dt_Customer: Date of customer's enrollment with the company
# Recency: Number of days since customer's last purchase
# Complain: 1 if the customer complained in the last 2 years, 0 otherwise -- CATEGORICAL

# Product group:
# MntWines: Amount spent on wine in last 2 years
# MntFruits: Amount spent on fruits in last 2 years
# MntMeatProducts: Amount spent on meat in last 2 years
# MntFishProducts: Amount spent on fish in last 2 years
# MntSweetProducts: Amount spent on sweets in last 2 years
# MntGoldProds: Amount spent on gold in last 2 years

# Promotion campaign group:
# NumDealsPurchases: Number of purchases made with a discount
# AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise -- CATEGORICAL
# AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise -- CATEGORICAL
# AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise -- CATEGORICAL
# AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise -- CATEGORICAL
# AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise -- CATEGORICAL
# Response: 1 if customer accepted the offer in the last campaign, 0 otherwise -- CATEGORICAL

# Purchase channel group:
# NumWebPurchases: Number of purchases made through the company's website
# NumCatalogPurchases: Number of purchases made using a catalogue
# NumStorePurchases: Number of purchases made directly in stores
# NumWebVisitsMonth: Number of visits to company's website in the last month

In [36]:
# the Dt_Customer column holds dates written as day-month-year (d-m-Y);
# parse it into datetime values using that explicit format
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format = "%d-%m-%Y")

In [42]:
# Split the enrollment date into separate year / month / day columns
# (Dt_year, Dt_month, Dt_day) for time-based analysis.
for date_part in ('year', 'month', 'day'):
    df[f'Dt_{date_part}'] = getattr(df['Dt_Customer'].dt, date_part)

In [31]:
# Check whether any value in the ID column appears more than once
# and report the result.
check_duplicated = df['ID'].duplicated().any()
print('Có dữ liệu trùng lặp' if check_duplicated else 'Không có dữ liệu trùng lặp')

Không có dữ liệu trùng lặp
