In [3]:
import pandas as pd

In [4]:
df = pd.read_excel('data.xlsx', 'transaction')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   transaction_id    20000 non-null  int64         
 1   product_id        20000 non-null  int64         
 2   customer_id       20000 non-null  int64         
 3   transaction_date  20000 non-null  datetime64[ns]
 4   online_order      19640 non-null  object        
 5   order_status      20000 non-null  object        
 6   brand             19803 non-null  object        
 7   product_line      19803 non-null  object        
 8   product_class     19803 non-null  object        
 9   product_size      19803 non-null  object        
 10  list_price        20000 non-null  float64       
 11  standard_cost     19803 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(3), object(6)
memory usage: 1.8+ MB


In [6]:
df.columns

Index(['transaction_id', 'product_id', 'customer_id', 'transaction_date',
       'online_order', 'order_status', 'brand', 'product_line',
       'product_class', 'product_size', 'list_price', 'standard_cost'],
      dtype='object')

In [7]:
# Joining the chosen fields to the composite surrogate id
df['surrogate_id'] = df['product_id'].astype(str) + df['brand'].astype(str) + df['product_line'].astype(str) + df['product_class'].astype(str) + df['product_size'].astype(str) + df['list_price'].astype(str) + df['standard_cost'].astype(str)


In [8]:
# assigning unique integer values for surrogate ids
df['product_key_id'] = pd.factorize(df['surrogate_id'])[0]

In [9]:
df.head(20)

Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,surrogate_id,product_key_id
0,1,2,2950,2017-02-25,False,Approved,Solex,Standard,medium,medium,71.49,53.62,2SolexStandardmediummedium71.4953.62,0
1,2,3,3120,2017-05-21,True,Approved,Trek Bicycles,Standard,medium,large,2091.47,388.92,3Trek BicyclesStandardmediumlarge2091.47388.92,1
2,3,37,402,2017-10-16,False,Approved,OHM Cycles,Standard,low,medium,1793.43,248.82,37OHM CyclesStandardlowmedium1793.43248.82,2
3,4,88,3135,2017-08-31,False,Approved,Norco Bicycles,Standard,medium,medium,1198.46,381.1,88Norco BicyclesStandardmediummedium1198.46381.1,3
4,5,78,787,2017-10-01,True,Approved,Giant Bicycles,Standard,medium,large,1765.3,709.48,78Giant BicyclesStandardmediumlarge1765.3709.48,4
5,6,25,2339,2017-03-08,True,Approved,Giant Bicycles,Road,medium,medium,1538.99,829.65,25Giant BicyclesRoadmediummedium1538.99829.65,5
6,7,22,1542,2017-04-21,True,Approved,WeareA2B,Standard,medium,medium,60.34,45.26,22WeareA2BStandardmediummedium60.3445.26,6
7,8,15,2459,2017-07-15,False,Approved,WeareA2B,Standard,medium,medium,1292.84,13.44,15WeareA2BStandardmediummedium1292.8413.44,7
8,9,67,1305,2017-08-10,False,Approved,Solex,Standard,medium,large,1071.23,380.74,67SolexStandardmediumlarge1071.23380.74,8
9,10,12,3262,2017-08-30,True,Approved,WeareA2B,Standard,medium,medium,1231.15,161.6,12WeareA2BStandardmediummedium1231.15161.6,9


In [27]:
columns_to_products = ['product_key_id', 'product_id', 'brand', 'product_line', 'product_class', 'product_size', 'list_price', 'standard_cost']

products = df[columns_to_products].drop_duplicates()
products.info()

<class 'pandas.core.frame.DataFrame'>
Index: 399 entries, 0 to 19871
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_key_id  399 non-null    int64  
 1   product_id      399 non-null    int64  
 2   brand           203 non-null    object 
 3   product_line    203 non-null    object 
 4   product_class   203 non-null    object 
 5   product_size    203 non-null    object 
 6   list_price      399 non-null    float64
 7   standard_cost   203 non-null    float64
dtypes: float64(2), int64(2), object(4)
memory usage: 28.1+ KB


In [29]:
# exporting csv with products
products.to_csv('products.csv', index=False, columns=columns_to_products)

In [32]:
customers = pd.read_excel('data.xlsx', 'customer')

In [33]:
customers.head()

Unnamed: 0,customer_id,first_name,last_name,gender,DOB,job_title,job_industry_category,wealth_segment,deceased_indicator,owns_car,address,postcode,state,country,property_valuation
0,1,Laraine,Medendorp,F,1953-10-12 00:00:00,Executive Secretary,Health,Mass Customer,N,Yes,060 Morning Avenue,2016,New South Wales,Australia,10
1,2,Eli,Bockman,Male,1980-12-16 00:00:00,Administrative Officer,Financial Services,Mass Customer,N,Yes,6 Meadow Vale Court,2153,New South Wales,Australia,10
2,3,Arlin,Dearle,Male,1954-01-20 00:00:00,Recruiting Manager,Property,Mass Customer,N,Yes,0 Holy Cross Court,4211,QLD,Australia,9
3,4,Talbot,,Male,1961-10-03 00:00:00,,IT,Mass Customer,N,No,17979 Del Mar Point,2448,New South Wales,Australia,4
4,5,Sheila-kathryn,Calton,Female,1977-05-13 00:00:00,Senior Editor,,Affluent Customer,N,Yes,9 Oakridge Court,3216,VIC,Australia,9


In [37]:
# Check if each customer_id in df is in customers dataset
mask_missing_in_customers = ~df['customer_id'].isin(customers['customer_id'])
df[mask_missing_in_customers]



Unnamed: 0,transaction_id,product_id,customer_id,transaction_date,online_order,order_status,brand,product_line,product_class,product_size,list_price,standard_cost,surrogate_id,product_key_id
8707,8708,0,5034,2017-10-07,False,Approved,Solex,Road,medium,medium,416.98,312.735016,0SolexRoadmediummedium416.98312.7350159,281
16700,16701,0,5034,2017-01-27,False,Approved,Norco Bicycles,Standard,medium,medium,360.4,270.299988,0Norco BicyclesStandardmediummedium360.4270.29...,363
17468,17469,0,5034,2017-01-03,False,Approved,OHM Cycles,Road,medium,medium,742.54,667.400024,0OHM CyclesRoadmediummedium742.54667.4000244,372


In [44]:
# There is one missing customer (three transactions). The assignment condition allowed to export only some of the transactions, so let's drop those before export:
# exporting csv with transactions

df = df[~mask_missing_in_customers]

columns_to_transactions = ['transaction_id', 'product_key_id', 'customer_id', 'transaction_date', 'online_order', 'order_status']
df.to_csv('transactions.csv', index=False, columns=columns_to_transactions)


In [45]:
customers.to_csv('customers.csv', index=False)