In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import re
import warnings
warnings.filterwarnings('ignore')

# Data Import

In [2]:
data_1 = pd.read_csv('./data/2019.csv', index_col=None, encoding = 'gb18030')
data_2 = pd.read_csv('./data/2013.08-2018.12.csv', index_col=None, encoding = 'gb18030')
data_3 = pd.read_csv('./data/2010.01-2013.08.csv', index_col=None, encoding = 'gb18030')

In [3]:
print(f'The shape of data1 is {data_1.shape}, shape of data2 is {data_2.shape}, shape of data3 is {data_3.shape},')

The shape of data1 is (2645, 61), shape of data2 is (92256, 30), shape of data3 is (45625, 30),


In [21]:
data = pd.concat([data_1.loc[:,data_2.columns], data_2, data_3], ignore_index=True)
data = data.rename(columns={'订单编号':'OrderID', '买家会员名':'MemberName', '买家支付宝账号':'AlipayAccount',
                            '买家应付货款':'OwnedAmount', '买家应付邮费':'DeliveryFee', '买家支付积分':'PointsPay',
                            '总金额':'Total', '返点积分':'PointsEarned', '买家实际支付金额':'DirectPay' ,
                            '买家实际支付积分':'PointsPay', '订单状态':'OrderStatus', '买家留言':'BuyerMessage',
                            '收货人姓名':'ReceiverName', '收货地址 ':'ReceivingAdd', '运送方式':'DeliverMethod',
                            '联系电话 ':'ContactPhone', '联系手机':'CellPhone', '订单创建时间':'OrderTime',
                            '订单付款时间 ':'PaytTime', '宝贝标题 ':'ItemName', '宝贝种类 ':'ItemCate',
                            '物流单号 ':'DeliverNo', '物流公司':'DeliverCompany', '订单备注':'OrderNote', 
                            '宝贝总数量':'OrderItemNo', '店铺Id':'StoreID', '店铺名称':'StoreName', 
                            '确认收货时间':'DeliveredTime', '打款商家金额':'StoreReceived', '是否村淘订单':'CunTaoOrder'
                            })

In [22]:
data = data.drop(data.loc[(data['OrderTime']>'2019-08-01 00:00:00') |\
                          (data['OrderStatus'] == '卖家已发货，等待买家确认')].index,axis=0)

In [23]:
data.shape

(140068, 30)

# Data Clean

- There are total 13 columns with missing data:  
        AlipayAccount, BuyerMessage, ReceiverName, DeliverMethod, ContactPhone,  
        CellPhone, PaytTime, DeliverNo, DeliverCompany, OrderNote,  
        StoreID, StoreName, DeliveredTime  

In [24]:
np.sum(data.isnull(),axis=0)

OrderID                0
MemberName             0
AlipayAccount       1064
OwnedAmount            0
DeliveryFee            0
PointsPay              0
Total                  0
PointsEarned           0
DirectPay              0
PointsPay              0
OrderStatus            0
BuyerMessage      124843
ReceiverName           1
ReceivingAdd           0
DeliverMethod          1
ContactPhone      113075
CellPhone            962
OrderTime              0
PaytTime           27044
ItemName             404
ItemCate               0
DeliverNo          32048
DeliverCompany     32048
OrderNote         122258
OrderItemNo            0
StoreID           136952
StoreName          54394
DeliveredTime      32030
StoreReceived          0
CunTaoOrder            0
dtype: int64

In [25]:
# Create Province & City
data.loc[156, 'ReceivingAdd'] = '海南省 文昌市 文城镇 海南省文昌市文昌中学'
data.loc[86947, 'ReceivingAdd'] = '四川省 成都市 天府新区 华阳街道音乐广场贝康宠物医院(610213)'
data['Province'] = list(map(lambda s: s.split(' ')[0], data['ReceivingAdd']))
data['City'] = list(map(lambda s: s.split(' ')[1], data['ReceivingAdd']))

- There is no record missing CellPhone, ContactPhone, AlipayAccount at the same time
 - CellPhone 962 missing values, fill with ContactPhone
 - AlipayAccount 1601 missing values, fill with CellPhone, ContactPhone

In [26]:
data['CellPhone'] = data['CellPhone'].fillna(data['ContactPhone'])
data['AlipayAccount'] = data['AlipayAccount'].fillna(data['CellPhone'])

- Missing BuyerMessage and OrderNote means customers left no message
 - BuyerMessage 124885 missing values, fill with 'No'
 - OrderNote 22259 missing values, fill with 'No'
- ItemName 404 missing values, fill with 'No'

In [54]:
data['BuyerMessage'] = data['BuyerMessage'].fillna('No')
data['OrderNote'] = data['OrderNote'].fillna('No')
data['OrderNote'] = data['OrderNote'].apply(lambda s: 'No' if s=="'null" else s)
data['ItemName'] = data['ItemName'].fillna('No')

- There is one record with no information, delete it

In [28]:
data[data['ReceiverName'].isna()] 

Unnamed: 0,OrderID,MemberName,AlipayAccount,OwnedAmount,DeliveryFee,PointsPay,Total,PointsEarned,DirectPay,PointsPay.1,OrderStatus,BuyerMessage,ReceiverName,ReceivingAdd,DeliverMethod,ContactPhone,CellPhone,OrderTime,PaytTime,ItemName,ItemCate,DeliverNo,DeliverCompany,OrderNote,OrderItemNo,StoreID,StoreName,DeliveredTime,StoreReceived,CunTaoOrder,Province,City
73610,1053302034007093.0,clearcatnn,clearcatnn@hotmail.com,165.0,0.0,0.0,165.0,82.0,165.0,0.0,交易成功,No,,,,,,2015-05-17 16:02:14,2015-05-17 16:02:34,科博磁力棒磁铁儿童益智玩具男女孩智力拼装玩具百变早教磁力积木,1.0,,,No,1.0,,母婴惠童专营店,2015-05-22 18:42:27,165.00元,否,,
156,,,,,,,,,,,,No,,海南省 文昌市 文城镇 海南省文昌市文昌中学,,,,,,,,,,No,,,,,,,海南省,文昌市


In [29]:
data = data[~data['ReceiverName'].isna()]

- Records with missing PaytTime and DeliveredTime are all from closed orders, which means the customers closed the orders before paying.
 - PaytTime 26640 missing values(all included in closed orders), fill with 0
 - DeliveredTime 31516 missing values(all included in closed orders), fill with 0

In [30]:
t1 = data[(data['PaytTime'].isnull()) & (data['OrderStatus'] == '交易成功')].shape[0]
t2 = data[(data['DeliveredTime'].isnull()) & (data['OrderStatus'] == '交易成功')].shape[0]
print(f'The number of success order without PaytTime is {t1}, the number of success order without DeliveredTime is {t2}')

The number of success order without PaytTime is 0, the number of success order without DeliveredTime is 0


In [31]:
data['PaytTime'] = data['PaytTime'].fillna(0)
data['DeliveredTime'] = data['DeliveredTime'].fillna(0)

- ContactPhone, StoreID, StoreName, DeliverNo are useless columns, delete

In [32]:
data = data.drop(columns = (['ContactPhone', 'StoreID','StoreName', 'DeliverNo']), axis = 0)

- Check the ItemName and OrderNote, delete records which just pay for delivery fee.
- Impute DeliverCompany with mode in every province
- After imputation, the left 29693 missing in DeliverCompany are from closed orders, fill with No

In [33]:
data = data[~(data['ItemName'] == '邮费运费链接补差价专拍')]

In [34]:
def delete_fake_orders(df, key_words):
    for word in key_words:
        tmp = df[(df['DeliverCompany'].isnull()) & (df['OrderStatus'] == '交易成功')]
        df = df.drop(tmp[tmp['OrderNote'].apply(lambda x : re.search(word, x) != None)].index, axis = 0)
    return df

def impute_deliver_company(df):
    impute_index = df[(df['DeliverCompany'].isnull())&(df['OrderStatus'] == '交易成功')].index
    for index in impute_index:
        df.loc[index, 'DeliverCompany'] = df[(~df['DeliverCompany'].isnull())&(df['Province'] == df.loc[index].Province)].DeliverCompany.mode()[0]
    return df

In [35]:
data = delete_fake_orders(data, ['邮','费','差','价','运','费','补','重复','不要','不发','关闭'])
data = impute_deliver_company(data)
data['DeliverCompany'] = data['DeliverCompany'].fillna('No')

In [55]:
data.isnull().values.any()

False

**Create time features**

In [56]:
data['OrderTime'] = pd.to_datetime(data['OrderTime'])
data['PaytTime'] = pd.to_datetime(data['PaytTime'])
data['DeliveredTime'] = pd.to_datetime(data['DeliveredTime'])

In [57]:
# create new col: time window between PaytTime & OrderTime
data['OrderToPay(s)'] = (data['PaytTime'] - data['OrderTime']).astype('timedelta64[s]')
# create new col: time 
data['PayToReceive(h)'] = (data['DeliveredTime'] - data['PaytTime']).astype('timedelta64[h]')

In [58]:
data.head(3)

Unnamed: 0,OrderID,MemberName,AlipayAccount,OwnedAmount,DeliveryFee,PointsPay,Total,PointsEarned,DirectPay,PointsPay.1,OrderStatus,BuyerMessage,ReceiverName,ReceivingAdd,DeliverMethod,CellPhone,OrderTime,PaytTime,ItemName,ItemCate,DeliverCompany,OrderNote,OrderItemNo,DeliveredTime,StoreReceived,CunTaoOrder,Province,City,OrderToPay(s),PayToReceive(h)
454,"=""561099105503109255""",雷霆3战机,342239175@qq.com,154.84,0.0,0.0,154.84,77.0,154.84,0.0,交易成功,No,陈丹,福建省 福州市 仓山区 东升街道东园路10号金辉莱茵城14号楼（东辉社区）(000000),快递,'13405907147,2019-07-31 23:28:46,2019-07-31 23:28:58,迪宝乐电子积木儿童益智拼装电路玩具男女孩物理教科书5-6-12岁,1.0,韵达快递,No,1.0,2019-08-09 22:38:05,154.84元,否,福建省,福州市,12.0,215.0
455,"=""561103267468252376""",konglingsheng888,kls666@sina.com,109.0,0.0,0.0,109.0,54.0,109.0,0.0,交易成功,No,孔令晟,安徽省 铜陵市 铜官山区 铜官山区虚镇秀水山庄4栋401(244000),快递,'13856271285,2019-07-31 22:58:03,2019-07-31 22:58:30,迪士尼书包小学生男1-3-4年级汽车麦昆卡通儿童护脊双肩背包,1.0,圆通速递,No,1.0,2019-08-03 19:47:38,109.00元,否,安徽省,铜陵市,27.0,68.0
456,"=""561084450761550335""",更好的明天1992,18067226706,103.0,0.0,0.0,103.0,51.0,103.0,0.0,交易成功,No,吴文洁,河南省 商丘市 睢县 城关镇西门里赵家胡同(000000),快递,'18067222105,2019-07-31 22:55:06,2019-07-31 22:55:41,迪宝乐电子积木儿童益智拼装电路玩具男女孩物理教科书5-6-12岁,1.0,韵达快递,No,1.0,2019-08-11 08:32:16,103.00元,否,河南省,商丘市,35.0,249.0


Notes from 2019.csv: 
1. 订单编号 is unique   ???
2. '总金额' = '买家应付货款' + '买家应付邮费', (32 records with 买家应付邮费)
3. '买家实际支付金额' =  '总金额' -  '退款金额'   
4. '返点积分' = 0.5 * '买家实际支付金额' + '买家支付积分'
5. '打款商家金额' = '买家实际支付金额'

In [59]:
data[data['OrderID'] == 403137289407813]

Unnamed: 0,OrderID,MemberName,AlipayAccount,OwnedAmount,DeliveryFee,PointsPay,Total,PointsEarned,DirectPay,PointsPay.1,OrderStatus,BuyerMessage,ReceiverName,ReceivingAdd,DeliverMethod,CellPhone,OrderTime,PaytTime,ItemName,ItemCate,DeliverCompany,OrderNote,OrderItemNo,DeliveredTime,StoreReceived,CunTaoOrder,Province,City,OrderToPay(s),PayToReceive(h)
94900,403137289407813,天涯海角516087,13584860290,78.0,0.0,0.0,78.0,39.0,78.0,0.0,交易成功,No,胡国艳,江苏省 苏州市 园区 唯亭镇青苑四区14栋2单元(215121),快递,'13782986509,2013-08-19 22:29:27,2013-08-19 22:31:20,和乐虎早教机故事机幼儿和乐熊和乐族儿童益智玩具会说话的巧虎,1.0,申通E物流,No,1.0,2013-08-21 19:00:32,78.00元,否,江苏省,苏州市,113.0,44.0
94901,403137289407813,天涯海角516087,13584860290,78.0,0.0,0.0,78.0,39.0,78.0,0.0,交易成功,No,胡国艳,江苏省 苏州市 园区 唯亭镇青苑四区14栋2单元(215121),快递,'13782986509,2013-08-19 22:29:27,2013-08-19 22:31:20,和乐虎早教机故事机幼儿和乐熊和乐族儿童益智玩具会说话的巧虎,1.0,申通E物流,No,1.0,2013-08-21 19:00:32,78.00元,否,江苏省,苏州市,113.0,44.0


In [60]:
data = data.drop_duplicates()

In [61]:
data.shape

(139771, 30)

In [62]:
data.to_csv('./data/after_clean.csv', index = False, encoding = 'gb18030')