# 订单、定损单、商品库数据
人保订单，德系、日系、美系、韩系车100+品牌，8000 OE，主要是保险杠和大灯，202001--202006数据

In [None]:
# 初始化
%matplotlib inline
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
# Default font sizes for plots: axis labels at 14pt, x/y tick labels at 12pt.
for rc_group, label_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=label_size)

# 加载数据

In [None]:
# Data loading: column selections and CSV locations.
#
# "part_type_code" is the part quality code:
#   01 original factory part; 02 brand part; 03 OE-supplier part;
#   04 circulating original part; 05 OE-supplier brand part;
#   06 aftermarket brand part; 07 economy part; 08 remanufactured part;
#   09 dismantled/reused part

# Columns kept from the orders CSV.
orders_column = [
    'id',                     # primary key
    'manufacturer',           # vehicle manufacturer
    'part_name',              # part name
    'regularize_oe',          # OE number with symbols stripped
    'part_type_code',         # quality code (see list above)
    'part_price',             # part price
    'data_source',            # data source
    'data_source_idx',        # unique identifier within the source
    'data_vehicletype_name',  # original vehicle model name
    'province_name',          # province
]

# Columns kept from the damage-assessment CSV.
# NOTE(review): the original column note also listed an assessment creation
# time as the last field, but no such column is selected here.
damage_column = [
    'id',                     # primary key
    'manufacturer',           # vehicle manufacturer
    'part_name',              # part name
    'regularize_oe',          # OE number with symbols stripped
    'part_type_code',         # quality code (see list above)
    'reference_type_name',    # reference type of the assessed price
    'reference_price',        # claim reference price
    'damage_price',           # assessed damage price
    'verify_price',           # verified loss price
    'local_price',            # localized price
    'guide_price',            # "disc" (catalogue) price
    'ms_retail_price',        # 4S dealer price
    'data_source',            # data source
    'data_source_idx',        # unique identifier within the source
    'data_vehicletype_name',  # original vehicle model name
    'province_name',          # province
]

# Columns kept from the enquiry CSV.
enquiry_column = [
    'id',                     # primary key
    'manufacturer',           # vehicle manufacturer
    'part_name',              # part name
    'regularize_oe',          # OE number with symbols stripped
    'part_type_code',         # quality code (see list above)
    'part_price',             # part price
    'data_source',            # data source
    'data_source_idx',        # unique identifier within the source
    'data_vehicletype_name',  # original vehicle model name
    'province_name',          # province
    'enquiry_comfig_status',  # whether the quote was selected
]

# Input files (2020 Q1-Q2 extracts).
orders_csv = "data/order-2020Q1Q2.csv"
damage_csv = "data/damage-2020Q1Q2.csv"
enquiry_csv = "data/enquiry-2020Q1Q2.csv"

def load_data(csv_path, columns):
    """Read a CSV, keep the requested columns, and return only quality-code-1 rows.

    Args:
        csv_path: location of the CSV file, passed straight to ``pd.read_csv``.
        columns: list of column names to keep, in the order they should appear.
            Must include ``'part_type_code'``.

    Returns:
        A DataFrame restricted to ``columns`` whose ``part_type_code`` equals 1.
    """
    selected = pd.read_csv(csv_path)[columns]
    # Compared against the integer 1, so the code column must have been parsed
    # as a number by read_csv.
    has_code_one = selected['part_type_code'] == 1
    return selected[has_code_one]

# Load the three data sets, each already filtered by load_data.
df_orders = load_data(orders_csv, orders_column)
df_damage = load_data(damage_csv, damage_column)
df_enquiry = load_data(enquiry_csv, enquiry_column)
# Uncomment any of the following for a quick look at a loaded frame:
#print(df_orders.describe())
#print(df_orders['regularize_oe'].value_counts())
#print(df_damage.describe())
#print(df_damage['regularize_oe'].value_counts())
#print(df_enquiry.describe())
#print(df_enquiry['regularize_oe'].value_counts())

# Print the most frequent OE numbers in the orders data with their row counts.
# value_counts() returns a Series sorted by descending count, so head(20) is
# the top 20. It simply yields fewer rows when there are fewer than 20
# distinct OE numbers, and items() avoids integer-position lookups on a
# label-indexed Series.
tmp_kv = df_orders['regularize_oe'].value_counts()
for oe, count in tmp_kv.head(20).items():
    print(oe, count)

# 几个函数

In [None]:
# 几个函数
import math

# 1. Emptiness check
# isEmpty(o): returns True when `o` holds no data, False otherwise.
def isEmpty(o):
    """Return True if ``o`` is None or contains nothing, else False.

    Handles three kinds of input:
      * ``None`` is empty.
      * Objects exposing a non-callable ``empty`` attribute (pandas
        DataFrame/Series) are judged by that attribute, because their plain
        truth value raises ``ValueError``.
      * Anything else is judged by ``len(o) == 0``, falling back to ordinary
        truthiness for objects without a length (e.g. numbers).
    """
    if o is None:
        return True
    empty_flag = getattr(o, 'empty', None)
    # Skip callables such as queue.Queue.empty, which is a method, not a flag.
    if empty_flag is not None and not callable(empty_flag):
        return bool(empty_flag)
    try:
        return len(o) == 0
    except TypeError:
        return not o

# 2. anyNan(numList): NaN detection
def anyNan(numList):
    """Return True if at least one value in ``numList`` is NaN, else False.

    Each value is tested with ``math.isnan``; scanning stops at the first NaN.
    """
    return any(math.isnan(value) for value in numList)

# 3. anyZero(numList): zero detection
def anyZero(numList):
    """Return True if at least one value in ``numList`` equals zero, else False.

    A single ``== 0`` comparison covers both integer ``0`` and float ``0.0``.
    The bitwise ``|`` operator must not be used to combine comparisons: it
    binds tighter than ``==`` and fails on float operands.
    """
    return any(num == 0 for num in numList)

# 4. Box-plot (interquartile range) outlier filter
# Values inside [Q1 - 1.5*(Q3-Q1), Q3 + 1.5*(Q3-Q1)] are kept;
# anything outside that range is treated as an outlier and dropped.
# Returns: the filtered data set and the number of rows removed.
def cleanData_4p(data, col):
    """Drop rows whose ``col`` value lies outside the box-plot whiskers.

    Args:
        data: pandas DataFrame to filter (``None`` is passed through).
        col: name of the numeric column the quartiles are computed on.

    Returns:
        A ``(frame, removed)`` tuple: the filtered DataFrame and how many rows
        were removed. The input is returned unchanged with ``removed == 0``
        when it is ``None``, has fewer than 3 rows, or the quartiles cannot be
        computed (NaN).
    """
    # A DataFrame cannot be truth-tested directly, so check explicitly.
    if data is None:
        return data, 0
    o_size = data.shape[0]  # number of rows before filtering
    if o_size < 3:
        return data, 0

    f_q1 = data[col].quantile(0.25)
    f_q3 = data[col].quantile(0.75)
    delta_q31 = f_q3 - f_q1
    # Lower and upper whisker bounds.
    b_edge = f_q1 - 1.5 * delta_q31
    t_edge = f_q3 + 1.5 * delta_q31

    # quantile() yields NaN when the column has no usable values; filtering
    # against NaN bounds would wipe out every row, so leave the data untouched.
    if any(math.isnan(v) for v in (f_q1, f_q3, delta_q31, b_edge, t_edge)):
        return data, 0

    in_range = (data[col] <= t_edge) & (data[col] >= b_edge)
    data_return = data[in_range]
    return data_return, o_size - data_return.shape[0]

# 5. Iterative cleaning: returns the cleaned data set
def loopCleanData(func, data, col):
    """Apply ``func`` repeatedly until it no longer removes any rows.

    Args:
        func: cleaning function called as ``func(frame, col)`` and returning a
            ``(cleaned_frame, removed_row_count)`` tuple.
        data: pandas DataFrame to clean (``None`` or an empty frame is
            returned unchanged).
        col: column name forwarded to ``func``; the first row's value in this
            column is also printed in the progress header.

    Returns:
        The frame produced by the last pass of ``func``.
    """
    # A DataFrame cannot be truth-tested directly, so check explicitly.
    if data is None or data.shape[0] == 0:
        return data
    print("************开始清理: col key: %s*************" % data.iloc[0][col])
    print("Step 1.当前数据集记录数：%s" % data.shape[0])
    print("Step 2.处理数据：")
    data_new = data
    cnt = 0
    while True:
        cnt += 1
        # Feed the previous pass's output back in; re-cleaning the original
        # input would return the same count forever and never terminate.
        data_new, effectRows = func(data_new, col)
        print("==第%s次循环清理记录数：%s" % (cnt, effectRows))
        if effectRows == 0:
            break
    return data_new
    

In [None]:
# Pick one OE number and pull its rows from each data set for inspection.
sample_oe = '521190Z954'

# Orders for the sample OE, ordered by manufacturer, part name, then price.
orders_mask = df_orders['regularize_oe'] == sample_oe
sample_df_orders = df_orders[orders_mask].sort_values(by=['manufacturer', 'part_name', 'part_price'])

# Damage assessments for the sample OE, limited to the '市场原厂' reference type.
damage_mask = (df_damage['regularize_oe'] == sample_oe) & (df_damage['reference_type_name'] == '市场原厂')
sample_df_damage = df_damage[damage_mask].sort_values(by=['manufacturer', 'part_name', 'damage_price'])

# Enquiries for the sample OE, ordered the same way as the orders.
enquiry_mask = df_enquiry['regularize_oe'] == sample_oe
sample_df_enquiry = df_enquiry[enquiry_mask].sort_values(by=['manufacturer', 'part_name', 'part_price'])

In [None]:
# Summarize the order prices for the sample OE before cleaning.
print("==========sample_df_orders['%s']==========" % sample_oe)
print(sample_df_orders['part_price'].describe())
# Outliers are judged on the numeric price column; the OE column holds
# identifier strings, on which quartiles cannot be computed.
sample_df_orders = loopCleanData(cleanData_4p, sample_df_orders, 'part_price')

In [None]:
# Print summary statistics of the price column for each sample data set.
for header, prices in (
    ("==========sample_df_orders['%s']==========", sample_df_orders['part_price']),
    ("==========sample_df_damage['%s']==========", sample_df_damage['damage_price']),
    ("==========sample_df_enquiry['%s']==========", sample_df_enquiry['part_price']),
):
    print(header % sample_oe)
    print(prices.describe())