In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
# Location of the raw e-commerce transaction CSV (adjacent literals are joined
# into one string, so the URL value is unchanged).
url_ecommerce = ('https://raw.githubusercontent.com/nullpitch-dev/hj_public/'
                 'master/ecommerce_transaction.csv')
# Read the CSV straight from the URL into the raw DataFrame.
data_ecommerce = pd.read_csv(filepath_or_buffer=url_ecommerce)

### [1]

In [3]:
# Parse the review creation date into datetimes on the raw frame itself.
data_ecommerce['review_creation_date'] = pd.to_datetime(
    data_ecommerce['review_creation_date'])
# NOTE: d1 is an alias of data_ecommerce (same object, not a copy), so the
# parsed column is visible through both names.
d1 = data_ecommerce

In [4]:
# Number of distinct ids whose review was created before 2017-01-01.
before_2017 = d1['review_creation_date'] < '2017-01-01'
print(f"{d1.loc[before_2017, 'id'].nunique()}")

95


In [5]:
# Drop the rows whose review date is 2000-01-01, then move every remaining
# review date earlier than 2017 forward by seven calendar years.
base = d1[d1['review_creation_date'] != '2000-01-01']
review_dates = base['review_creation_date']
# DateOffset(years=7) is calendar-aware: a 29 February is clipped to
# 28 February instead of producing an invalid date, and NaT stays NaT
# (rebuilding the date from a 'YYYYMMDD' string raised in both cases).
# normalize() resets the shifted values to midnight, matching a date rebuilt
# from year/month/day only.
shifted_dates = (review_dates + pd.DateOffset(years=7)).dt.normalize()
# Keep dates from 2017 onwards untouched; use the shifted value otherwise.
base = base.assign(review_creation_date=review_dates.where(
    review_dates.dt.year >= 2017, shifted_dates))

### [2]

In [6]:
# Parse the order and approval timestamps into datetimes; assign() returns a
# new frame, which is rebound to `base`.
base = base.assign(
    order_timestamp=pd.to_datetime(base['order_timestamp']),
    order_approved_time=pd.to_datetime(base['order_approved_time']))

In [7]:
# gap: whole seconds between order placement and approval, computed on the
# whole column at once instead of row by row. days/seconds are combined
# explicitly so sub-second parts are dropped, as before.
approval_delay = base['order_approved_time'] - base['order_timestamp']
base = base.assign(gap=approval_delay.dt.days * 24 * 3600 +
                       approval_delay.dt.seconds)
# pay: 'banking' when approval took at least one hour (3600 s), otherwise
# 'card'. A missing gap compares False and therefore falls into 'card'.
base = base.assign(pay=np.where(base['gap'] >= 3600, 'banking', 'card'))

In [8]:
# Non-null id count per payment label, kept as a one-column DataFrame.
cnt = base.groupby('pay')[['id']].count()
banking, card = (cnt.loc[method, 'id'] for method in ('banking', 'card'))

# Share of 'banking' orders, in percent with one decimal.
print(f"{(banking / (banking + card) * 100):.1f}")

36.1


### [3]

In [18]:
# Price per item: order price divided by ordered quantity.
base = base.assign(
    unit_price=lambda frame: frame['order_price'] / frame['order_qty'])

In [19]:
def price_range(p):
    """Map a price to its range label.

    Returns 'R1' for p < 50, 'R2' for p < 100, 'R3' for p < 500,
    'R4' for p < 5000 and 'ERROR' for p >= 5000. A value for which every
    comparison is False (e.g. NaN) yields None.
    """
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = ((50, 'R1'), (100, 'R2'), (500, 'R3'), (5000, 'R4'))
    for bound, label in upper_bounds:
        if p < bound:
            return label
    if p >= 5000:
        return 'ERROR'
    return None

In [20]:
# Label both the order price and the per-unit price with price_range().
base = base.assign(
    order_range=base['order_price'].apply(price_range),
    unit_range=base['unit_price'].apply(price_range))

In [21]:
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

def _one_way_anova(factor):
    """Fit `review_score ~ C(factor)` on `base` by OLS.

    Returns the formula string, the fitted model and its anova_lm table.
    """
    formula = f'review_score ~ C({factor})'
    fitted = ols(formula=formula, data=base).fit()
    return formula, fitted, anova_lm(fitted)


# Same analysis for the two price-range columns.
formula_order, model_order, result_order = _one_way_anova('order_range')
formula_unit, model_unit, result_unit = _one_way_anova('unit_range')

In [30]:
# F statistic of each ANOVA table.
f_order = result_order.loc['C(order_range)', 'F']
f_unit = result_unit.loc['C(unit_range)', 'F']

# Report the larger F with two decimal places. The 'f' presentation type is
# required: a bare '.2' means two significant digits, which would print
# 12.345 as '12' and 123.4 as '1.2e+02'.
print(f"{max(f_order, f_unit):.2f}")

0.66


In [37]:
from statsmodels.stats.multicomp import MultiComparison

# Tukey HSD pairwise comparison of review_score across order price ranges.
comparison_order = MultiComparison(data=base['review_score'],
                                   groups=base['order_range'])
tukey_order = comparison_order.tukeyhsd()
print(tukey_order)

# Same comparison across unit price ranges.
comparison_unit = MultiComparison(data=base['review_score'],
                                  groups=base['unit_range'])
tukey_unit = comparison_unit.tukeyhsd()
print(tukey_unit)

Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
  R1     R2   -0.0549  -0.2377 0.128  False 
  R1     R3   -0.0701  -0.2493 0.1091 False 
  R1     R4   -0.1908  -0.6075 0.226  False 
  R2     R3   -0.0152  -0.1917 0.1613 False 
  R2     R4   -0.1359  -0.5515 0.2797 False 
  R3     R4   -0.1207  -0.5347 0.2933 False 
--------------------------------------------
Multiple Comparison of Means - Tukey HSD,FWER=0.05
group1 group2 meandiff  lower  upper  reject
--------------------------------------------
  R1     R2    0.0099  -0.1694 0.1892 False 
  R1     R3    0.0314  -0.1471 0.2099 False 
  R1     R4   -0.0838  -0.5223 0.3547 False 
  R2     R3    0.0215  -0.1575 0.2006 False 
  R2     R4   -0.0937  -0.5325 0.345  False 
  R3     R4   -0.1152  -0.5537 0.3232 False 
--------------------------------------------


### [4]

In [40]:
# Parse the customer delivery date into datetimes; assign() returns a new
# frame, which is rebound to `base`.
base = base.assign(order_delivered_customer_date=pd.to_datetime(
    base['order_delivered_customer_date']))

In [41]:
# leadtime: time from approval to customer delivery, in fractional days
# (whole days plus the leftover seconds converted to a day fraction).
delivery_delay = (base['order_delivered_customer_date'] -
                  base['order_approved_time'])
base = base.assign(
    leadtime=delivery_delay.dt.days + delivery_delay.dt.seconds / 3600 / 24)

In [48]:
# wd: 'weekends' when the approval day-of-week is 5 or 6, else 'weekdays'.
approved_on_weekend = base['order_approved_time'].dt.dayofweek >= 5
base = base.assign(wd=np.where(approved_on_weekend, 'weekends', 'weekdays'))

In [71]:
# Only delivered orders are used for the regression.
d4 = base.loc[base['order_status'] == 'delivered']

# Split by the weekday/weekend label of the approval time.
d4_wd, d4_we = (d4.loc[d4['wd'] == part] for part in ('weekdays', 'weekends'))

# Feature: order_freight (kept 2-D as a one-column frame); target: leadtime.
train_X_wd, train_y_wd = d4_wd[['order_freight']], d4_wd['leadtime']
train_X_we, train_y_we = d4_we[['order_freight']], d4_we['leadtime']

In [72]:
from sklearn.linear_model import LinearRegression

# One linear model per subset: leadtime regressed on order_freight.
model_wd = LinearRegression()
model_wd.fit(train_X_wd, train_y_wd)
model_we = LinearRegression()
model_we.fit(train_X_we, train_y_we)

In [78]:
# Absolute difference between the weekday and weekend freight coefficients.
slope_gap = abs(model_wd.coef_[0] - model_we.coef_[0])
print(f"{slope_gap:.4f}")

0.0097
