<a href="https://colab.research.google.com/github/nullpitch-dev/DS_L1_Notebooks/blob/master/DS_L1_EX_05_2nd.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd

# Location of the raw e-commerce transaction CSV; the literal is split only to
# keep the line short, the resulting string is unchanged.
url = ('https://raw.githubusercontent.com/nullpitch-dev/hj_public/master/'
       'ecommerce_transaction.csv')

# Read the whole file into the frame every later cell starts from.
data = pd.read_csv(filepath_or_buffer=url)

In [73]:
# [1] datetime, relativedelta

from datetime import datetime
from dateutil.relativedelta import relativedelta

# Rows whose review date sorts before 2017 are the ones counted as wrong.
# review_creation_date is 'YYYY-MM-DD HH:MM:SS' text (the format parsed below),
# so plain string comparison orders the values chronologically.
data1 = data[data['review_creation_date'] < '2017-01-01']
wrong_cnt = data1['id'].count()  # count() ignores rows whose id is missing

print(f'Answer [1] : {wrong_cnt}')

# wrong data correction
# convert review_creation_date to datetime format
# One vectorised pd.to_datetime call replaces a Python-level strptime per row.
# NOTE: a missing value becomes NaT here instead of raising TypeError.
base = data.assign(
    dt_review=pd.to_datetime(data['review_creation_date'],
                             format='%Y-%m-%d %H:%M:%S'))

# delete 2000-01-01 records
# The comparison is against midnight exactly; an explicit Timestamp avoids
# relying on pandas parsing a string on the right-hand side.
base = base[base['dt_review'] != pd.Timestamp('2000-01-01')]

# correct 7 year error
def correct7(x):
    """Return ``x`` moved forward seven years when its year is before 2017.

    ``x`` must provide ``strftime`` and support adding a ``relativedelta``
    (e.g. ``datetime`` or a pandas ``Timestamp``). Values whose formatted
    year is '2017' or later are returned unchanged.
    """
    # Guard clause: nothing to fix for dates already in the valid range.
    if x.strftime('%Y') >= '2017':
        return x
    return x + relativedelta(years=7)

# Apply correct7 to each dt_review value. Mapping over the single column
# avoids building a full row object per record as DataFrame.apply(axis=1) does.
base = base.assign(dt_review=base['dt_review'].map(correct7))

# update review_creation_date with corrected dt_review
# The .dt accessor formats the whole column at once, in the same text layout
# the column was parsed from.
base = base.assign(
    review_creation_date=base['dt_review'].dt.strftime('%Y-%m-%d %H:%M:%S'))

Answer [1] : 95


In [74]:
# [2] timedelta

from datetime import timedelta

# convert order_timestamp and order_approved_time to datetime type
# Vectorised parsing replaces one strptime call per row. A missing value
# becomes NaT (and is then labelled 'card' below) instead of raising.
base = base.assign(
    dt_order=pd.to_datetime(base['order_timestamp'],
                            format='%Y-%m-%d %H:%M:%S'),
    dt_approved=pd.to_datetime(base['order_approved_time'],
                               format='%Y-%m-%d %H:%M:%S'))

# calculate timedelta (time from order to approval)
base = base.assign(approve_lt=base['dt_approved'] - base['dt_order'])

# mark card vs. banking
# An approval that took one hour or longer is labelled 'banking', else 'card'.
is_banking = base['approve_lt'] >= timedelta(hours=1)
base = base.assign(pay=is_banking.map({True: 'banking', False: 'card'}))

# calculate banking ratio (percentage of rows with a non-missing id)
ratio = (base.loc[base['pay'] == 'banking', 'id'].count()
         / base['id'].count() * 100)

print(f'Answer [2] : {ratio:.1f}')

Answer [2] : 36.1


In [75]:
# [3] ANOVA

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

# calculate unit price
# Column-wise division gives the same per-row quotient as the former row-wise
# apply without a Python call per record. A zero order_qty would yield
# inf/NaN here rather than raising.
base = base.assign(price=base['order_price'] / base['order_qty'])

# make price range
def priceRange(x):
    """Return the label of the price band that ``x`` falls into.

    Bands are half-open on the upper side: values below 50 map to '0-50',
    below 100 to '50-100', below 500 to '100-500' and below 5000 to
    '500-5000'. Anything that is not below 5000 (including NaN, for which
    every comparison is false) yields 'ERROR'.
    """
    # Upper bounds must stay in ascending order: the first one exceeding x wins.
    bands = (
        (50, '0-50'),
        (100, '50-100'),
        (500, '100-500'),
        (5000, '500-5000'),
    )
    for upper_bound, label in bands:
        if x < upper_bound:
            return label
    return 'ERROR'

# Bucket both price measures with priceRange. Series.map calls it once per
# value, which is all the former DataFrame.apply(axis=1) did, minus the cost
# of materialising each row.
base = base.assign(
    # price range based on order amount
    range_amt=base['order_price'].map(priceRange),
    # price range based on unit price
    range_price=base['price'].map(priceRange))

# perform ANOVA test
# One-way model per bucketing: review_score explained by the categorical band.
ols_amt = ols(formula='review_score ~ C(range_amt)', data=base).fit()
ols_price = ols(formula='review_score ~ C(range_price)', data=base).fit()

table_amt = anova_lm(ols_amt)
table_price = anova_lm(ols_price)

# First row of each ANOVA table belongs to the categorical factor.
f_amt = table_amt['F'].iloc[0]
f_price = table_price['F'].iloc[0]

# Report the larger of the two F statistics.
print(f'Answer [3] : {max(f_amt, f_price):.2f}')

Answer [3] : 0.66


In [77]:
# [4] datetime, timedelta, LinearRegression

from datetime import timedelta
from sklearn.linear_model import LinearRegression

# include only delivered transactions
# NOTE: this narrows `base` in place, so re-running this cell or cells above
# it out of order changes what `base` contains.
base = base[base['order_status'] == 'delivered']

# convert order_delivered_customer_date to datetime format
# Vectorised parsing instead of a strptime call per row; a missing value
# becomes NaT rather than raising TypeError.
base = base.assign(
    dt_delivered=pd.to_datetime(base['order_delivered_customer_date'],
                                format='%Y-%m-%d %H:%M:%S'))

# calculate delivery leadtime in days
def calcLT(x, y):
    """Return the time from ``x`` to ``y`` expressed in (fractional) days.

    The result is the whole-day part of ``y - x`` plus its seconds part
    converted to a fraction of a day; any microseconds part is not included.
    """
    elapsed = y - x
    day_fraction = elapsed.seconds / 60 / 60 / 24
    return elapsed.days + day_fraction

# Delivery lead time in days, from approval to delivery. This is the same
# days + seconds/60/60/24 arithmetic as calcLT, done on whole columns through
# the .dt accessor instead of one Python call per row.
lead = base['dt_delivered'] - base['dt_approved']
base = base.assign(delivery_lt=lead.dt.days + lead.dt.seconds / 60 / 60 / 24)

# mark weekdays or weekends
# dayofweek numbers Monday as 0, so Saturday/Sunday are 5 and 6; these are the
# days strftime('%w') reports as '6' and '0'.
is_weekend = base['dt_approved'].dt.dayofweek.isin([5, 6])
base = base.assign(wd=is_weekend.map({True: 'weekends', False: 'weekdays'}))

# make train set
# Filter each subset once, then take the single feature (kept 2-D for
# scikit-learn) and the target from it.
weekday_rows = base[base['wd'] == 'weekdays']
weekend_rows = base[base['wd'] == 'weekends']
train_x_wd = weekday_rows[['order_freight']]
train_x_we = weekend_rows[['order_freight']]
train_y_wd = weekday_rows['delivery_lt']
train_y_we = weekend_rows['delivery_lt']

# train one simple regression per subset: delivery_lt ~ order_freight
lr_wd = LinearRegression()
lr_we = LinearRegression()
model_wd = lr_wd.fit(train_x_wd, train_y_wd)
model_we = lr_we.fit(train_x_we, train_y_we)

# Absolute gap between the two order_freight slopes.
print(f'Answer [4] : {abs(model_wd.coef_[0] - model_we.coef_[0]):.4f}')

Answer [4] : 0.0097
