In [17]:
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np
import os
from pathlib import Path
from statsmodels.iolib.smpickle import load_pickle

## Train

In [2]:
# Directory layout, resolved relative to the notebook's working directory:
#   dpath -> <cwd>/../../data/02_intermediate   (train.csv / test.csv are read from here)
#   mpath -> <dpath>/../../prod-models          (the fitted model pickle is stored here)
cur_dir = os.getcwd()
dpath = Path(cur_dir).joinpath("..", "..", "data", "02_intermediate")
mpath = dpath.joinpath("..", "..", "prod-models")

In [3]:
# Columns kept for modelling; everything else in train.csv is discarded.
model_columns = ['style_code', 'name', 'date', 'release_month', 'release_dow', 'retail_price',
                 'brand', 'wmns', 'collab', 'retro', 'kids', 'resale_price']
# Flag columns that are cast to integers before being used in the formula.
flag_columns = ['wmns', 'collab', 'retro', 'kids']

# Load, keep the modelling columns, drop incomplete rows, renumber the index
# and cast the flag columns in one pass.
train = (
    pd.read_csv(dpath / 'train.csv')[model_columns]
    .dropna()
    .reset_index(drop=True)
    .astype({flag: 'int' for flag in flag_columns})
)

# The model is fitted on log prices, so add log-transformed copies of both price columns.
train = train.assign(
    log_resale=np.log(train['resale_price']),
    log_retail=np.log(train['retail_price']),
)

In [4]:
# NOTE(review): disabled scratch code — a manual split of `train` into label columns,
# target and feature columns. Nothing in this cell executes; the active cells fit the
# model through a formula on `train` instead.
# labels = train[['style_code','name','date']]
# y = train[['resale_price']]
# train = train[['release_month', 'release_dow','retail_price', 'brand', 'wmns', 'collab', 'retro', 'kids']]

In [5]:
# NOTE(review): disabled scratch code — a hand-built design matrix using pd.get_dummies
# for month, day-of-week and brand. Nothing in this cell executes; the formula cell wraps
# the same categorical columns in C(...) instead.
# month = pd.get_dummies(train['release_month'])
# day = pd.get_dummies(train['release_dow'])
# brand = pd.get_dummies(train['brand'])
# retail = train[['retail_price']]
# bools = train[['wmns', 'collab', 'retro', 'kids']]
# X = pd.concat([month, day, brand, retail, bools],axis=1)

In [6]:
# Model specification: log resale price regressed on categorical flags, brand,
# release day-of-week, release month, and log retail price.
predictors = [
    "C(collab)",
    "C(retro)",
    "C(kids)",
    "C(wmns)",
    "C(brand)",
    "C(release_dow)",
    "C(release_month)",
    "log_retail",
]
formula = "log_resale ~ " + " + ".join(predictors)

In [7]:
# Build the OLS model from the formula and the prepared training frame, then fit it.
ols_model = smf.ols(formula=formula, data=train)
lm = ols_model.fit()

In [8]:
# Persist the fitted results object so the test section can reload it.
model_file = mpath / 'resale_predictor.pickle'
lm.save(model_file)

## Test

In [18]:
# Reload the fitted model that the training section wrote to disk.
# NOTE: unpickling can execute arbitrary code — only load model files you trust.
model_file = mpath / "resale_predictor.pickle"
lm = load_pickle(model_file)

In [19]:
# Load the hold-out / scoring data and prepare the columns the fitted formula expects.
test = pd.read_csv(dpath / 'test.csv')

# Cast the flag columns to integers, mirroring the training preparation.
for flag in ('wmns', 'collab', 'retro', 'kids'):
    test[flag] = test[flag].astype('int')

# FIX: the log features must come from `test` itself. The previous code took them from
# `train`, so pandas index-aligned *training* prices onto the test rows (NaN wherever
# the test index ran past the training index) and predictions used the wrong regressor.
# The log is taken before the integer cast below, so it sees the values as read from the CSV.
test['log_retail'] = np.log(test['retail_price'])

# `log_resale` is only a reference column here. It is derived when the observed resale
# price is present in test.csv and skipped otherwise (e.g. when scoring unreleased items).
if 'resale_price' in test.columns:
    test['log_resale'] = np.log(test['resale_price'])

# Retail price is exported alongside the predictions as a whole number.
test['retail_price'] = test['retail_price'].astype('int')

In [20]:
# Identifying columns carried through to the output next to the prediction.
label_columns = ['style_code', 'name', 'date', 'retail_price']
labels = test[label_columns]
# Disabled scratch code (manual dummy encoding of the test set), kept for reference:
# month = pd.get_dummies(test['release_month'])
# day = pd.get_dummies(test['release_dow'])
# brand = pd.get_dummies(test['brand'])
# retail = test[['retail_price']]
# bools = test[['wmns', 'collab', 'retro', 'kids']]
# X_test = pd.concat([month, day, brand,retail, bools],axis=1)

In [21]:
# NOTE(review): disabled scratch code — would zero-fill any dummy column present in X
# but missing from X_test. Both X and X_test are only defined in commented-out cells,
# so nothing in this cell executes.
# for cname in list(X.columns):
#     if cname not in list(X_test.columns):
#         X_test[cname] = 0

In [22]:
# The model predicts log resale price; exponentiate to get back to the price scale
# and cast to integers for the exported column.
log_predictions = lm.predict(test)
lm_resale = pd.DataFrame({'pred_resale_price': np.exp(log_predictions).astype('int')})

In [23]:
# Place the predictions beside the label columns (aligned on the row index),
# then order the rows by the `date` column.
output = pd.concat([labels, lm_resale], axis=1).sort_values(by='date')

In [24]:
# Write the labelled predictions to <dpath>/../../data/07_model_output/output.csv,
# without the row index.
opath = dpath.joinpath("..", "..", "data", "07_model_output")
output.to_csv(opath / 'output.csv', index=False)