In [2]:
import datetime as dt

import matplotlib as mlt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

In [48]:
# Read the raw train and test transaction files into DataFrames.
train, test = (pd.read_csv(csv_path) for csv_path in ("train_pos.csv", "test_pos.csv"))

In [49]:
def _to_daily_store_sales(transactions):
    """Aggregate raw transactions into one row per (date, store_id).

    Steps, applied identically to the train and test frames:
      1. drop the columns that are not used downstream,
      2. sum `amount` per day and per store,
      3. convert `date` to datetime,
      4. keep a copy of the date in `temp_date`,
      5. move `date` into the index.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Raw frame containing at least `date`, `store_id`, `amount` and the
        dropped columns (`drop` raises KeyError if one of them is missing).

    Returns
    -------
    pandas.DataFrame
        Indexed by `date`, with columns `store_id`, `amount`, `temp_date`.
    """
    daily = (
        transactions
        .drop(columns=['time', 'installments', 'days_of_week', 'card_id', 'holyday'])
        .groupby(['date', 'store_id'])
        .agg({'amount': 'sum'})
        .reset_index()
    )

    # `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
    # now the default behaviour), so the argument is no longer passed.
    daily['date'] = pd.to_datetime(daily['date'])

    # A duplicate of 'date' is kept as a regular column because 'date' itself
    # becomes the index below.
    daily['temp_date'] = daily['date']

    return daily.set_index('date')


train = _to_daily_store_sales(train)
test = _to_daily_store_sales(test)

In [50]:
# Count the rows per store_id in the train set. The frame holds one row per
# (date, store_id), so the row count is the number of days with data.
# Stores with fewer than `limit` rows (= fewer than 160 days of data) are removed.
#
# The goal of the 1st Competition was to predict the next 100 days of sales, so
# 160 days are needed to split into 60 days of training (X) and 100 days of prediction (y).
# The value 160 was the best-performing limit found over multiple training trials.
limit = 160

print("Before removing stores (due to limit): ", train.shape)

# One pass over the frame instead of re-scanning and re-slicing it once per store id.
days_per_store = train['store_id'].value_counts()
stores_with_enough_days = days_per_store[days_per_store >= limit].index

counter = len(stores_with_enough_days)  # number of stores that are kept
train = train[train['store_id'].isin(stores_with_enough_days)]

print("Total # of stores that exceeds {} is {}".format(limit, counter))
print("After removing stores (due to limit): ", train.shape)

Before removing stores (due to limit):  (449423, 3)
Total # of stores that exceeds 160 is 929
After removing stores (due to limit):  (404792, 3)


In [51]:
# Next, stores that appear to have gone out of business are removed from the train set.
# A store with no data within 5 days of 2018-07-31 is 'assumed' to be closed.
# (2018-07-31 is the last date that stores in the train data are expected to have.)
from datetime import datetime

def keep_alive_store(df, last_day='2018-07-31', grace_days=5):
    """Drop every store whose latest record is too old to count as still open.

    A store is kept when the gap between `last_day` and the store's most recent
    `temp_date` is at most `grace_days` whole days; all rows of the remaining
    stores are removed. The number of kept / removed stores is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `store_id` column and a datetime `temp_date` column.
    last_day : str or datetime-like, default '2018-07-31'
        Reference date the recency of each store is measured against.
    grace_days : int, default 5
        Largest allowed gap, in whole days, for a store to be considered open.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that belong to stores considered open, in their
        original order.
    """
    if df.empty:
        print("# of train store open/out of business: ", 0, 0)
        return df

    cutoff = pd.Timestamp(last_day)

    # Most recent date per store. Using max() rather than "the last row" means
    # the result does not depend on the frame being sorted by date.
    last_seen = df.groupby('store_id')['temp_date'].max()

    # `.dt.days` is the whole-day component of the gap, so a store whose last
    # record is after `last_day` gets a negative value and is kept.
    is_open = (cutoff - last_seen).dt.days <= grace_days

    yes = int(is_open.sum())     # stores with data close enough to last_day
    no = int((~is_open).sum())   # stores 'assumed' closed
    print("# of train store open/out of business: ", yes, no)

    return df[df['store_id'].isin(is_open[is_open].index)]
    
train = keep_alive_store(train)

# of train store open/out of business:  844 85


In [52]:
# The same check is applied to the test data, but closed stores are only recorded, not removed.
# A store with no data within 7 days of 2018-03-31 is 'assumed' to be closed.
# (2018-03-31 is the last date that stores in the test data are expected to have.)
store_id_list = test.store_id.unique() # list of test store_id

# Latest recorded date per store, computed in one pass. sort=False keeps the
# stores in order of first appearance, i.e. the same order as store_id_list.
test_last_seen = test.groupby('store_id', sort=False)['temp_date'].max()
test_is_closed = (pd.Timestamp('2018-03-31') - test_last_seen).dt.days > 7

# Print the last recorded date of each closed store and remember its id.
closed_test_store = []
for s, last_date in test_last_seen[test_is_closed].items():
    print(last_date)
    closed_test_store.append(s)

# yes: store has data within 7 days from 2018-03-31 / no: it doesn't
yes, no = int((~test_is_closed).sum()), int(test_is_closed.sum())
print("# of test store open/out of business: ", yes, no)

2018-03-21 00:00:00
2018-01-17 00:00:00
2018-03-23 00:00:00
2018-03-23 00:00:00
# of test store open/out of business:  196 4


In [56]:
# Total amount per (store_id, temp_date) pair, indexed by those two keys.
data_train = train.groupby(['store_id', 'temp_date']).agg({'amount': 'sum'})
data_train

Unnamed: 0_level_0,Unnamed: 1_level_0,amount
store_id,temp_date,Unnamed: 2_level_1
0,2016-12-19,3055
0,2016-12-20,5918
0,2016-12-21,16718
0,2016-12-22,12043
0,2016-12-23,12359
...,...,...
1687,2018-07-25,244
1687,2018-07-27,1095
1687,2018-07-28,919
1687,2018-07-29,757


In [57]:
# Total amount per (store_id, temp_date) pair, indexed by those two keys.
data_test = test.groupby(['store_id', 'temp_date']).agg({'amount': 'sum'})
data_test

Unnamed: 0_level_0,Unnamed: 1_level_0,amount
store_id,temp_date,Unnamed: 2_level_1
0,2016-08-01,2106
0,2016-08-02,1528
0,2016-08-03,560
0,2016-08-04,1683
0,2016-08-05,1686
...,...,...
199,2018-03-27,265
199,2018-03-28,2065
199,2018-03-29,130
199,2018-03-30,395


In [60]:
# Number of distinct values on the 'store_id' index level of each aggregated frame.
print("number of store_id in train:", len(data_train.index.unique(level='store_id')))
print("number of store_id in test:", len(data_test.index.unique(level='store_id')))

number of store_id in train: 844
number of store_id in test: 200


In [86]:
# Scratch check: the span between the same calendar day two years apart.
dt.datetime(year=2019, month=12, day=23) - dt.datetime(year=2017, month=12, day=23)

datetime.timedelta(days=730)

In [84]:
# Signed gap between the last recorded date of store 1674 and a fixed reference date.
# NOTE(review): the reference date (2019-12-23) is later than the dates shown for this
# data set, so the gap is presumably negative - confirm the intended operand order.
a = data_train.xs(1674, level='store_id').index[-1] - dt.datetime(year=2019, month=12, day=23)
interval = dt.timedelta(weeks=2)

In [85]:
# Print a marker when the gap `a` is strictly smaller than `interval`.
if interval > a:
    print("y")

y


In [119]:
# last day: 2018-07-31
# Compute, for every store, the gap between the last day and the store's last open date.

last_day = dt.datetime(2018,7,31)
interval = dt.timedelta(days=14)

# The last open date of each store is taken straight from the data_train index,
# so the cell no longer depends on a `store_names` variable that is not defined
# anywhere in the notebook, and no per-store lookup loop is needed.
# NOTE(review): this assumes `store_names` held every store_id of data_train
# (the 844 gaps counted in the recorded output match the 844 train stores) - confirm.
last_open_day = (
    data_train.index.to_frame(index=False)
    .groupby('store_id')['temp_date']
    .max()
)

# One row per store, gap stored in column 0 (same layout as before).
temp = (last_day - last_open_day).reset_index(drop=True).to_frame(0)
temp.iloc[:,0].value_counts()

0 days    623
1 days     94
2 days     48
3 days     45
4 days     20
5 days     14
Name: 0, dtype: int64

In [125]:
# Summary of the filtered train frame (index range, dtypes, non-null counts, memory usage).
# The cell previously held an unfinished `train.` expression, which is a syntax error,
# followed by a leftover copy of the per-store loop that appended to `temp` even though
# `temp` had already been turned into a DataFrame; both were replaced by the call below.
train.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 374723 entries, 2016-08-01 to 2018-07-31
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   store_id   374723 non-null  int64         
 1   amount     374723 non-null  int64         
 2   temp_date  374723 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2)
memory usage: 11.4 MB
