# Prepare Notes

In [1]:
# tabular data manipulation
import numpy as np
import pandas as pd
# datetime utilities
from datetime import timedelta, datetime
# visualization
import matplotlib.pyplot as plt

# no yelling in the library
import warnings
warnings.filterwarnings("ignore")

# our acquire script
import acquire

In [5]:
# Build the stores DataFrame with the project-local acquire helper, then display it.
stores = acquire.stores_df()
stores

In [6]:
# Build the items DataFrame with the project-local acquire helper, then display it.
items= acquire.items_df()
items

In [7]:
# Build the sales DataFrame with the project-local acquire helper, then display it.
sales = acquire.sales_df()
sales

In [None]:
# Align the join key first: sales names the column 'item', the merge below needs 'item_id'.
sales = sales.rename(columns={'item': 'item_id'})
# Join each sale to its item details on 'item_id' (merge defaults to an inner join).
merged_df = sales.merge(items, on="item_id")

In [None]:
# Align the join key first: stores names the column 'store_id', the merged frame uses 'store'.
stores = stores.rename(columns={'store_id': 'store'})
# Attach store details to the sales+items frame on 'store' (merge defaults to an inner join).
complete_df = merged_df.merge(stores, on="store")

In [None]:
# Summary of the merged frame: column dtypes, non-null counts and memory usage.
complete_df.info()

In [None]:
# (rows, columns) of the merged frame.
complete_df.shape

In [None]:
# Count of missing values in each column.
complete_df.isnull().sum()

In [None]:
# Scratch check of strftime format codes: called without a time tuple, time.strftime
# formats the current local time, so the output shows what this pattern produces.
# NOTE(review): this pattern has a comma after %d, unlike the format string passed
# to pd.to_datetime in the following cell — confirm which one matches the raw data.
from time import strftime

strftime('%a, %d, %b %Y')

In [None]:
# Parse the sale_date strings into datetimes so the column can become a DatetimeIndex.
# This works on complete_df: `df` is not assigned until later in the notebook, so
# referencing it here raises NameError on a fresh kernel (Restart & Run All).
# NOTE(review): with an explicit format, parsing raises if the strings contain anything
# beyond '%a, %d %b %Y' (e.g. a time or timezone suffix) — confirm against the raw data.
complete_df['sale_date'] = pd.to_datetime(complete_df['sale_date'], format='%a, %d %b %Y')

In [None]:
# Check column dtypes, e.g. to see whether sale_date is now a datetime column.
complete_df.dtypes

In [None]:
# Steps 2 & 3 of the prep: make sale_date the index, then order the rows by that index.
# Note: sale_date stops being a column after this, so the cell cannot be re-run as-is.
complete_df = complete_df.set_index('sale_date')
complete_df = complete_df.sort_index()

In [None]:
# Confirm the index type; a datetime index is expected once sale_date has been
# parsed with pd.to_datetime and set as the index.
type(complete_df.index)

In [None]:
# Preview the first five rows of the merged, date-indexed frame.
complete_df.head()

In [None]:
# Row count next to the number of distinct sale_id values;
# equal numbers mean each row is a distinct sale.
len(complete_df), complete_df['sale_id'].nunique()

In [None]:
# Shorter alias used by the cells below. Plain assignment, so df and complete_df
# are the same object (no copy is made).
df= complete_df

In [None]:
# Total sale_amount per day, plotted over time.
# The plot shows distinct patterns: peaks and valleys that follow the seasons,
# so the market looks predictable.
by_date = df.groupby('sale_date')['sale_amount'].sum().reset_index()
by_date.plot(x='sale_date', y='sale_amount')

In [None]:
# Univariate info: rows are evenly distributed across stores and items (see the value counts below)

In [None]:
# Number of rows per store, i.e. how many item-level sale records each store has.
df.store.value_counts()

In [None]:
# Number of rows per item, ordered by item_id.
df.item_id.value_counts().sort_index()

### Takeaways:
The value counts above show that items and stores are evenly distributed within the dataframe:
- 50 unique items
- 18260 sales per item
- 10 unique stores

<hr style="border:2px solid black">

In [None]:
# Earliest sale date for every (store, item_id) pair.
# reset_index() returns a new frame with sale_date back as a regular column so it
# can be aggregated; df itself keeps its datetime index.
first_sale = df.reset_index().groupby(['store', 'item_id'])['sale_date'].min()
first_sale.head()

In [None]:
# Distinct first-sale dates across all (store, item_id) pairs in first_sale.
first_sale.unique() # a single unique value means every pair started on the same day

# author's reading of the output: every item was sold in every store on that date

In [None]:
# Latest sale date for every (store, item_id) pair, computed on a copy of df
# whose sale_date index has been turned back into a column.
last_sale = df.reset_index().groupby(['store', 'item_id'])['sale_date'].max()
last_sale.head()

In [None]:
# Distinct last-sale dates across all (store, item_id) pairs in last_sale.
last_sale.unique() # if there's only one unique value, then YES!

In [None]:
# Count the number of DISTINCT days on which each item was sold at each store.
days_per_store_per_item =  df.reset_index().groupby(['store', 'item_id']).sale_date.nunique()
days_per_store_per_item.head()

# Author's reading of the output: the day counts do not differ between pairs,
# i.e. every item was sold every day at every store.
# NOTE(review): .head() only shows the first five pairs; checking
# days_per_store_per_item.nunique() == 1 would confirm this for all of them.

<hr style="border:2px solid black">

In [None]:
# sale_date is still the index of df: the reset_index() calls above returned
# new frames and did not modify df itself.
df.head()

### Check for time gaps in the data

In [None]:
# Gap check: if the number of distinct dates in the index equals the number of
# calendar days spanned (first to last day, inclusive), no day is missing and
# the data does not need to be filled or padded.
# The first figure is the count of distinct index values (dates), not the row count,
# so it is labelled accordingly.
print('Number of unique days:', df.index.nunique())
n_days = df.index.max() - df.index.min() + pd.Timedelta('1d')
print("Number of days between first and last day:", n_days)

### Reproducibility

In [None]:
def prep_store_data(df):
    """Return a copy of ``df`` indexed by its parsed, chronologically sorted ``sale_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``sale_date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        New frame with ``sale_date`` converted to datetime, rows sorted by it,
        and ``sale_date`` moved from a column to the index. The input frame is
        not modified.
    """
    # DataFrame.assign (the original had the typo `asign`, which raised
    # AttributeError on every call) returns a new frame with the parsed column.
    return (
        df.assign(sale_date=pd.to_datetime(df.sale_date))
        .sort_values('sale_date')
        .set_index('sale_date')
    )

<hr style="border:2px solid black">

## Data Splitting
- **sklearn.model_selection.TimeSeriesSplit**

- Splitting time series data into train, test, and validate sets is a little trickier than with previous data we have looked at. Because the data points have an order to them, we cannot simply assign each point randomly to train, validate, or test.
- Ideally all splits should contain one season's worth of data. There are several methods we can use to split our time series data:
    - Human-based: use, for example, the last year in the dataset as test split
    - Percentage based: use the last 20% as test
    - Cross Validate: break data up into slices and use successive slices as train and test repeatedly (sklearn.model_selection.TimeSeriesSplit)

#### Important:
- We have to cut at a specific point in time
- cannot just scramble data up like before
- make the cut at a point in time that leaves train with enough history to accurately capture seasonality
- overfitting can still happen; to avoid it we draw further cut lines within validate and split it into subsets (**cross-validation**)
    - K-fold cross validation