In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the interim transactions_sd_knits extract that every later cell works from.
knit_data_csv = "../data/interim/transactions_sd_knits.csv"
knit_data = pd.read_csv(knit_data_csv)

In [None]:
# Print a concise summary of the loaded frame (columns, dtypes, non-null counts).
knit_data.info()

### resample data using pandas sample

In [None]:
def resample_data(df, frac=80, random_state=1):
    '''
    Randomly resamples rows with replacement from the input data.

    Note that ``frac`` is a multiplier on the number of input rows, not a
    percentage: the default of 80 returns 80 times as many rows as ``df``
    (allowed only because sampling is done with replacement).
    NOTE(review): if an 80% sample was intended, call with frac=0.8 - confirm.
        Parameters:
            df (df): Dataframe from which to resample rows.
            frac (float): Number of rows to return, as a multiple of len(df). Defaults to 80.
            random_state (int): Seed passed to pandas so the draw is reproducible. Defaults to 1.
        Returns:
            df (df): Dataframe containing resampled rows (original index labels are kept,
                     so labels repeat when a row is drawn more than once).
    '''
    # Sample from the frame that was passed in, not from a notebook-level global,
    # so the function works for any dataframe and in a fresh kernel.
    resampled_df = df.sample(frac=frac, replace=True, random_state=random_state)
    return resampled_df

In [None]:
# Draw the resampled dataset that the comparison cells below check against knit_data.
sample_manual = resample_data(knit_data)

### compare original and resampled data

In [None]:
# Row/column counts of the original and the resampled frame, side by side.
print('knit data shape: ', knit_data.shape, sep='')
print('resampled data shape: ', sample_manual.shape, sep='')

In [None]:
# Number of distinct product ids in the original and the resampled frame.
print('knit data unique products: ', knit_data['p_id'].nunique(), sep='')
print('resampled data unique products: ', sample_manual['p_id'].nunique(), sep='')

In [None]:
# Parse each frame's transaction dates from its OWN column. Converting the resampled
# frame from knit_data's column would only give correct values through implicit
# index alignment, which breaks as soon as the resampled index is reset or changed.
# NOTE(review): infer_datetime_format is deprecated since pandas 2.0 (format inference
# is the default there); drop the argument once the project's pandas version allows.
knit_data['transaction_date'] = pd.to_datetime(knit_data['transaction_date'], infer_datetime_format=True)
sample_manual['transaction_date'] = pd.to_datetime(sample_manual['transaction_date'], infer_datetime_format=True)

In [None]:
# Number of transactions per date for the original and the resampled data.
knit_daily_counts = (
    knit_data
    .groupby(['transaction_date'])
    .size()
    .to_frame("count")
    .reset_index()
)
sample_daily_counts = (
    sample_manual
    .groupby(['transaction_date'])
    .size()
    .to_frame("count")
    .reset_index()
)

# Line plot of daily counts: original data.
sns.relplot(data=knit_daily_counts, x="transaction_date", y="count", kind="line", aspect=3, height=4)

# Line plot of daily counts: resampled data.
sns.relplot(data=sample_daily_counts, x="transaction_date", y="count", kind="line", aspect=4, height=3)


In [None]:
# Compare the label_desc distribution of the original (top) and resampled (bottom) data.
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
sns.histplot(knit_data["label_desc"].astype('category'), ax=ax[0]).set(title='knit_data')
sns.histplot(sample_manual["label_desc"].astype('category'), ax=ax[1]).set(title='sample_manual')
# plt.show() renders under the notebook's inline backend; Figure.show() expects a GUI
# backend and only emits a warning otherwise.
plt.show()

In [None]:
# Compare the color_simple distribution of the original (top) and resampled (bottom) data.
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
sns.histplot(knit_data["color_simple"].astype('category'), ax=ax[0]).set(title='knit_data')
sns.histplot(sample_manual["color_simple"].astype('category'), ax=ax[1]).set(title='sample_manual')
# plt.show() renders under the notebook's inline backend; Figure.show() expects a GUI
# backend and only emits a warning otherwise.
plt.show()

In [None]:
# Compare the quantity distribution (treated as categories) of the original (top)
# and resampled (bottom) data.
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
sns.histplot(knit_data["quantity"].astype('category'), ax=ax[0]).set(title='knit_data')
sns.histplot(sample_manual["quantity"].astype('category'), ax=ax[1]).set(title='sample_manual')
# plt.show() renders under the notebook's inline backend; Figure.show() expects a GUI
# backend and only emits a warning otherwise.
plt.show()

In [None]:
# Compare the amount distribution of the original (top) and resampled (bottom) data.
fig, ax = plt.subplots(2, 1, figsize=(20, 10))
sns.histplot(knit_data["amount"], ax=ax[0]).set(title='knit_data')
sns.histplot(sample_manual["amount"], ax=ax[1]).set(title='sample_manual')
# plt.show() renders under the notebook's inline backend; Figure.show() expects a GUI
# backend and only emits a warning otherwise.
plt.show()

### remap identifiable data

In [None]:
def remap_identifiable_data(df):
    '''
    Remaps identifying information in the 'p_id', 'sub_department_desc', and 'label_desc' columns.

    Each distinct 'p_id' becomes "p_<n>" and each distinct 'label_desc' becomes "lab_<n>",
    numbered from 1 in order of first appearance in df. The sub-department value
    'W L/S KNITS' becomes 'KNITS'. Other columns and values are left as they are.
    The input dataframe is not modified; a remapped copy is returned.
        Parameters:
            df (df): Dataframe in which to remap identifying information.
        Returns:
            df (df): New dataframe with identifying information remapped.
    '''
    # unique() yields values in order of appearance, so the numbering is deterministic
    # for a given row order.
    dict_p_id = {
        p_id: "p_" + str(position)
        for position, p_id in enumerate(df['p_id'].unique(), start=1)
    }

    dict_sub_dept = {'W L/S KNITS' : 'KNITS'}

    dict_label_desc = {
        label: "lab_" + str(position)
        for position, label in enumerate(df['label_desc'].unique(), start=1)
    }

    # Non-inplace replace returns a new frame, so the caller's dataframe stays untouched
    # and re-running the calling cell cannot remap already-remapped values.
    remapped_df = df.replace(
        {"p_id": dict_p_id, "sub_department_desc": dict_sub_dept, "label_desc": dict_label_desc}
    )

    return remapped_df

In [None]:
# Replace product ids, the sub-department name and label descriptions in the resampled data.
sample_manual = remap_identifiable_data(sample_manual)

In [None]:
# Preview the first rows of the resampled data to check the remapped columns.
sample_manual.head()

### write resampled data to interim folder 

In [None]:
# Write the resampled data to the interim folder; index=False leaves the row index out of the file.
sample_manual.to_csv("../data/interim/transactions_sd_knits_resampled.csv", index=False)