In [87]:
import pandas as pd
import numpy as np
from pprint import pprint

df = pd.read_csv("./first.csv")

# extract style from style & color code
df['STYLE'] = df['UNIV_STYLE_COLOR_CD'].str.split('-', expand=True)[0]
df['TRANS_DT']= pd.to_datetime(df['TRANS_DT'])

# what shape do we want the data in?
# one row per timeseries
# columns: id, timestamp, target 
# timestamp and target should be same length
# shapes: int, list of datetimes, list of floats
df.drop(columns=['ORD_KEY', 'BOW_DATE', 'GTIN', 'UNIV_STYLE_COLOR_CD',
       'UNIV_SZ_CD', 'CHANNEL', 'COUNTRY', 'GEO', 'ZIP_CD', 'PLANT_CD',
       'RETURN_UNITS', 'NET_SLS_UNITS', 'CLEARANCE_IND',
       'GROSS_AMT_USD', 'GROSS_AMT_LC', 'NET_SLS_AMT_USD', 'NET_SLS_AMT_LC',
       'RETURN_AMT_LC', 'RETURN_AMT_USD', 'MSRP_LC', 'MSRP_USD'], inplace=True)
df = df.groupby(['STYLE', 'TRANS_DT'])['GROSS_UNITS'].sum().reset_index()
df.head()

Index(['ORD_KEY', 'TRANS_DT', 'BOW_DATE', 'GTIN', 'UNIV_STYLE_COLOR_CD',
       'UNIV_SZ_CD', 'CHANNEL', 'COUNTRY', 'GEO', 'ZIP_CD', 'PLANT_CD',
       'GROSS_UNITS', 'RETURN_UNITS', 'NET_SLS_UNITS', 'CLEARANCE_IND',
       'GROSS_AMT_USD', 'GROSS_AMT_LC', 'NET_SLS_AMT_USD', 'NET_SLS_AMT_LC',
       'RETURN_AMT_LC', 'RETURN_AMT_USD', 'MSRP_LC', 'MSRP_USD'],
      dtype='object')


Unnamed: 0,STYLE,TRANS_DT,GROSS_UNITS
0,408452,2021-08-08 00:00:00+00:00,1
1,408452,2022-06-25 00:00:00+00:00,1
2,408452,2022-07-23 00:00:00+00:00,1
3,415445,2020-08-28 00:00:00+00:00,1
4,415445,2021-05-25 00:00:00+00:00,1


In [None]:
units_grouped_df = df.groupby('STYLE')['GROSS_UNITS'].apply(list).reset_index()
dt_grouped_df = df.groupby('STYLE')['TRANS_DT'].apply(list).reset_index()

grouped_df = units_grouped_df.merge(dt_grouped_df)
grouped_df

In [131]:
dt_min = df['TRANS_DT'].min()
dt_max = df['TRANS_DT'].max()

all_dates = pd.DataFrame(pd.date_range(start=dt_min, end=dt_max, freq='D'), columns=["FULL_DATE"])
all_styles = pd.Series(df['STYLE'].unique(), name="STYLE")

# for each date make sure there is a row for that data and every style
# if this row does not already exist, create it and set GROSS_UNITS to 0
df_merged_by_date = pd.merge(all_dates, df, left_on=['FULL_DATE'], right_on=['TRANS_DT'], how="left").drop(columns=['TRANS_DT']).fillna(0)
df_merged_by_style = pd.merge(all_styles, df_merged_by_date).fillna(0)
df_merged_by_style

Unnamed: 0,STYLE,FULL_DATE,GROSS_UNITS
0,408452,2021-08-08 00:00:00+00:00,1.0
1,408452,2022-06-25 00:00:00+00:00,1.0
2,408452,2022-07-23 00:00:00+00:00,1.0
3,415445,2020-08-28 00:00:00+00:00,1.0
4,415445,2021-05-25 00:00:00+00:00,1.0
...,...,...,...
959,SX7667,2021-04-29 00:00:00+00:00,1.0
960,SX7667,2021-05-21 00:00:00+00:00,1.0
961,SX7667,2021-07-08 00:00:00+00:00,1.0
962,SX7837,2020-11-08 00:00:00+00:00,1.0


In [118]:
df_styles_by_day = df_merged.groupby(['FULL_DATE'])['STYLE'].apply(list).reset_index()

In [119]:
df_units_by_day = df_merged.groupby(['FULL_DATE'])['GROSS_UNITS'].apply(list).reset_index()

In [113]:
df_merged.groupby(['STYLE']).groups[0]

Index([   2,    4,    5,    6,    7,    8,    9,   10,   11,   13,
       ...
       1736, 1742, 1744, 1745, 1746, 1747, 1749, 1750, 1751, 1752],
      dtype='int64', length=790)

In [101]:
df_merged_zeroless = df_merged[df_merged['STYLE'] != 0]
df_merged_zeroless

Unnamed: 0,FULL_DATE,STYLE,GROSS_UNITS
0,2020-07-21 00:00:00+00:00,BQ5309,2.0
1,2020-07-21 00:00:00+00:00,CW2500,1.0
3,2020-07-23 00:00:00+00:00,CJ3582,1.0
12,2020-08-01 00:00:00+00:00,928513,1.0
17,2020-08-06 00:00:00+00:00,CD6003,1.0
...,...,...,...
1740,2024-07-10 00:00:00+00:00,HF0013,1.0
1741,2024-07-11 00:00:00+00:00,FZ4625,1.0
1743,2024-07-13 00:00:00+00:00,DJ0292,1.0
1748,2024-07-18 00:00:00+00:00,FQ1457,1.0


In [136]:
# Group by style and transaction date, sum the gross units
df = df.groupby(['STYLE', 'TRANS_DT'])['GROSS_UNITS'].sum().reset_index()

# Create a complete date range
dt_min = df['TRANS_DT'].min()
dt_max = df['TRANS_DT'].max()
all_dates = pd.date_range(start=dt_min, end=dt_max, freq='D')
all_styles = df['STYLE'].unique()

# Create a dataframe with every combination of style and date
complete_index = pd.MultiIndex.from_product([all_styles, all_dates], names=['STYLE', 'TRANS_DT'])
df_complete = pd.DataFrame(index=complete_index).reset_index()

# Merge with the original dataframe to fill in the missing combinations with 0s
df = pd.merge(df_complete, df, on=['STYLE', 'TRANS_DT'], how='left').fillna(0)

# Group by style and create lists of timestamps and gross units
grouped_df = df.groupby('STYLE').agg({'TRANS_DT': lambda x: list(x), 'GROSS_UNITS': lambda x: list(x)}).reset_index()

# Rename columns for clarity
grouped_df.rename(columns={'TRANS_DT': 'datetimes', 'GROSS_UNITS': 'sales'}, inplace=True)

AttributeError: 'list' object has no attribute 'sum'

In [140]:
# Display the resulting dataframe
sum(grouped_df['sales'][3])

6.0