In [1]:
# dependencies

import warnings
warnings.filterwarnings('ignore')

import datetime
import getpass
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import plotly.express as px
from sqlalchemy import create_engine

pd.set_option('display.max_columns', None)

In [2]:
# database connection
#
# Connection settings are read from the environment so that no secret lives in
# the notebook. DB_USER / DB_NAME / DB_HOST fall back to the values this
# notebook has always used; the password has no fallback and is prompted for
# when DB_PASSWORD is not set.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated, since it remains visible in the file's history.

db_user = os.environ.get("DB_USER", "postgres")
db_name = os.environ.get("DB_NAME", "dot")
endpoint = os.environ.get("DB_HOST", "awakedb.cre3f7yk1unp.us-west-1.rds.amazonaws.com")
db_password = os.environ.get("DB_PASSWORD") or getpass.getpass(
    f"Password for {db_user}@{endpoint}: "
)

# Percent-encode the credentials: characters such as '@', ':' or '/' in a
# password would otherwise be parsed as URL delimiters.
connection_string = (
    f"postgresql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{endpoint}:5432/{db_name}"
)
engine = create_engine(connection_string)

In [3]:
# Pull both cleaned source tables in full.
dot = pd.read_sql(sql='SELECT * FROM invoice_clean;', con=engine)
unl = pd.read_sql(sql='SELECT * FROM unleashed_clean;', con=engine)

# The labels below are applied by POSITION, not by name.
# NOTE(review): the two lists deliberately differ (usd/cad and
# market_segment/parent_customer are swapped) -- presumably mirroring each
# table's physical column order. Confirm against the schemas: a mismatch would
# silently mislabel the values.
dot = dot.set_axis(
    ['month', 'year', 'customer', 'date', 'item', 'qty',
     'usd', 'cad', 'sale_origin', 'parent_customer', 'market_segment'],
    axis='columns',
)
unl = unl.set_axis(
    ['month', 'year', 'customer', 'date', 'item', 'qty',
     'cad', 'usd', 'sale_origin', 'market_segment', 'parent_customer'],
    axis='columns',
)

In [4]:
## concat datasets

# Stack both sources (columns align by name), newest rows first, and renumber
# the index before filtering.
lvl2 = (
    pd.concat([dot, unl])
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)

# Exclude rows whose customer is exactly 'DOT Foods, Inc.'.
lvl2 = lvl2.loc[lvl2['customer'] != 'DOT Foods, Inc.']

# Let pandas choose the best nullable dtype for every column.
lvl2 = lvl2.convert_dtypes()

print(f"Latest Date: {lvl2.date.max()}")
lvl2.sample(3)

Latest Date: 2024-01-23


Unnamed: 0,month,year,customer,date,item,qty,usd,cad,sale_origin,parent_customer,market_segment
244131,March,2018,SYSCO/BARABOO,2018-03-20,AWAKE CHOCOLATE AWAKE CRML BARS 4X12PK FOOD SE...,3.0,156.96,208.76,dot,Sysco,Broadline Distributor
81218,May,2022,Amazon SC,2022-05-10,50ct Change Maker - AWAKE Chocolate Bites USA ...,2.0,65.61,87.48,unl,Amazon,Online
168981,August,2020,TROPICAL FOODS--DC,2020-08-21,AWAKE CHOCOLATE AWAKE CAFF MILK CHOC- 6X12PK M...,3.0,235.44,313.14,dot,Tropical,Alternate Retail


In [5]:
# ORDER COLUMNS TO MATCH POSTGRES DATABASE

new_order = ['date', 'sale_origin', 'market_segment', 'parent_customer', 'customer', 'item', 'qty', 'usd', 'cad','month','year']

# Reorder with one column selection instead of dropping and re-inserting each
# column in place. Columns not named in new_order (if any) are kept after the
# ordered ones; a name missing from lvl2 still raises KeyError.
remaining = [col for col in lvl2.columns if col not in new_order]
lvl2 = lvl2[new_order + remaining]

lvl2.head(2)

Unnamed: 0,date,sale_origin,market_segment,parent_customer,customer,item,qty,usd,cad,month,year
0,2024-01-23,unl,Online,Amazon,Amazon SC FBA,NSA 50ct Pouch - Almond Sea Salt Bites,1.0,36.99,49.32,January,2024
1,2024-01-23,unl,Online,Amazon,Amazon SC FBA,50ct Pouch - Caramel Milk Choc Bites,1.0,34.9875,46.65,January,2024


In [6]:
##### CAREFUL!!!! REPLACES ENTIRE CLEAN LEVEL_2 DATABASE
# if_exists='replace' drops the existing level_2 table before inserting, so
# this cell overwrites whatever is stored there. The DataFrame index is not
# written.

lvl2.to_sql(
    name='level_2',
    con=engine,
    if_exists='replace',
    index=False,
)

230

In [8]:
#######################
## UPDATE CSV VERSION OF DATABASE

# Destination is configurable through CPG_SALES_CSV; the default is the path
# this notebook has always written to.
csv_path = os.environ.get(
    "CPG_SALES_CSV",
    r"C:\Users\mikej\Desktop\cpg-sales\data\all_sales_data.csv",
)

# index=False keeps the CSV schema identical to the level_2 table (which is
# also written without the index); otherwise reading the file back yields a
# spurious 'Unnamed: 0' column.
lvl2.to_csv(csv_path, index=False)

PermissionError: [Errno 13] Permission denied: 'C:\\Users\\mikej\\Desktop\\cpg-sales\\data'

In [3]:
##  ------ READ ALL DATA INTO APPLICATION

def get_data_from_csv():
    """Return every ``level_2`` row dated after 2019-12-31 as a DataFrame.

    Despite the name, the rows are read from the database through the
    notebook-level ``engine``; no CSV file is touched.
    """
    query = """
            SELECT * 
            FROM level_2
            WHERE date > '2019-12-31'
            """
    return pd.read_sql(query, con=engine)
# Load the application data set.
df = get_data_from_csv()

### MASTER DATA ###
# Work on an independent deep copy so the freshly loaded frame stays untouched.
all_sales = df.copy(deep=True)

# Most recent date present in the data.
all_sales['date'].max()

datetime.date(2024, 1, 23)

In [5]:
# Refresh the CSV copy of the level-2 data.
# Destination is configurable through CPG_SALES_CSV; the default is the path
# this notebook has always written to.
csv_path = os.environ.get(
    "CPG_SALES_CSV",
    r"C:\Users\mikej\Desktop\cpg-sales\data\all_sales_data.csv",
)

# index=False keeps the CSV schema identical to the level_2 table (which is
# also written without the index); otherwise reading the file back yields a
# spurious 'Unnamed: 0' column.
lvl2.to_csv(csv_path, index=False)

In [36]:
# invoice date cleanup
# Parse to datetime64 and truncate every timestamp to midnight. normalize()
# already zeroes the time-of-day component, so the former extra floor('D')
# pass did nothing and has been dropped. Re-running this cell is harmless.
all_sales['date'] = pd.to_datetime(all_sales['date']).dt.normalize()

In [None]:
#####
##  TEST APP HERE

In [76]:
# user query
# Filter selections. Every distinct value is selected here, so the filter
# below keeps all rows whose three fields are non-missing.

year = sorted(all_sales['year'].unique())
segment = np.array(all_sales['market_segment'].unique())
sale_origin = np.array(all_sales['sale_origin'].unique())


# QUERY THE DATAFRAME BASED ON FILTER SELECTIONS
# .copy() makes df an independent frame: later cells assign new columns to it,
# which on a bare slice of all_sales raises SettingWithCopyWarning (hidden here
# because warnings are globally ignored).
df = all_sales.loc[
    (all_sales['year'].isin(year)) &
    (all_sales['market_segment'].isin(segment)) &
    (all_sales['sale_origin'].isin(sale_origin))
].copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197086 entries, 0 to 197085
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   date             197086 non-null  datetime64[ns]
 1   sale_origin      197086 non-null  object        
 2   market_segment   197086 non-null  object        
 3   parent_customer  197086 non-null  object        
 4   customer         197086 non-null  object        
 5   item             197086 non-null  object        
 6   qty              197086 non-null  float64       
 7   usd              197086 non-null  float64       
 8   cad              197086 non-null  float64       
 9   month            197086 non-null  object        
 10  year             197086 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int64(1), object(6)
memory usage: 16.5+ MB


In [77]:
# Replace the month-name column with the month number taken from `date`.
# assign() returns a new frame rather than writing into the filtered one, so
# there is no chained-assignment hazard and re-running the cell is safe.
df = df.assign(month=df['date'].dt.month)

In [78]:
# Print column names, non-null counts and dtypes of the current working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197086 entries, 0 to 197085
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   date             197086 non-null  datetime64[ns]
 1   sale_origin      197086 non-null  object        
 2   market_segment   197086 non-null  object        
 3   parent_customer  197086 non-null  object        
 4   customer         197086 non-null  object        
 5   item             197086 non-null  object        
 6   qty              197086 non-null  float64       
 7   usd              197086 non-null  float64       
 8   cad              197086 non-null  float64       
 9   month            197086 non-null  int32         
 10  year             197086 non-null  int64         
dtypes: datetime64[ns](1), float64(3), int32(1), int64(1), object(5)
memory usage: 15.8+ MB


In [81]:
# Histogram with year on the x-axis and the usd column supplying the values.
px.histogram(
    data_frame=df,
    x='year',
    y='usd',
    barmode='group',
)

In [79]:
# Total USD per (year, market_segment); the group keys stay as ordinary columns.
df1 = (
    df
    .groupby(by=['year', 'market_segment'], as_index=False)['usd']
    .sum()
)
df1

Unnamed: 0,year,market_segment,usd
0,2020,Alternate Retail,807691.7
1,2020,Broadline Distributor,35175.05
2,2020,Canada,366736.4
3,2020,Convenience,539674.0
4,2020,Grocery,253241.9
5,2020,Online,1252902.0
6,2020,Other,106413.2
7,2020,Samples,442.4175
8,2020,Vending,1111619.0
9,2021,Alternate Retail,1075944.0


In [49]:
# Bars of yearly USD, coloured by market segment. df1 is aggregated by year and
# market segment (not by month), so the title now says so.
px.bar(
    df1,
    x='year',
    y='usd',
    color='market_segment',
    title='Sales by Year and Market Segment',
)

In [None]:
# Bar chart of usd against the frame's index, shaded with the Oranges scale.
# NOTE(review): the 12 month abbreviations used as hover text and the 250k
# y-axis step suggest this was written for a frame with one row per month;
# confirm that `df` has that shape when this cell runs.
fig_mth_bar = px.bar(
    df,
    x=df.index,
    y='usd',
    text='usd',
    color='usd',
    color_continuous_scale=px.colors.sequential.Oranges,
    hover_data=['usd'],
    labels={'date': ' ', 'usd': '<b>$USD</b>'},
    template='plotly_white',
    opacity=.8,
    title=' ',
    height=400,
)

# Hide the colour bar.
fig_mth_bar.update_coloraxes(showscale=False)

# Bold dollar-formatted bar labels; month abbreviations as hover text.
fig_mth_bar.update_traces(
    texttemplate='<b>%{text:$,}</b>',
    hovertext=['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'],
)
fig_mth_bar.update_layout(title_x=0.5, hovermode="x")

# Disabled x-axis relabelling, kept for reference:
# fig_mth_bar.update_xaxes(tickmode='array',tickvals = df.index, ticktext=df.index.month_name())

# Y ticks start at 0 and step by 250,000; as the last expression, this call's
# returned figure is what the cell displays.
fig_mth_bar.update_yaxes(tick0=0, dtick=250000)

In [None]:
# Render the monthly bar figure built in the cell that defines fig_mth_bar.
fig_mth_bar.show()

In [None]:
import plotly.express as px

In [None]:
# Derive numeric month/year helper columns from `date` and sort chronologically
# by (yeary, monthy). assign() builds a new frame instead of writing columns
# into a frame that may be a filtered slice, and makes the cell re-runnable.
df = (
    df
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(
        monthy=lambda frame: frame['date'].dt.month,
        yeary=lambda frame: frame['date'].dt.year,
    )
    .sort_values(by=['yeary', 'monthy'])
)

In [None]:
# Total USD per calendar month: a Series named 'usd' indexed by `date`.
bar_df = (
    df
    .set_index('date')
    .groupby(pd.Grouper(freq='M'))['usd']
    .sum()
)

# One bar per month; no x is given, so the date index is used for the x-axis.
px.bar(
    data_frame=bar_df,
    y='usd',
    text='usd',
    color='usd',
    color_continuous_scale=px.colors.sequential.Oranges,
    hover_data=['usd'],
    labels={'date': ' ', 'usd': '<b>$USD</b>'},
    template='plotly_white',
    opacity=.8,
    title=' ',
    height=400,
)

In [None]:
# Monthly USD totals with one colour per month.
# The colour must reference the 'date' column of the frame being plotted;
# passing `df.date` handed Plotly a transaction-level Series whose length does
# not match the monthly frame, which raises a length-mismatch ValueError.
px.bar(
    bar_df.reset_index(),
    x='date',
    y='usd',
    color='date',
)