In [1]:
# dependencies

import datetime
import os
import warnings
from getpass import getpass
from urllib.parse import quote_plus

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import plotly.express as px

from sqlalchemy import create_engine, text

pd.set_option('display.max_columns', None)

In [2]:
# database connection
#
# Credentials come from the environment so that no secret is stored in the
# notebook or in version control. Set DOT_DB_PASSWORD before starting Jupyter;
# if it is missing, the password is requested interactively.
# NOTE(review): the password that used to be hard-coded in this cell should be
# treated as leaked and rotated.

db_password = os.environ.get("DOT_DB_PASSWORD") or getpass("Postgres password: ")
db_user = os.environ.get("DOT_DB_USER", "postgres")
db_name = os.environ.get("DOT_DB_NAME", "dot")
endpoint = os.environ.get("DOT_DB_HOST", "awakedb.cre3f7yk1unp.us-west-1.rds.amazonaws.com")

# quote_plus escapes characters such as '@', ':' or '/' that would otherwise
# break the URL when they appear in the password.
connection_string = f"postgresql://{db_user}:{quote_plus(db_password)}@{endpoint}:5432/{db_name}"
engine = create_engine(connection_string)

In [3]:
# Column labels are applied positionally to whatever `SELECT *` returns.
# The two label lists are NOT in the same order (cad/usd and
# market_segment/parent_customer are swapped), so each table has its own list.
UNL_COLUMNS = ['month','year','customer','date','item','qty','cad','usd','sale_origin','market_segment','parent_customer']
DOT_COLUMNS = ['month','year','customer','date','item','qty','usd','cad','sale_origin','parent_customer','market_segment']

# Pull both cleaned source tables through the shared engine.
dot = pd.read_sql('SELECT * FROM invoice_clean;', con=engine)
unl = pd.read_sql('SELECT * FROM unleashed_clean;', con=engine)

unl.columns = UNL_COLUMNS
dot.columns = DOT_COLUMNS

In [None]:
# Preview the dtypes pandas would infer for the `unl` table; `unl` itself is not modified.
unl_inferred = unl.convert_dtypes()
unl_inferred.info()

In [None]:
# Preview the dtypes pandas would infer for the `dot` table; `dot` itself is not modified.
dot_inferred = dot.convert_dtypes()
dot_inferred.info()

In [None]:
# # fix columns in dot table to match unl table

# new_order = ['month', 'year', 'date', 'customer', 'item',
#        'qty', 'cad', 'usd', 'sale_origin', 'market_segment',
#        'parent_customer']

# for column in new_order:
#     dot[column] = dot.pop(column)

In [4]:
## concat datasets

# Stack both sources (pd.concat aligns columns by label, so the differing
# column order of `dot` and `unl` does not matter), order newest-first, and
# drop the 'DOT Foods, Inc.' customer rows.
lvl2 = (
    pd.concat([dot, unl])
      .sort_values(by='date', ascending=False)
      .loc[lambda frame: frame['customer'] != 'DOT Foods, Inc.']
      # reset AFTER filtering so the index is contiguous (0..n-1) with no gaps
      .reset_index(drop=True)
      .convert_dtypes()
)

# Fixed seed so the preview shows the same rows on every run.
lvl2.sample(3, random_state=0)

Unnamed: 0,month,year,customer,date,item,qty,usd,cad,sale_origin,parent_customer,market_segment
245556,February,2018,Diane Gosman Marsella,2018-02-07,Shopify Shipping cost,1.0,3.9975,5.33,unl,Consumer,Online Direct
203374,October,2019,Amazon FBM,2019-10-20,Milk Chocolate Bars 12 Pack,1.0,11.6325,15.51,unl,Amazon,Online Distributor
246889,January,2018,Loblaws Inc,2018-01-19,6 x 12 Pack - AWAKE Caffeinated Chocolate Bars...,2.0,88.56,118.08,unl,Consumer,Online Direct


In [5]:
# ORDER COLUMNS TO MATCH POSTGRES DATABASE

new_order = ['date', 'sale_origin', 'market_segment', 'parent_customer', 'customer', 'item', 'qty', 'usd', 'cad','month','year']

# One label-based selection instead of dropping/re-inserting each column in place.
# Columns not listed in new_order are kept, in their existing relative order,
# after the listed ones. Selecting by label raises KeyError if a listed column
# is missing from lvl2.
unlisted = [col for col in lvl2.columns if col not in new_order]
lvl2 = lvl2[new_order + unlisted]

lvl2.head(2)

Unnamed: 0,date,sale_origin,market_segment,parent_customer,customer,item,qty,usd,cad,month,year
0,2024-01-19,unl,Online Distributor,Amazon,Amazon SC,50ct Pouch - Caramel Milk Choc Bites,1.0,34.9875,46.65,January,2024
1,2024-01-19,unl,Online Distributor,Amazon,Amazon SC,50ct Pouch - Caramel Milk Choc Bites,1.0,34.9875,46.65,January,2024


In [6]:
##### CAREFUL!!!! REPLACES ENTIRE CLEAN LEVEL_2 DATABASE
# if_exists='replace' drops the existing `level_2` table before writing lvl2;
# index=False keeps the DataFrame index out of the table.

lvl2.to_sql(
    name='level_2',
    con=engine,
    if_exists='replace',
    index=False,
)

319

In [8]:
##  ------ READ ALL DATA INTO APPLICATION

def get_data_from_csv(min_year: int = 2020, con=None):
    """Load the rows of the ``level_2`` table whose ``year`` exceeds ``min_year``.

    Despite the name (kept so existing calls keep working), the data is read
    from the SQL database, not from a CSV file.

    Parameters
    ----------
    min_year : int, default 2020
        Exclusive lower bound for the ``year`` column. The default reproduces
        the previously hard-coded filter.
    con : optional
        Connection or engine passed to ``pd.read_sql``. When omitted, the
        notebook-level ``engine`` is used.

    Returns
    -------
    pandas.DataFrame
        All columns of the matching ``level_2`` rows.
    """
    # Bind the cut-off as an integer parameter instead of embedding a quoted
    # literal in the SQL text.
    query = text("""
            SELECT *
            FROM level_2
            WHERE year > :min_year
            """)
    return pd.read_sql(
        query,
        con=engine if con is None else con,
        params={"min_year": int(min_year)},
    )
# Run the query once; `df` holds the frame exactly as returned.
df = get_data_from_csv()

### MASTER DATA ###
# DataFrame.copy() makes an independent copy, so later changes to `df`
# do not alter `all_sales`.
all_sales = df.copy()

In [9]:
# --- disabled clean-up, kept for reference; none of the commented lines below run ---
# all_sales = all_sales.convert_dtypes()

# # invoice date cleanup
# all_sales['date'] = pd.to_datetime(all_sales['date'])
# all_sales['date'] = all_sales['date'].dt.normalize()
# all_sales['date'] = all_sales['date'].dt.floor('D')
# all_sales.sort_values(by='usd',ascending=False,inplace=True)
# Print column dtypes and non-null counts of the frame as loaded.
all_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144513 entries, 0 to 144512
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             144513 non-null  object 
 1   sale_origin      144513 non-null  object 
 2   market_segment   144513 non-null  object 
 3   parent_customer  144513 non-null  object 
 4   customer         144513 non-null  object 
 5   item             144513 non-null  object 
 6   qty              144513 non-null  float64
 7   usd              144513 non-null  float64
 8   cad              144513 non-null  float64
 9   month            144513 non-null  object 
 10  year             144513 non-null  int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 12.1+ MB


In [13]:
# Filter selections: every year and every market segment present in the data
# is selected here.
year = sorted(all_sales['year'].unique())
segment = all_sales['market_segment'].unique()

# .copy() makes `df` an independent frame. Later cells add columns to it, and
# writing into a boolean-mask slice is ambiguous in pandas; the resulting
# SettingWithCopyWarning would go unnoticed because this notebook silences
# all warnings.
selected_rows = all_sales['year'].isin(year) & all_sales['market_segment'].isin(segment)
df = all_sales[selected_rows].copy()

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144437 entries, 0 to 144436
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   date             144437 non-null  object 
 1   sale_origin      144437 non-null  object 
 2   market_segment   144437 non-null  object 
 3   parent_customer  144437 non-null  object 
 4   customer         144437 non-null  object 
 5   item             144437 non-null  object 
 6   qty              144437 non-null  float64
 7   usd              144437 non-null  float64
 8   cad              144437 non-null  float64
 9   month            144437 non-null  object 
 10  year             144437 non-null  int64  
dtypes: float64(3), int64(1), object(7)
memory usage: 12.1+ MB


In [23]:
# assign() returns a new frame instead of writing columns into `df` in place
# (`df` was produced by boolean-mask indexing of all_sales). Keyword arguments
# are evaluated in order, so `date_month` is derived from the converted `date`.
df = df.assign(
    date=lambda frame: frame['date'].astype('datetime64[ns]'),
    date_month=lambda frame: frame['date'].dt.month_name(),
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144437 entries, 0 to 144436
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   date             144437 non-null  datetime64[ns]
 1   sale_origin      144437 non-null  object        
 2   market_segment   144437 non-null  object        
 3   parent_customer  144437 non-null  object        
 4   customer         144437 non-null  object        
 5   item             144437 non-null  object        
 6   qty              144437 non-null  float64       
 7   usd              144437 non-null  float64       
 8   cad              144437 non-null  float64       
 9   month            144437 non-null  object        
 10  year             144437 non-null  int64         
 11  date_month       144437 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(7)
memory usage: 13.2+ MB


In [None]:
# Monthly USD totals: one bar per calendar month.
# Plotting `df` directly would draw one bar per transaction row, and a fixed
# 12-entry hovertext list cannot line up with those bars.
mth_usd = (
    df.assign(date=pd.to_datetime(df['date']))
      .set_index('date')
      .groupby(pd.Grouper(freq='M'))['usd']
      .sum()
      .reset_index()
)

fig_mth_bar = px.bar(mth_usd,
        template='plotly_white',
        x='date',
        y='usd',
        color='usd',
        color_continuous_scale=px.colors.sequential.Oranges,
        labels = {'date':' ','usd':'<b>$USD</b>'},
        text='usd',
        opacity=.8,
        hover_data=['usd'],
        title=' ',
        height=400
        ).update_coloraxes(showscale=False)
# hovertext takes one entry per bar, so the labels are derived from the data
# (month abbreviation plus year, since the data can span several years).
fig_mth_bar.update_traces(texttemplate='<b>%{text:$,}</b>',hovertext=mth_usd['date'].dt.strftime('%b %Y').tolist())
fig_mth_bar.update_layout(title_x=0.5,hovermode="x")
fig_mth_bar.update_yaxes(tick0=0,dtick=250000)

In [None]:
# Render the bar chart held in `fig_mth_bar`.
fig_mth_bar.show()

In [None]:
import plotly.express as px

In [None]:
# df['month'] = pd.to_datetime(df['month'])
# df['year'] = pd.to_datetime(df['year'])

# Parse the dates, add numeric month / year helper columns, and order the rows
# by (year, month).
parsed_dates = pd.to_datetime(df['date'])
df = (
    df.assign(
        date=parsed_dates,
        monthy=parsed_dates.dt.month,
        yeary=parsed_dates.dt.year,
    )
    .sort_values(by=['yeary', 'monthy'])
)
# df.groupby(pd.Grouper(freq='M'))['usd'].sum()


# px.bar(df,
#        # x = 'date',
#        y = 'usd')

In [None]:
# USD summed per month: a Series named 'usd' indexed by 'date'.
usd_by_date = df.set_index('date')['usd']
bar_df = usd_by_date.groupby(pd.Grouper(freq='M')).sum()

# Chart options collected in one place, then handed to plotly express.
monthly_bar_options = dict(
    template='plotly_white',
    y='usd',
    color='usd',
    color_continuous_scale=px.colors.sequential.Oranges,
    labels={'date': ' ', 'usd': '<b>$USD</b>'},
    text='usd',
    opacity=.8,
    hover_data=['usd'],
    title=' ',
    height=400,
)
px.bar(bar_df, **monthly_bar_options)

In [None]:
# df = pd.DataFrame(bar_df).reset_index()
# df.date = df.date.astype('category')

# One row per month. The colour values must come from this aggregated frame:
# `df.date` has one value per transaction row, so its length does not match
# the plotted frame and plotly express rejects arguments of different lengths.
monthly_usd = bar_df.reset_index()
monthly_usd['month_label'] = monthly_usd['date'].dt.strftime('%b %Y')

px.bar(monthly_usd,
       x = 'date',
       y = 'usd',
       color = 'month_label')