In [1]:
# dependencies

import warnings
warnings.filterwarnings('ignore')

import datetime
import os
from urllib.parse import quote

import numpy as np
import pandas as pd
import plotly.express as px
import psycopg2

from sqlalchemy import create_engine

pd.set_option('display.max_columns', None)

In [2]:
# database connection

# The password is read from the environment so it is never stored in the notebook.
# NOTE(review): a password was previously hardcoded in this cell; treat it as
# exposed and rotate it.
db_password = os.environ.get("DB_PASSWORD")
if not db_password:
    raise RuntimeError("DB_PASSWORD is not set; export it before running this notebook.")

# Non-secret settings keep their previous values as defaults but can be overridden.
db_user = os.environ.get("DB_USER", "postgres")
db_name = os.environ.get("DB_NAME", "dot")
endpoint = os.environ.get("DB_HOST", "awakedb.cre3f7yk1unp.us-west-1.rds.amazonaws.com")

# Percent-encode user and password so characters such as '@', ':' or '/' in a
# credential cannot be misread as URL delimiters.
connection_string = (
    f"postgresql://{quote(db_user, safe='')}:{quote(db_password, safe='')}"
    f"@{endpoint}:5432/{db_name}"
)
engine = create_engine(connection_string)

In [4]:
# READ NEW DATA, FIX HEADERS/FOOTERS, DATATYPES, TAKE A LOOK

# Vendor export loaded on this run -- point this at the newly downloaded file.
file_path = r"C:\Users\mikej\Downloads\SalesEnquiryList - 2024-01-21T100559.981.xlsx"

# Names applied positionally (left to right) to the export's columns.
raw_columns = ['order_num','order_date','req_date','completed_date',
               'warehouse','customer_name','customer_type','product',
               'product_group','status','quantity','sub_total']

export = pd.read_excel(file_path, header=2)      # header=2: the third sheet row holds the column headers

# Build the cleaned frame as a fresh copy instead of mutating a slice in place
# (the global warnings filter would hide any SettingWithCopyWarning).
is_unnamed = export.columns.astype(str).str.match('Unnamed')
new_unl_raw = (
    export.loc[:, ~is_unnamed]      # remove 'Unnamed' columns
          .iloc[:-1]                # remove final row where vendor totals the Sub Total column
          .copy()
)

# The rename below is positional, so fail with a readable message if the export layout changed.
if new_unl_raw.shape[1] != len(raw_columns):
    raise ValueError(
        f"Expected {len(raw_columns)} columns in the export but found "
        f"{new_unl_raw.shape[1]}: {list(new_unl_raw.columns)}"
    )
new_unl_raw.columns = raw_columns      # rename columns

new_unl_raw = new_unl_raw.astype({'quantity': 'float', 'sub_total': 'float'})

print(f"{len(new_unl_raw)} new rows")
print(f"Dates: {new_unl_raw['completed_date'].dt.date.min()} thru {new_unl_raw['completed_date'].dt.date.max()}")
print(" ")

new_unl_raw.info()

35 new rows
Dates: 2024-01-19 thru 2024-01-19
 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   order_num       35 non-null     object        
 1   order_date      35 non-null     datetime64[ns]
 2   req_date        35 non-null     datetime64[ns]
 3   completed_date  35 non-null     datetime64[ns]
 4   warehouse       35 non-null     object        
 5   customer_name   35 non-null     object        
 6   customer_type   35 non-null     object        
 7   product         35 non-null     object        
 8   product_group   35 non-null     object        
 9   status          35 non-null     object        
 10  quantity        35 non-null     float64       
 11  sub_total       35 non-null     float64       
dtypes: datetime64[ns](3), float64(2), object(7)
memory usage: 3.4+ KB


In [5]:
## LIKE WHAT YOU SEE? APPEND TO THE RAW TABLE (unleashed_raw)

# This cell appends, so running it twice for the same export (e.g. Restart & Run All)
# would insert every row a second time. Refuse to append when any order number in
# this batch is already in the table.
# NOTE(review): assumes order_num identifies an order across exports and that the
# unleashed_raw table already exists -- confirm.
existing_orders = pd.read_sql("SELECT DISTINCT order_num FROM unleashed_raw;", con = engine)

already_loaded = new_unl_raw.loc[
    new_unl_raw['order_num'].isin(existing_orders['order_num']), 'order_num'
].unique()

if len(already_loaded) > 0:
    raise ValueError(
        f"{len(already_loaded)} order(s) from this export are already in unleashed_raw "
        f"(e.g. {list(already_loaded[:5])}); not appending again."
    )

new_unl_raw.to_sql('unleashed_raw', engine, if_exists='append', index=False)       #send to database

35

In [3]:
# READ RAW DATA FROM POSTGRES, ADD COLUMNS 

CAD_TO_USD = 0.75      # single flat CAD -> USD rate applied to every row
CUSTOMER_TABLE_PATH = r"C:\Users\mikej\Desktop\cpg-sales\data\customer_table.csv"

unleashed_raw = pd.read_sql("""
                            SELECT completed_date, customer_name, product, quantity, sub_total 
                            FROM unleashed_raw
                            ;""", 
                            con = engine)

## add $ USD columns converted from CAD
unleashed_raw['usd'] = unleashed_raw['sub_total'] * CAD_TO_USD
## add origin of sale (dot or unleashed)
unleashed_raw['sale_origin'] = 'unl'

# assign market segment and parent customer to each customer.
# The lookup is read once and joined once; rows without a customer key are dropped
# because they cannot identify a customer.
customer_lookup = (
    pd.read_csv(CUSTOMER_TABLE_PATH, usecols=['customer','market_segment','parent_customer'])
      .dropna(subset=['customer'])
      .set_index('customer')
      [['market_segment','parent_customer']]      # fix column order regardless of CSV layout
)

# validate='many_to_one' raises if a customer appears more than once in the lookup,
# instead of silently duplicating that customer's sales rows.
unleashed_raw = (
    unleashed_raw
    .set_index('customer_name')
    .merge(customer_lookup, how='left', left_index=True, right_index=True,
           validate='many_to_one')
    .reset_index()
)

# add year/month columns (month first, then year, ahead of the existing columns)
completed = pd.to_datetime(unleashed_raw['completed_date'])
unleashed_raw.insert(0, "month", completed.dt.month_name())
unleashed_raw.insert(1, "year", completed.dt.year)

# keep only the calendar date (python date objects) for storage
unleashed_raw['completed_date'] = completed.dt.date

print("  ")
print(unleashed_raw.completed_date.min())
print("thru")
print(unleashed_raw.completed_date.max())
print("  ")
print(f"Rows: {len(unleashed_raw)}")
unleashed_raw.info()

  
2018-01-05
thru
2024-01-19
  
Rows: 197332
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197332 entries, 0 to 197331
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   month            197332 non-null  object 
 1   year             197332 non-null  int32  
 2   customer_name    197332 non-null  object 
 3   completed_date   197332 non-null  object 
 4   product          197332 non-null  object 
 5   quantity         197332 non-null  float64
 6   sub_total        197332 non-null  float64
 7   usd              197332 non-null  float64
 8   sale_origin      197332 non-null  object 
 9   market_segment   195781 non-null  object 
 10  parent_customer  195781 non-null  object 
dtypes: float64(3), int32(1), object(7)
memory usage: 15.8+ MB


In [4]:
##  -- SEND TO UNLEASHED CLEAN DATABASE

# Overwrite the unleashed_clean table with the enriched frame; the DataFrame
# index is not written as a column.
unleashed_raw.to_sql(
    name='unleashed_clean',
    con=engine,
    if_exists='replace',
    index=False,
)

332

In [5]:
## read in unleashed_clean

clean_query = 'SELECT * FROM unleashed_clean;'
unleashed_clean = pd.read_sql(clean_query, con = engine)

# earliest and latest completion dates, then the row count
clean_dates = unleashed_clean['completed_date']
print(clean_dates.min())
print(clean_dates.max())
print(f"Rows: {len(unleashed_clean)}")

# preview the first three rows
unleashed_clean.head(3)

2018-01-05
2024-01-19
Rows: 197332


Unnamed: 0,month,year,customer_name,completed_date,product,quantity,sub_total,usd,sale_origin,market_segment,parent_customer
0,January,2018,1999,2018-01-19,Shopify Shipping cost,1.0,6.39,4.7925,unl,Other,1999
1,January,2018,1999,2018-01-19,Cinnamon Bun Granola Bars 16 Pack,1.0,28.74,21.555,unl,Other,1999
2,January,2018,ACME,2018-01-12,Shopify Shipping cost,1.0,6.39,4.7925,unl,Other,ACME


In [7]:
# distinct market segments present in the clean table (missing values included)
unleashed_clean['market_segment'].unique()

array(['Other', 'Online', 'Vending', 'Canada', 'Grocery', 'Convenience',
       'Samples', None, 'Alternate Retail'], dtype=object)

In [10]:
# Summarise the frame as pandas' convert_dtypes() would type it;
# unleashed_clean itself is not reassigned.
nullable_view = unleashed_clean.convert_dtypes()
nullable_view.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197332 entries, 0 to 197331
Data columns (total 11 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   month            197332 non-null  string 
 1   year             197332 non-null  Int64  
 2   customer_name    197332 non-null  string 
 3   completed_date   197332 non-null  object 
 4   product          197332 non-null  string 
 5   quantity         197332 non-null  Float64
 6   sub_total        197332 non-null  Float64
 7   usd              197332 non-null  Float64
 8   sale_origin      197332 non-null  string 
 9   market_segment   195781 non-null  string 
 10  parent_customer  195781 non-null  string 
dtypes: Float64(3), Int64(1), object(1), string(6)
memory usage: 17.3+ MB
