# Performance Testing

## Import packages

In [2]:
import swat
import pandas as pd
import numpy as np
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import linear_model

pd.set_option('display.max_columns', None)

## my personal module to connect to CAS
try:
    from casauth import CASAuth
    print('Imported personal custom CAS auth package')
except:
    print('casauth package not available')

print(f'Python version:{sys.version.split("|")[0]}')
print(f'swat version:{swat.__version__}')
print(f'pandas version:{pd.__version__}')
print(f'numpy version:{np.__version__}')

Imported personal custom CAS auth package
Python version:3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]
swat version:1.13.0
pandas version:1.5.3
numpy version:1.24.3


## Testing on 28 million rows 
### Local pandas

In [None]:
%%time

# 1
df = pd.read_parquet('retail_sales_28mil.parquet')

# 2
df.head()

# 3
display(df.shape)

# 4
df.dtypes

# 5
df.info(verbose=True)

# 6
df.isna().sum()

# 7
(df                          
 .age_bucket                     
 .value_counts(normalize=True)  
)

# 8
df.describe()

# 9
(df                                
 .query('age_bucket == "26-35"')    
 .loyalty_card                     
 .value_counts()                   
)

# 10
(df                             
 .groupby(['Year','age_bucket'])          
 .sum()     
 .loc[:,['Sales','Cost']]     
)

# 11
newdf = (df
         .assign(
             Profit = df.Sales - df.Cost,
             loyalty_card_value = df.loyalty_card.map({0:'No Loyalty Card', 1:'Loyalty Card'})
         )
)

# 12
display(newdf.head())

# 13 
reg = linear_model.LinearRegression()
target = df['Sales']
inputs = df.loc[:,['Cost', 'mkt_bdgt', 'Margin']]
reg.fit(inputs, target)
display('Intercept',reg.intercept_, 'Coefficients', reg.coef_)

# 14
newdf.to_parquet('sales_final.parquet')

del newdf
del df

(27901380, 40)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27901380 entries, 0 to 930045
Data columns (total 40 columns):
 #   Column         Dtype  
---  ------         -----  
 0   CustID         float64
 1   bucket         float64
 2   age            float64
 3   loyalty_card   float64
 4   Department     object 
 5   brand_name     object 
 6   Storechain     object 
 7   ChannelType    object 
 8   Class          object 
 9   StoreNum       float64
 10  Storeage       float64
 11  trx_hr_char    object 
 12  trx_dow_new    object 
 13  trx_tod        object 
 14  Date           object 
 15  MDY            object 
 16  Year           float64
 17  Sales          float64
 18  Cost           float64
 19  mkt_bdgt       float64
 20  Margin         float64
 21  sss            float64
 22  City           object 
 23  City_Lat       float64
 24  City_Long      float64
 25  Country        object 
 26  Country_Lat    float64
 27  Country_Long   float64
 28  Region         object 
 29  Region_Lat    



Unnamed: 0,CustID,bucket,age,loyalty_card,Department,brand_name,Storechain,ChannelType,Class,StoreNum,Storeage,trx_hr_char,trx_dow_new,trx_tod,Date,MDY,Year,Sales,Cost,mkt_bdgt,Margin,sss,City,City_Lat,City_Long,Country,Country_Lat,Country_Long,Region,Region_Lat,Region_Long,Region_2,Region_2_Lat,Region_2_Long,State,State_Lat,State_Long,age_bucket,Storechain1,brand_name1,Profit,loyalty_card_value
0,1083863.0,1.0,,0.0,health,Pine,GRAND,Resale,oral care,736.0,12.0,2 PM,TUE,Afternoon,2009-12-29,12:09:00,2009.0,42.0,12.0,1328884.02,30.36,165122.0,Southampton,40.885188,-72.395162,United States,41.5435,-74.6941,US_AT,40.3,-74.5,US Atlantic Coast,40.3,-74.5,NY,41.5435,-74.6941,18-25,FAST,Oak,30.0,No Loyalty Card
1,1083863.0,1.0,,0.0,kids,Pine,GRAND,Resale,kids_swimwear,736.0,12.0,4 PM,WED,Afternoon,2009-11-11,11:09:00,2009.0,60.0,14.0,1202456.25,46.07,165122.0,New York,40.704234,-73.917927,United States,41.5435,-74.6941,US_AT,40.3,-74.5,US Atlantic Coast,40.3,-74.5,NY,41.5435,-74.6941,36-45,GRAND,Pine,46.0,No Loyalty Card
2,1083863.0,1.0,,0.0,men,Pine,GRAND,Resale,men_slippers,736.0,12.0,10 AM,THU,Morning,2010-01-28,01:10:00,2010.0,37.0,9.0,415904.64,28.16,165122.0,Champaign,40.112981,-88.261227,United States,41.234,-88.3961,US_MW,43.3,-84.75,US Midwest,43.3,-84.75,IL,41.234,-88.3961,56-65,MODA,Maple,28.0,No Loyalty Card
3,1083863.0,2.0,,0.0,men,Pine,GRAND,Internet,men_underwear,724.0,12.0,2 PM,SAT,Afternoon,2009-10-31,10:09:00,2009.0,39.0,11.0,415904.64,28.82,195111.0,Atlantic City,39.364966,-74.439034,United States,40.4403,-74.4296,US_AT,40.3,-74.5,US Atlantic Coast,40.3,-74.5,NJ,40.4403,-74.4296,46-55,MODA,Maple,28.0,No Loyalty Card
4,1083883.0,2.0,23.0,1.0,electronics,Pine,GRAND,Store,DVD & Blu-ray,757.0,11.0,11 AM,FRI,Morning,2009-09-18,09:09:00,2009.0,65.0,17.0,134668.27,47.99,154705.0,Lawrence,38.959902,-95.253199,United States,38.4575,-96.583,US_MW,43.3,-84.75,US Midwest,43.3,-84.75,KS,38.4575,-96.583,26-35,GRAND,Pine,48.0,Loyalty Card


'Intercept'

-0.001278416735914334

'Coefficients'

array([9.94565916e-01, 7.64540110e-09, 1.00089739e+00])

### CAS server

In [2]:
%%time

## Connect to CAS
path = os.getenv('HOMEPATH') + '\\.sas'
conn = CASAuth(path, ssl_ca_list = path + '\ssemonthly-rootCA-Intermidiates_4CLI.pem')


# 1. pd.read_parquet('retail_sales.parquet')
conn.loadTable(path = 'retail_sales.parquet', caslib = 'casuser',
               casout = {
                   'name':'retail_sales',
                   'caslib':'casuser',
                   'replace':True
               })


# 2. df.head()
castbl = conn.CASTable('retail_sales', caslib = 'casuser')
castbl.head()

# 3. df.shape
display(castbl.shape)

# 4. df.dtypes
castbl.columnInfo()

# 5. df.info(verbose = True)
castbl.tableDetails()

# 6. df.isna().sum()
castbl.nmiss()

# 7
# (df                          
#  .age_bucket                     
#  .value_counts(normalize=True)  
# )
(castbl                          ## CAS table
 .age_bucket                     ## CAS table column
 .value_counts(normalize=True)   ## SWAT package value_counts method
)


# 8. df.describe()
castbl.describe()

# 9
# (df                                
#  .query('age_bucket == "26-35"')    
#  .loyalty_card                     
#  .value_counts()                   
# )
(castbl                            ## CASTable object
 .query('age_bucket = "26-35"')    ## SWAT query method
 .loyalty_card                     ## CAS column
 .value_counts()                   ## SWAT value_counts method
)


# 10
# (df                             
#  .groupby(['Year','age_bucket'])          
#  .sum()     
#  .loc[:,['Sales','Cost']]     
# )
(castbl                             ## CASTable object
 .groupby(['Year','age_bucket'])    ## Grouping columns
 .loc[:,['Sales','Cost']]           ## Sepcify the columns to aggregate
 .sum()                             ## Aggregate function
)

# 11
# newdf = (df
#          .assign(
#              Profit = df.Sales - df.Cost,
#              loyalty_card_value = df.loyalty_card.map({0:'No Lyalty Card', 1:'Loyalty Card'})
#          )
castbl.eval('Profit = Sales - Cost')
castbl.eval("loyalty_card_value = ifc(loyalty_card=0,'No Loyalty Card','Loyalty Card')")
castbl.copyTable(casout={'name':'sales_final', 
                         'caslib':'casuser', 
                         'label':'Final table with two new calculated columns'})


# 12
display(castbl.head())


# 13 
reg = linear_model.LinearRegression()
target = df['Sales']
inputs = df.loc[:,['Cost', 'mkt_bdgt', 'Margin']]
reg.fit(inputs, target)
display('Intercept',reg.intercept_, 'Coefficients', reg.coef_)


# 14. newdf.to_parquet('sales_final.parquet')
final_castbl = conn.CASTable('sales_final', caslib = 'casuser')
display(final_castbl.head())
final_castbl.save(name = 'sales_final.parquet', caslib = 'casuser', replace = True)

conn.terminate()

CAS Connection created
NOTE: Cloud Analytic Services made the file retail_sales.parquet available as table RETAIL_SALES in caslib CASUSER(Peter.Styliadis@sas.com).


(27901380, 40)

Unnamed: 0,CustID,bucket,age,loyalty_card,Department,brand_name,Storechain,ChannelType,Class,StoreNum,Storeage,trx_hr_char,trx_dow_new,trx_tod,Date,MDY,Year,Sales,Cost,mkt_bdgt,Margin,sss,City,City_Lat,City_Long,Country,Country_Lat,Country_Long,Region,Region_Lat,Region_Long,Region_2,Region_2_Lat,Region_2_Long,State,State_Lat,State_Long,age_bucket,Storechain1,brand_name1,Profit,loyalty_card_value
0,36491293.0,2.0,,0.0,electronics,Pine,GRAND,Internet,cell phones,203.0,13.0,3 PM,WED,Afternoon,2009-10-21,10:09:00,2009.0,26.0,9.0,258303.24,17.2,116226.0,Houston,29.762895,-95.383173,United States,30.9807,-97.4423,US_SW,35.0,-106.0,US Southwest,35.0,-106.0,TX,30.9807,-97.4423,36-45,MODA,Maple,17.0,No Loyalty Card
1,36491293.0,2.0,,0.0,electronics,Pine,GRAND,Internet,desktop computers,203.0,13.0,3 PM,WED,Afternoon,2009-10-21,10:09:00,2009.0,72.0,15.0,258303.24,57.02,116226.0,Houston,29.762895,-95.383173,United States,30.9807,-97.4423,US_SW,35.0,-106.0,US Southwest,35.0,-106.0,TX,30.9807,-97.4423,18-25,FAST,Oak,57.0,No Loyalty Card
2,36491302.0,2.0,,0.0,health,Maple,MODA,Internet,bath & body,807.0,9.0,3 PM,TUE,Afternoon,2009-09-15,09:09:00,2009.0,6.0,2.0,702450.45,3.8,10629.0,Woodbury,44.920101,-92.935415,United States,45.2213,-93.6086,US_MW,43.3,-84.75,US Midwest,43.3,-84.75,MN,45.2213,-93.6086,46-55,MODA,Maple,4.0,No Loyalty Card
3,36491311.0,2.0,,0.0,grocery,Pine,GRAND,Internet,breakfast,563.0,7.0,3 PM,SUN,Afternoon,2009-10-25,10:09:00,2009.0,48.0,10.0,3604.81,38.16,94042.0,Fort Lauderdale,26.135763,-80.14181,United States,27.8002,-81.6528,US_SE,30.0,-83.0,US Southeast,30.0,-83.0,FL,27.8002,-81.6528,26-35,GRAND,Pine,38.0,No Loyalty Card
4,36491417.0,3.0,,0.0,grocery,Pine,GRAND,Internet,oils,203.0,13.0,1 PM,MON,Afternoon,2009-08-03,08:09:00,2009.0,40.0,14.0,13563.17,26.39,116226.0,Houston,29.762895,-95.383173,United States,30.9807,-97.4423,US_SW,35.0,-106.0,US Southwest,35.0,-106.0,TX,30.9807,-97.4423,26-35,FAST,Oak,26.0,No Loyalty Card


NOTE: Cloud Analytic Services saved the file sales_final.parquet in caslib CASUSER(Peter.Styliadis@sas.com).
CPU times: total: 2.02 s
Wall time: 1min 10s
