In [None]:
## Packages
import swat
import os
import pandas as pd
pd.set_option('display.max_columns', 50)
import numpy as np


## custom personal module to connect to my CAS environment
try:
    from casConnect import connect_to_cas
except ImportError:
    ## casConnect is an optional, author-specific helper. Only a missing module is
    ## tolerated here; any other failure (an error inside the module, a kernel
    ## interrupt) is allowed to surface instead of being silently swallowed.
    print('CasConnect package not available')

    

######################
## Connect to CAS   ##
######################

## My personal connection to CAS. You will need to modify your connection object
try:
    conn = connect_to_cas()
except Exception as connError:
    ## Deliberately best-effort: connect_to_cas is only defined when the optional
    ## casConnect import above succeeded. Catching Exception rather than using a bare
    ## except lets kernel interrupts through, and printing the error shows whether the
    ## helper is missing (NameError) or the connection attempt itself failed.
    print('My personal connection to CAS. You will need to modify yours using your connection information.')
    print(f'Connection attempt failed with {type(connError).__name__}: {connError}')


## General connection syntax
# conn = swat.CAS(host, port, username, password)

## Viya for Learners 3.5 connection
# hostValue = os.environ.get('CASHOST')
# portValue = os.environ.get('CASPORT')
# passwordToken=os.environ.get('SAS_VIYA_TOKEN')
# conn = swat.CAS(hostname=hostValue, port=portValue, password=passwordToken)

## Create demo CAS table

In [2]:
## Bring RAND_RETAILDEMO.sashdat from the samples caslib into memory
## as casuser.rand_retaildemo (replacing any existing table of that name)
conn.loadTable(
    path = 'RAND_RETAILDEMO.sashdat',
    caslib = 'samples',
    casout = dict(name = 'rand_retaildemo', caslib = 'casuser', replace = True)
)

## Client-side reference to the in-memory table
retailTbl = conn.CASTable('rand_retaildemo', caslib = 'casuser')

## Add age_dup (a duplicate of age) and write the result back over the same table
retailWithDup = retailTbl.eval("age_dup = age", inplace = False)
retailWithDup.partition(
    casout = dict(name = 'rand_retaildemo', caslib = 'casuser', replace = True)
)

## One rename specification per column: every column name becomes lowercase
newColNames = [
    {'name': colName, 'rename': colName.lower()}
    for colName in retailTbl.columns.to_list()
]

## Columns to keep in the table
keepColumns = [
    'custid', 'bucket', 'age', 'age_dup',
    'loyalty_card', 'brand_name', 'channeltype', 'class',
]

## Apply the renames and the keep list in a single alterTable call
retailTbl.alterTable(columns = newColNames, keep = keepColumns)

## Preview: dimensions, table details, table info and the first rows
display(
    retailTbl.shape,
    retailTbl.tableDetails(),
    retailTbl.tableInfo(caslib = 'casuser'),
    retailTbl.head(),
)

NOTE: Cloud Analytic Services made the file RAND_RETAILDEMO.sashdat available as table RAND_RETAILDEMO in caslib CASUSER(Peter.Styliadis@sas.com).


(930046, 8)

Unnamed: 0,Node,Blocks,Active,Rows,IndexSize,DataSize,VardataSize,CompressedSize,CompressionRatio,Mapped,MappedMemory,Unmapped,UnmappedMemory,Allocated,AllocatedMemory,DeletedRows
0,ALL,383,383,930046,0,372018400,0,0,0,346,362786496,0,0,37,9272000,0


Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,RAND_RETAILDEMO,930046,8,0,utf-8,2023-04-04T07:01:25-04:00,2023-04-04T07:01:25-04:00,2023-04-04T07:01:25-04:00,UTF8,1996225000.0,1996225000.0,1996225000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


Unnamed: 0,custid,bucket,age,age_dup,loyalty_card,brand_name,channeltype,class
0,23682014.0,2.0,52.0,52.0,1.0,Maple,Internet,men's fragrances
1,23682017.0,1.0,32.0,32.0,1.0,Pine,Internet,kids_bottoms
2,23682017.0,1.0,32.0,32.0,1.0,Pine,Internet,kids_boys' clothing
3,23682037.0,1.0,,,0.0,Pine,Internet,men_pants
4,23682045.0,2.0,,,0.0,Pine,Internet,ink & toner


## Simple column updates in place

In [3]:
## Normalize the casing of three character columns directly in the CAS table.
## Each 'value' is an expression string that the update action applies to the column.
caseUpdates = [
    dict(var = 'brand_name', value = 'upcase(brand_name)'),       ## e.g. Maple -> MAPLE
    dict(var = 'channeltype', value = 'lowcase(channeltype)'),    ## e.g. Internet -> internet
    dict(var = 'class', value = 'propcase(class)'),               ## e.g. men_pants -> Men_pants
]
retailTbl.update(set = caseUpdates)

In [4]:
## Preview the first rows to verify the casing changes made by the update above
retailTbl.head()

Unnamed: 0,custid,bucket,age,age_dup,loyalty_card,brand_name,channeltype,class
0,23682014.0,2.0,52.0,52.0,1.0,MAPLE,internet,Men's Fragrances
1,23682017.0,1.0,32.0,32.0,1.0,PINE,internet,Kids_bottoms
2,23682017.0,1.0,32.0,32.0,1.0,PINE,internet,Kids_boys' Clothing
3,23682037.0,1.0,,,0.0,PINE,internet,Men_pants
4,23682045.0,2.0,,,0.0,PINE,internet,Ink & Toner


## Update a column based on a condition

In [5]:
## Baseline: distinct and missing (NMiss) counts for age and age_dup before any imputation
retailTbl.distinct(inputs = ['age', 'age_dup'])

Unnamed: 0,Column,NDistinct,NMiss,Trunc
0,age,124.0,673447.0,0.0
1,age_dup,124.0,673447.0,0.0


Get the mean of the age column

In [6]:
## Mean of the age column, rounded to 3 decimal places.
## Later cells use meanAge as the replacement value for missing ages.
meanAge = retailTbl.age.mean().round(3)
## Echo the value as the cell output
meanAge

43.577

In [7]:
## Restrict the table reference to rows with a missing age, then overwrite
## age on only those rows with the previously computed mean
missingAgeRows = retailTbl.query("age is null")
missingAgeRows.update(set = [{'var':'age', 'value':f'{meanAge}'}])

### Confirm no missing values exist in age

In [9]:
## Re-check: NMiss for age should now be 0, while age_dup is still untouched
retailTbl.distinct(inputs = ['age', 'age_dup'])

Unnamed: 0,Column,NDistinct,NMiss,Trunc
0,age,124.0,0.0,0.0
1,age_dup,124.0,673447.0,0.0


Notice that all of the previously missing values (673,447 rows) now hold the mean age (43.577).

In [10]:
## Frequency of each age value; the imputed mean should appear as the most common value
retailTbl['age'].value_counts()

43.577     673447
19.000       6996
23.000       6944
24.000       6941
21.000       6882
            ...  
97.000         26
98.000         25
94.000         21
105.000        20
140.000        18
Length: 124, dtype: int64

## Update rows using conditional logic

In [12]:
## Same imputation as for age, but expressed inside the update itself:
## ifn() keeps the existing age_dup unless it is missing (.), in which case
## the mean age is written instead. No row filter is needed.
ageDupFill = {'var':'age_dup', 'value':f'ifn(age_dup = . , {meanAge}, age_dup)'}
retailTbl.update(set = [ageDupFill])

### Confirm no missing values exist in age_dup

In [13]:
## Final check: NMiss should now be 0 for both age and age_dup
retailTbl.distinct(inputs = ['age', 'age_dup'])

Unnamed: 0,Column,NDistinct,NMiss,Trunc
0,age,124.0,0.0,0.0
1,age_dup,124.0,0.0,0.0


In [14]:
## Frequency of each age_dup value; should match the distribution of age
retailTbl['age_dup'].value_counts()

43.577     673447
19.000       6996
23.000       6944
24.000       6941
21.000       6882
            ...  
97.000         26
98.000         25
94.000         21
105.000        20
140.000        18
Length: 124, dtype: int64

## Save the CAS table as a data source file

In [None]:
## Write the in-memory table to the casuser caslib under the name retail_clean.parquet
## NOTE(review): presumably the file format is inferred from the .parquet extension - confirm
retailTbl.save(name = 'retail_clean.parquet', caslib = 'casuser')

## Delete the source file

In [None]:
## Clean up: remove the retail_clean.parquet file created by the save step from the casuser caslib
conn.deleteSource(source = 'retail_clean.parquet', caslib = 'casuser')

## Terminate the CAS connection

In [18]:
## End the CAS session; conn cannot be used for further actions after this call
conn.terminate()