# Solve splitting problem

In [51]:
## Packages
import swat
import os
import pandas as pd
import numpy as np

## Options
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', None)

## My custom connection package for CAS
## casauth is an optional personal helper, so a missing package is reported
## rather than stopping the cell.
try:
    from casauth import CASAuth
    print('Imported personal custom CAS auth package')
except ImportError:
    ## Only "package cannot be imported" is expected here; anything else should surface
    print('casauth package not available')

Imported personal custom CAS auth package


## Make a Connection to CAS (REQUIRED: MODIFY CONNECTION INFORMATION)

##### To connect to the CAS server you will need:
1. the host name, 
2. the port number, 
3. your user name, and your password.

Visit the documentation [Getting Started with SAS® Viya® for Python](https://go.documentation.sas.com/doc/en/pgmsascdc/default/caspg3/titlepage.htm) for more information about connecting to CAS.

**Be aware that connecting to the CAS server can be implemented in various ways, so you might need to see your system administrator about how to make a connection. Please follow company policy regarding authentication.**

In [52]:
##
## Connect to CAS
##

################################
## General connection syntax  ##
################################
# conn = swat.CAS(host, port, username, password)

############################################
## SAS Viya for Learners 3.5 connection   ##
############################################
# hostValue = os.environ.get('CASHOST')
# portValue = os.environ.get('CASPORT')
# passwordToken=os.environ.get('SAS_VIYA_TOKEN')
# conn = swat.CAS(hostname=hostValue, port=portValue, password=passwordToken)

## My Personal connection
## Both settings are read from environment variables so nothing sensitive is stored
## in the notebook (presumably a credentials file path and a CA bundle - confirm
## against the casauth package).
try:
    path = os.getenv('CAS_CREDENTIALS')
    pem_file = os.getenv('CAS_CLIENT_SSL_CA_LIST')
    conn = CASAuth(path, ssl_ca_list = pem_file)
except Exception as err:
    ## Stay best-effort (the notebook keeps running), but say why: without a
    ## connection every later cell fails with a NameError on `conn`.
    print('No connection')
    print(f'Reason: {type(err).__name__}: {err}')

CAS Connection created


## Create fake data

In [53]:
## Four sample records; each one packs four "label = value" statements separated by ';'
sample_statements = [
    "Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3",
    "Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4",
    "Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5",
    "Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49",
]

## Single-column frame holding the raw text
raw_table = pd.DataFrame({"column_A": sample_statements})
raw_table

Unnamed: 0,column_A
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49


In [54]:
## Upload the pandas DataFrame to CAS as table raw_table, replacing any existing copy
upload_target = {'name':'raw_table', 'replace':True}
castbl = conn.upload_frame(raw_table, casout=upload_target)

## Show the tables available in CAS (the new RAW_TABLE should be listed)
conn.tableInfo()

NOTE: Cloud Analytic Services made the uploaded file available as table RAW_TABLE in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: The table RAW_TABLE has been created in caslib CASUSER(Peter.Styliadis@sas.com) from binary data uploaded to Cloud Analytic Services.


Unnamed: 0,Name,Label,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,RAW_TABLE,,4,1,0,utf-8,2023-08-31T15:05:45+00:00,2023-08-31T15:05:45+00:00,2023-08-31T15:05:45+00:00,UTF8,2009114000.0,2009114000.0,2009114000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-08-31T15:05:45+00:00,2009114000.0
1,GOV_IT_BUDGET_NARROW,,13555,6,0,utf-8,2023-08-16T14:55:21+00:00,2023-08-16T14:55:21+00:00,2023-08-23T20:46:46+00:00,UTF8,2007817000.0,2007817000.0,2008443000.0,1,0,0,0,gov_it_budget_narrow.sashdat,CASUSER(Peter.Styliadis@sas.com),0,Peter.Styliadis@sas.com,,2023-08-16T14:52:12+00:00,2007817000.0
2,POP2021_PROC_PYTHON,,52,6,0,utf-8,2023-08-23T18:18:09+00:00,2023-08-23T18:18:09+00:00,2023-08-23T18:18:10+00:00,UTF8,2008434000.0,2008434000.0,2008434000.0,1,0,0,0,,,0,Peter.Styliadis@sas.com,,,
3,CARS,2004 Car Data,428,15,0,utf-8,2023-08-25T18:16:23+00:00,2023-08-25T18:16:23+00:00,2023-08-31T13:37:23+00:00,UTF8,2008607000.0,2008607000.0,2009108000.0,1,0,0,0,,,0,Peter.Styliadis@sas.com,,,


In [55]:
castbl.head()

Unnamed: 0,column_A
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49


In [56]:
castbl

CASTable('RAW_TABLE', caslib='CASUSER(Peter.Styliadis@sas.com)')

## Create columns (test, long solution for the first group)

Setting `inplace=False` temporarily assigns the new column to the returned table object instead of modifying `castbl` itself, which is convenient during development.

Eventually create a new CAS table with the calculated columns.

In SWAT you will use the SCAN function like you would use the SPLIT function in Python.

In [7]:
## Step 1: keep the first ';'-delimited statement (SCAN works like df.col.str.split(';') in pandas)
first_statement_tbl = castbl.eval("Tot_Amt_string_1 = scan(column_A,1,';')", inplace=False)

## Step 2: keep the last space-delimited word of that statement (the number, still as text)
last_word_tbl = first_statement_tbl.eval("Tot_Amt_1 = scan(Tot_Amt_string_1, -1, ' ')", inplace=False)

## Step 3: write the table with both calculated columns out as Final_table
last_word_tbl.copyTable(casout = dict(name='Final_table', caslib='casuser', replace=True))

## Reference the new CAS table and preview it
finalTbl = conn.CASTable('final_table', caslib = 'casuser')
finalTbl.head()

Unnamed: 0,column_A,Tot_Amt_string_1,Tot_Amt_1
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3,Tot Amt = USD 2,2
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4,Tot Amt = USD 3,3
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5,Tot Amt = USD 4,4
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49,Tot Amt = USD 25,25


In [8]:
castbl

CASTable('RAW_TABLE', caslib='CASUSER(Peter.Styliadis@sas.com)')

In [9]:
finalTbl.columnInfo()

Unnamed: 0,Column,Label,ID,Type,RawLength,FormattedLength,Format,NFL,NFD
0,column_A,,1,varchar,63,63,,0,0
1,Tot_Amt_string_1,,2,char,63,63,,0,0
2,Tot_Amt_1,,3,char,63,63,,0,0


## Current issue is the SCAN function returns a string
There are multiple solutions. One is the INPUT function, which converts a char value to a numeric.

The INPUT function is like the `.astype` method in Python.

In [10]:
## Output table for the calculated columns (replaces the previous Final_table)
final_casout = {'name':'Final_table', 'caslib':'casuser', 'replace':True}

(castbl
 ## First ';'-delimited statement, the same as in the previous attempt
 .eval("Tot_Amt_string_1 = scan(column_A,1,';')", inplace=False)
 ## Last word of that statement, converted from char to numeric by INPUT with the 8. informat
 .eval("Tot_Amt_1 = input(scan(Tot_Amt_string_1, -1, ' '), 8.)", inplace=False)
 .copyTable(casout = final_casout)
)

## Reference the rebuilt table and preview it
finalTbl = conn.CASTable('final_table', caslib = 'casuser')

finalTbl.head()

Unnamed: 0,column_A,Tot_Amt_string_1,Tot_Amt_1
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3,Tot Amt = USD 2,2.0
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4,Tot Amt = USD 3,3.0
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5,Tot Amt = USD 4,4.0
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49,Tot Amt = USD 25,25.0


In [11]:
finalTbl.columnInfo()

Unnamed: 0,Column,Label,ID,Type,RawLength,FormattedLength,Format,NFL,NFD
0,column_A,,1,varchar,63,63,,0,0
1,Tot_Amt_string_1,,2,char,63,63,,0,0
2,Tot_Amt_1,,3,double,8,12,,0,0


## FINAL SOLUTION

- Solution with Pandas
- Solution with SWAT/CAS

#### 1. Recreate the CAS table to start from scratch

In [57]:
## Re-upload the pandas DataFrame so the CAS table starts from scratch (existing copy is replaced)
upload_target = {'name':'raw_table', 'replace':True}
castbl = conn.upload_frame(raw_table, casout=upload_target)

## Show the tables available in CAS (RAW_TABLE should be listed with a fresh timestamp)
conn.tableInfo()

NOTE: Cloud Analytic Services made the uploaded file available as table RAW_TABLE in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: The table RAW_TABLE has been created in caslib CASUSER(Peter.Styliadis@sas.com) from binary data uploaded to Cloud Analytic Services.


Unnamed: 0,Name,Label,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,RAW_TABLE,,4,1,0,utf-8,2023-08-31T15:05:57+00:00,2023-08-31T15:05:57+00:00,2023-08-31T15:05:57+00:00,UTF8,2009114000.0,2009114000.0,2009114000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-08-31T15:05:57+00:00,2009114000.0
1,GOV_IT_BUDGET_NARROW,,13555,6,0,utf-8,2023-08-16T14:55:21+00:00,2023-08-16T14:55:21+00:00,2023-08-23T20:46:46+00:00,UTF8,2007817000.0,2007817000.0,2008443000.0,1,0,0,0,gov_it_budget_narrow.sashdat,CASUSER(Peter.Styliadis@sas.com),0,Peter.Styliadis@sas.com,,2023-08-16T14:52:12+00:00,2007817000.0
2,POP2021_PROC_PYTHON,,52,6,0,utf-8,2023-08-23T18:18:09+00:00,2023-08-23T18:18:09+00:00,2023-08-23T18:18:10+00:00,UTF8,2008434000.0,2008434000.0,2008434000.0,1,0,0,0,,,0,Peter.Styliadis@sas.com,,,
3,CARS,2004 Car Data,428,15,0,utf-8,2023-08-25T18:16:23+00:00,2023-08-25T18:16:23+00:00,2023-08-31T13:37:23+00:00,UTF8,2008607000.0,2008607000.0,2009108000.0,1,0,0,0,,,0,Peter.Styliadis@sas.com,,,


In [58]:
castbl.head()

Unnamed: 0,column_A
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49


In [59]:
castbl

CASTable('RAW_TABLE', caslib='CASUSER(Peter.Styliadis@sas.com)')

### Solving this with Pandas

- Use SPLIT twice
- Use ASTYPE to convert to integer (or whatever you want)

In [60]:
## Preview pandas dataframe
raw_table.head()

Unnamed: 0,column_A
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49


In [61]:
## Create function to get desired number (this is nice with Pandas)
def get_numeric_value(column, position, df=None):
    """Pull the trailing number out of one ';'-separated statement of a text column.

    Parameters
    ----------
    column : str
        Name of the text column to parse.
    position : int
        1-based position of the statement within the ';'-separated text
        (use 1, 2, 3 rather than 0, 1, 2).
    df : pandas.DataFrame, optional
        Frame to read ``column`` from. Defaults to the notebook-level
        ``raw_table`` so existing two-argument calls keep working.

    Returns
    -------
    pandas.Series
        The last whitespace-separated word of the selected statement,
        converted to integer.
    """
    source = raw_table if df is None else df

    return (source[column]           ## Use the requested column (was hard-coded to column_A)
            .str.split(';')          ## Split all statements by ;
            .str[position - 1]       ## Get the statement by position (1-based)
            .str.split()             ## Split the statement on whitespace (tolerates extra spaces)
            .str[-1]                 ## Pull the last element (the number)
            .astype('int'))          ## Convert the character number to a numeric column


## Create final dataframe
## Statement positions 1-4 become columns Tot_Amt_1 ... Tot_Amt_4, in that order
parsed_columns = {
    f"Tot_Amt_{position}": get_numeric_value('column_A', position)
    for position in range(1, 5)
}
final_df = raw_table.assign(**parsed_columns)

final_df.head()

Unnamed: 0,column_A,Tot_Amt_1,Tot_Amt_2,Tot_Amt_3,Tot_Amt_4
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3,2,5,0,3
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4,3,6,1,4
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5,4,7,2,5
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49,25,101,85,49


Check the data types of final dataframe

In [62]:
final_df.dtypes

column_A     object
Tot_Amt_1     int32
Tot_Amt_2     int32
Tot_Amt_3     int32
Tot_Amt_4     int32
dtype: object

### Final SWAT/CAS MPP Solution

- Use SCAN instead of SPLIT
- Use INPUT instead of ASTYPE to convert to integer (or whatever you want)

#### Add individual statement to each eval method

In [63]:
## Output table for the four calculated columns (replaces the previous Final_table)
final_casout = {'name':'Final_table', 'caslib':'casuser', 'replace':True}

## One eval per output column: the inner SCAN picks the Nth ';'-delimited statement,
## the outer SCAN keeps its last word, and INPUT with the 8. informat makes it numeric
(castbl
 .eval("Tot_Amt_1 = input(scan(scan(column_A,1,';'), -1, ' '), 8.)", inplace=False)
 .eval("Tot_Amt_2 = input(scan(scan(column_A,2,';'), -1, ' '), 8.)", inplace=False)
 .eval("Tot_Amt_3 = input(scan(scan(column_A,3,';'), -1, ' '), 8.)", inplace=False)
 .eval("Tot_Amt_4 = input(scan(scan(column_A,4,';'), -1, ' '), 8.)", inplace=False)
 .copyTable(casout = final_casout)
)

## Reference the rebuilt table and preview it
finalTbl = conn.CASTable('final_table', caslib = 'casuser')

finalTbl.head()

Unnamed: 0,column_A,Tot_Amt_1,Tot_Amt_2,Tot_Amt_3,Tot_Amt_4
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3,2.0,5.0,0.0,3.0
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4,3.0,6.0,1.0,4.0
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5,4.0,7.0,2.0,5.0
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49,25.0,101.0,85.0,49.0


In [19]:
finalTbl.columnInfo()

Unnamed: 0,Column,Label,ID,Type,RawLength,FormattedLength,Format,NFL,NFD
0,column_A,,1,varchar,63,63,,0,0
1,Tot_Amt_1,,2,double,8,12,,0,0
2,Tot_Amt_2,,3,double,8,12,,0,0
3,Tot_Amt_3,,4,double,8,12,,0,0
4,Tot_Amt_4,,5,double,8,12,,0,0


#### Make it a bit more dynamic with a Python string function to return the expression

In [64]:
## Create a function that builds the CAS expression as a string.
## That way it can be dynamic and you can just add the position
## Not as elegant as pandas, but works well

def cas_function(position, column='column_A'):
    """Build the expression text that extracts one number from a CAS column.

    Parameters
    ----------
    position : int
        1-based position of the ';'-delimited statement to read.
    column : str, optional
        Name of the CAS column holding the text. Defaults to 'column_A',
        so existing one-argument calls produce the same expression as before.

    Returns
    -------
    str
        Expression of the form ``input(scan(scan(<column>,<position>,';'), -1, ' '), 8.)``,
        ready to be placed on the right-hand side of an ``eval`` assignment.
    """
    return f"input(scan(scan({column},{position},';'), -1, ' '), 8.)"

## Add Tot_Amt_1 ... Tot_Amt_4 one at a time; each eval returns a new table object
## that carries the columns added so far
staged_tbl = castbl
for position in (1, 2, 3, 4):
    staged_tbl = staged_tbl.eval(f"Tot_Amt_{position} = {cas_function(position)}", inplace=False)

## Write the result out as Final_table, replacing the previous copy
staged_tbl.copyTable(casout = dict(name='Final_table', caslib='casuser', replace=True))

## Reference the rebuilt table and preview it
finalTbl = conn.CASTable('final_table', caslib = 'casuser')

finalTbl.head()

Unnamed: 0,column_A,Tot_Amt_1,Tot_Amt_2,Tot_Amt_3,Tot_Amt_4
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3,2.0,5.0,0.0,3.0
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4,3.0,6.0,1.0,4.0
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5,4.0,7.0,2.0,5.0
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49,25.0,101.0,85.0,49.0


In [65]:
##
## Create user defined function
##

## Load the fcmpact action set, which is needed to create user defined functions
conn.loadActionSet('fcmpact')


## Create UDF using PROC FCMP
## The routine takes a character column and a statement position and returns the
## number found at the end of that statement (same SCAN/SCAN/INPUT logic as above)
myUDF = '''
    function get_numeric_value(colname $, position);
        
        /* Get the statement by position */
        get_statement_from_position = scan(colname, position,';');
        
        /* Get the number from the statement and convert to a numeric column */
        get_numeric_value = input(scan(get_statement_from_position, -1, ' '), 8.);
        
        /* Return numeric value */
        return(get_numeric_value);
        
    endsub;
'''

## Create the function and store it
conn.addroutines(routineCode = myUDF,                 ## PROC FCMP code to create UDF
                 package = "MyCompanyFunction",       ## Names the package of function(s)
                 saveTable = True,                    ## The run output reports FUNCTIONS.sashdat being saved
                 appendTable = True,                  ## Presumably appends to an existing function table - confirm in the fcmpact docs
                 funcTable = {'name':"Functions",     ## Table that stores the function(s); referenced by cmplib below
                              'caslib':'casuser', 
                              'replace':True})


## Point to the new stored function(s)
## Must run after addroutines so that eval expressions can call get_numeric_value
conn.setSessOpt(cmplib='casuser.Functions')

NOTE: Added action set 'fcmpact'.
NOTE: Cloud Analytic Services saved the file FUNCTIONS.sashdat in caslib CASUSER(Peter.Styliadis@sas.com).


In [66]:
## Use the UDF created with PROC FCMP

## One assignment per output column; the UDF receives the column and the statement position
udf_assignments = [
    "Tot_Amt_1 = get_numeric_value(column_A, 1)",
    "Tot_Amt_2 = get_numeric_value(column_A, 2)",
    "Tot_Amt_3 = get_numeric_value(column_A, 3)",
    "Tot_Amt_4 = get_numeric_value(column_A, 4)",
]

## Apply them in order; each eval returns a new table object with one more column
udf_tbl = castbl
for assignment in udf_assignments:
    udf_tbl = udf_tbl.eval(assignment, inplace=False)

## Write the result out as Final_table, replacing the previous copy
udf_tbl.copyTable(casout = dict(name='Final_table', caslib='casuser', replace=True))

## Reference the rebuilt table and preview it
finalTbl = conn.CASTable('final_table', caslib = 'casuser')

finalTbl.head()

Unnamed: 0,column_A,Tot_Amt_1,Tot_Amt_2,Tot_Amt_3,Tot_Amt_4
0,Tot Amt = USD 2; Tot Count = 5; Min Amt = 0; Min Count = 3,2.0,5.0,0.0,3.0
1,Tot Amt = USD 3; Tot Count = 6; Min Amt = 1; Min Count = 4,3.0,6.0,1.0,4.0
2,Tot Amt = USD 4; Tot Count = 7; Min Amt = 2; Min Count = 5,4.0,7.0,2.0,5.0
3,Tot Amt = USD 25; Tot Count = 101; Min Amt = 85; Min Count = 49,25.0,101.0,85.0,49.0


## Terminate the CAS session

In [None]:
## End the CAS session opened at the top of the notebook; `conn` cannot be used after this
conn.terminate()