# Impute Missing Values

In [3]:
## Packages
import swat
import sys
import os
import pandas as pd
import numpy as np

## My custom package to connect to the CAS Server.
## It is optional: the connection cell also lists plain swat.CAS alternatives.
try:
    from casauth import CASAuth
    print('Imported personal custom CAS auth package')
except ImportError:
    ## Only a missing package is expected here. Any other error raised while
    ## importing casauth is a real problem and is allowed to surface.
    print('casauth package not available')


## Report the interpreter and package versions used to run this notebook.
## Conda builds separate extra build information with "|", so keep only the first part.
print(f'Python version:{sys.version.split("|")[0]}')

## Each module's __name__ is its package name (swat, pandas, numpy).
for pkg in (swat, pd, np):
    print(f'{pkg.__name__} version:{pkg.__version__}')

Imported personal custom CAS auth package
Python version:3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]
swat version:1.13.1
pandas version:1.5.3
numpy version:1.24.3


## Make a Connection to CAS (REQUIRED: MODIFY CONNECTION INFORMATION)

##### To connect to the CAS server you will need:
1. the host name,
2. the port number,
3. your user name, and
4. your password.

Visit the documentation [Getting Started with SAS® Viya® for Python](https://go.documentation.sas.com/doc/en/pgmsascdc/default/caspg3/titlepage.htm) for more information about connecting to CAS.

**Be aware that connecting to the CAS server can be implemented in various ways, so you might need to see your system administrator about how to make a connection. Please follow company policy regarding authentication.**

## Enter your connection information to CAS below

In [2]:
################################
## General connection syntax  ##
################################
# conn = swat.CAS(host, port, username, password)

##########################################
## SAS Viya for Learners 3.5 connection ##
##########################################
# hostValue = os.environ.get('CASHOST')
# portValue = os.environ.get('CASPORT')
# passwordToken=os.environ.get('SAS_VIYA_TOKEN')
# conn = swat.CAS(hostname=hostValue, port=portValue, password=passwordToken)

############################
## My Personal connection ##
############################
try:
    ## My connection to CAS.
    ## The credentials path and the SSL CA list are read from environment variables,
    ## so nothing sensitive is stored in the notebook.
    path = os.environ.get('CAS_CREDENTIALS')
    pem = os.environ.get('CAS_CLIENT_SSL_CA_LIST')
    conn = CASAuth(path, ssl_ca_list = pem)
except Exception as err:
    ## Stay best-effort (no re-raise), but report why the connection failed.
    ## If this branch runs, conn is not defined and every later cell will fail.
    print('No connection')
    print(f'Reason: {type(err).__name__}: {err}')

CAS Connection created


## Create the data for the demonstration

In [3]:
## Create a simple dataframe with missing values in every column.
## The data is written column by column; None marks the missing character values in col5.
df = pd.DataFrame({
    'col1': [np.nan, 3, np.nan, np.nan, 2, 3, np.nan, np.nan],
    'col2': [2, 4, np.nan, 3, 2, 4, np.nan, 3],
    'col3': [45, np.nan, 50, np.nan, np.nan, np.nan, 75, 60],
    'col4': [0, 1, np.nan, 4, 0, 1, np.nan, 4],
    'col5': ['A', 'A', 'B', None, 'A', 'A', 'B', None],
})
 
## Send the pandas dataframe to the CAS server as the table missing_data in the casuser caslib.
## replace = True overwrites an existing table of the same name, so the cell can be re-run.
castbl = conn.upload_frame(
    df,
    casout = dict(name = 'missing_data', caslib = 'casuser', replace = True),
)

## Preview the uploaded table (it has only 8 rows, so 20 shows all of them)
castbl.head(n = 20)

NOTE: Cloud Analytic Services made the uploaded file available as table MISSING_DATA in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: The table MISSING_DATA has been created in caslib CASUSER(Peter.Styliadis@sas.com) from binary data uploaded to Cloud Analytic Services.


Unnamed: 0,col1,col2,col3,col4,col5
0,,2.0,45.0,0.0,A
1,3.0,4.0,,1.0,A
2,,,50.0,,B
3,,3.0,,4.0,
4,2.0,2.0,,0.0,A
5,3.0,4.0,,1.0,A
6,,,75.0,,B
7,,3.0,60.0,4.0,


In [4]:
## List the in-memory tables in the casuser caslib to confirm the upload
conn.tableInfo(caslib = 'casuser')

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,MISSING_DATA,8,5,0,utf-8,2023-08-10T13:03:40+00:00,2023-08-10T13:03:40+00:00,2023-08-10T13:03:41+00:00,UTF8,2007292000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-08-10T13:03:40+00:00,2007292000.0


## Impute missing values using the fillna method

In [5]:
## Constant-value imputation: missing col1 values become 100 and missing col2 values become 200.
## inplace = True applies the change to castbl itself, so the following cells see the filled values.
castbl.fillna(inplace = True, value = dict(col1 = 100, col2 = 200))

In [6]:
## Preview the table after fillna: col1 and col2 no longer contain missing values
castbl.head(n = 20)

Unnamed: 0,col1,col2,col3,col4,col5
0,100.0,2.0,45.0,0.0,A
1,3.0,4.0,,1.0,A
2,100.0,200.0,50.0,,B
3,100.0,3.0,,4.0,
4,2.0,2.0,,0.0,A
5,3.0,4.0,,1.0,A
6,100.0,200.0,75.0,,B
7,100.0,3.0,60.0,4.0,


## Impute missing values using the impute CAS action

In [7]:
## The impute action lives in the dataPreprocess action set, which must be loaded before use
conn.loadActionSet(actionSet = 'dataPreprocess')

NOTE: Added action set 'dataPreprocess'.


In [8]:
## IPython help syntax (only works in Jupyter/IPython): list the actions available in the action set
conn.dataPreprocess?

[1;31mSignature:[0m   [0mconn[0m[1;33m.[0m[0mdataPreprocess[0m[1;33m([0m[1;33m*[0m[0margs[0m[1;33m,[0m [1;33m**[0m[0mkwargs[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mType:[0m        Datapreprocess
[1;31mString form:[0m <swat.cas.actions.Datapreprocess object at 0x000001C76DF3D490>
[1;31mFile:[0m        c:\users\pestyl\anaconda3\lib\site-packages\swat\cas\actions.py
[1;31mDocstring:[0m  
Data Preprocess

Actions
-------
datapreprocess.binning         : Performs unsupervised variable discretization
datapreprocess.cattrans        : Groups and encodes categorical variables using
                                 unsupervised and supervised grouping techniques
datapreprocess.discretize      : Performs supervised and unsupervised variable
                                 discretization
datapreprocess.highcardinality : Performs randomized cardinality estimation
datapreprocess.histogram       : Generates histogram bins and simple bin-based
                      

In [13]:
## Columns whose missing values will be filled in by the impute action
colsToImpute = ['col3', 'col4', 'col5']

## Run the impute action on the table and write the result to the new CAS table imputed_data.
## The imputed values are written to new columns prefixed with IMP_ (see the ResultVar output column).
castbl.impute(
    casout = dict(name = 'imputed_data', caslib = 'casuser', replace = True),
    inputs = colsToImpute,
    methodInterval = "MEDIAN",   ## How to impute numeric columns
    methodNominal = "MODE",      ## How to impute character columns
    copyAllVars = True,          ## Output all columns to the new table
)

Unnamed: 0,Variable,ImputeTech,ResultVar,N,NMiss,ImputedValueContinuous,ImputedValueNominal
0,col3,Median,IMP_col3,4.0,4.0,55.0,
1,col4,Median,IMP_col4,6.0,2.0,1.0,
2,col5,Mode,IMP_col5,6.0,2.0,,A

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(Peter.Styliadis@sas.com),imputed_data,8,8,"CASTable('imputed_data', caslib='CASUSER(Peter..."


In [11]:
## List the tables in the casuser caslib again: imputed_data should now appear next to missing_data
conn.tableInfo(caslib = 'casuser')

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,MISSING_DATA,8,5,0,utf-8,2023-08-10T13:03:41+00:00,2023-08-10T13:03:41+00:00,2023-08-10T13:04:02+00:00,UTF8,2007292000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
1,IMPUTED_DATA,8,5,0,utf-8,2023-08-10T13:04:03+00:00,2023-08-10T13:04:03+00:00,2023-08-10T13:04:03+00:00,UTF8,2007292000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


In [12]:
## Reference the table created by the impute action and preview its rows
imp_tbl = conn.CASTable(name = 'imputed_data', caslib = 'casuser')
imp_tbl.head(n = 20)

Unnamed: 0,col1,col2,IMP_col3,IMP_col4,IMP_col5
0,100.0,2.0,45.0,0.0,A
1,3.0,4.0,55.0,1.0,A
2,100.0,200.0,50.0,1.0,B
3,100.0,3.0,55.0,4.0,A
4,2.0,2.0,55.0,0.0,A
5,3.0,4.0,55.0,1.0,A
6,100.0,200.0,75.0,1.0,B
7,100.0,3.0,60.0,4.0,A


## Terminate the CAS session

In [8]:
## End the CAS session now that the demonstration is finished
conn.terminate()