# Drop duplicate rows

In [31]:
## Packages
import swat
import sys
import os
import pandas as pd


## Options
## Show up to 50 columns and do not truncate long cell values when displaying DataFrames
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', None)

## Optional helper: casauth is a personal package used later to create the CAS connection.
## Only ImportError is caught so that any other failure is reported instead of being hidden.
try:
    from casauth import CASAuth
    print('Imported personal custom CAS auth package')
except ImportError:
    print('casauth package not available')

## Record the versions used to produce the saved outputs.
## Only the part of sys.version before the first "|" is printed.
print(f'Python version:{sys.version.split("|")[0]}')
print(f'swat version:{swat.__version__}')
print(f'pandas version:{pd.__version__}')

Imported personal custom CAS auth package
Python version:3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]
swat version:1.13.1
pandas version:1.5.3


## Connect to CAS
This cell uses my personal CAS connection information; yours will differ.

In [32]:
try:
    ## Read the credentials path and the SSL CA list path from environment variables
    path = os.getenv('CAS_CREDENTIALS')
    pem_file = os.getenv('CAS_CLIENT_SSL_CA_LIST')
    conn = CASAuth(path, ssl_ca_list = pem_file)
except Exception as err:
    ## Covers a missing CASAuth name (the optional import failed) as well as
    ## connection problems. Show the reason instead of hiding it, then point
    ## to the manual alternative below.
    print(f'CASAuth connection not created ({type(err).__name__}: {err})')
    print('Use your own connection information with the swat.CAS method')

#########################################
##  Enter your connection information  ##
#########################################
## conn = swat.CAS()

CAS Connection created


Check the connection and version of Viya.

In [33]:
## Run the about action and display only the 'Viya Version' entry of its 'About' result
conn.about()['About']['Viya Version']

NOTE: Grid node action status report: 5 nodes, 9 total actions executed.


'Stable 2023.05'

## Create demonstration data
Create a simple DataFrame and load it as a distributed CAS table. 

NOTE: This example uses small data for training purposes. Typically the CAS server is used for big data.

In [34]:
## Column values for the demonstration data (the first two rows are exact duplicates)
brand_values = ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie']
style_values = ['cup', 'cup', 'cup', 'pack', 'pack']
rating_values = [4, 4, 3.5, 15, 5]

## Assemble the local pandas DataFrame from the column lists
df = pd.DataFrame({'brand': brand_values,
                   'style': style_values,
                   'rating': rating_values})

## Upload the DataFrame to CAS; casout names the target table and caslib
upload_target = {'name':'dup_tbl', 'caslib':'casuser', 'replace':True}
castbl = conn.upload_frame(df, casout = upload_target)

## Show the type of the returned reference and the first rows of the CAS table
display(type(castbl), castbl.head())

NOTE: Cloud Analytic Services made the uploaded file available as table DUP_TBL in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: The table DUP_TBL has been created in caslib CASUSER(Peter.Styliadis@sas.com) from binary data uploaded to Cloud Analytic Services.


swat.cas.table.CASTable

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


## Using the Pandas API in the SWAT package (drop_duplicates)

Remove duplicate rows based on all columns and create a new CAS table.

In [35]:
## Remove rows that are duplicated across all columns and write the result to a new CAS table.
## 'replace':True overwrites an existing table of the same name, so this cell can be
## re-run in the same session (the upload cell uses the same option).
no_dup_castbl = castbl.drop_duplicates(casout = {'name':'drop_dups', 
                                                 'caslib':'casuser', 
                                                 'replace':True})

## Display the returned table reference
no_dup_castbl

CASTable('drop_dups', caslib='casuser')

In [38]:
## List the tables in the casuser caslib to confirm the new table exists and check its row count
conn.tableInfo(caslib = 'casuser')

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,DUP_TBL,5,3,0,utf-8,2023-07-27T13:00:53+00:00,2023-07-27T13:00:53+00:00,2023-07-27T13:00:55+00:00,UTF8,2006082000.0,2006082000.0,2006082000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-07-27T13:00:53+00:00,2006082000.0
1,DROP_DUPS,4,3,0,utf-8,2023-07-27T13:00:55+00:00,2023-07-27T13:00:55+00:00,2023-07-27T13:00:55+00:00,UTF8,2006082000.0,2006082000.0,2006082000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


The drop_duplicates method returns a reference to the new CAS table.

In [39]:
## Show the Python type of the returned object and its representation
display(type(no_dup_castbl), no_dup_castbl)

swat.cas.table.CASTable

CASTable('drop_dups', caslib='casuser')

In [40]:
## Preview the de-duplicated table; the repeated Yum Yum / cup / 4.0 row now appears once
no_dup_castbl.head()

Unnamed: 0,brand,style,rating
0,Indomie,pack,15.0
1,Indomie,pack,5.0
2,Yum Yum,cup,4.0
3,Indomie,cup,3.5


In [36]:
## NOTE(review): this repeats the preview in the previous cell and was executed out of
## order (In [36] after In [40]); consider removing one of the two cells.
no_dup_castbl.head()

Unnamed: 0,brand,style,rating
0,Indomie,pack,15.0
1,Indomie,pack,5.0
2,Yum Yum,cup,4.0
3,Indomie,cup,3.5


## Remove duplicate rows based on specific columns

To remove duplicates based on specific columns, pass a single column name or a list of column names to the `subset` parameter.

In [44]:
## Treat rows as duplicates when they match on brand and style only (rating is ignored).
## 'replace':True overwrites an existing table of the same name, so this cell can be
## re-run in the same session.
no_dup_subset = castbl.drop_duplicates(subset=['brand','style'], 
                                       casout={'name':'drop_dups_subset', 
                                               'caslib':'casuser', 
                                               'replace':True})

## Preview the result: one row remains for each brand/style combination
no_dup_subset.head()

Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Indomie,pack,5.0
2,Indomie,cup,3.5


In [27]:
## List the tables in the casuser caslib; DROP_DUPS_SUBSET should now be present
## NOTE(review): the saved output below is from an earlier run (In [27], dated 2023-07-21)
conn.tableInfo(caslib = 'casuser')

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,DUP_TBL,5,3,0,utf-8,2023-07-21T14:10:41+00:00,2023-07-21T14:10:41+00:00,2023-07-21T15:57:58+00:00,UTF8,2005568000.0,2005568000.0,2005574000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-07-21T14:10:41+00:00,2005568000.0
1,DROP_DUPS,4,3,0,utf-8,2023-07-21T14:10:45+00:00,2023-07-21T14:10:45+00:00,2023-07-21T15:45:16+00:00,UTF8,2005568000.0,2005568000.0,2005574000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
2,DROP_DUPS_SUBSET,3,3,0,utf-8,2023-07-21T15:57:58+00:00,2023-07-21T15:57:58+00:00,2023-07-21T15:57:58+00:00,UTF8,2005574000.0,2005574000.0,2005574000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


## Using a native CAS action (deduplication.deduplicate)

[deduplicate Action Documentation](https://go.documentation.sas.com/doc/en/pgmsascdc/default/caspg/cas-deduplication-deduplicate.htm)

I'll confirm the castbl variable is referencing the original CAS table.

In [47]:
## Display the table reference to confirm it still points to the original DUP_TBL table
castbl

CASTable('DUP_TBL', caslib='CASUSER(Peter.Styliadis@sas.com)')

First, I'll load the deduplication action set.

In [41]:
## Load the deduplication action set so the deduplicate action can be called on the table
conn.loadActionSet('deduplication')

NOTE: Added action set 'deduplication'.


Next I'll remove duplicate rows based on all columns using the action. This is a bit more involved.

In [50]:
## Group by every column so that rows matching on all columns share the same group key
## NOTE(review): this assignment is made on castbl itself and appears to remain set
## after the cell finishes - confirm before reusing castbl for other actions.
colNames = castbl.columns.to_list()
castbl.groupby = colNames

## Run the deduplicate action with noDupkeys enabled and write the result to a new table.
## 'replace':True overwrites an existing table of the same name, so this cell can be
## re-run in the same session.
castbl.deduplicate(noDupkeys = True, 
                   casout = {'name':'no_dup_rows_action', 
                             'caslib':'casuser', 
                             'replace':True})

NOTE: There were 5 rows read from the table DUP_TBL.
NOTE: The table no_dup_rows_action has 4 rows and 3 columns.


Alternate method: run the same steps inside a `with castbl:` block. This temporarily adds the parameters to the CASTable object.

In [54]:
## Use the CASTable as a context manager while setting the groupby parameter
with castbl:
    ## Group by every column so that rows matching on all columns share the same group key
    colNames = castbl.columns.to_list()
    castbl.groupby = colNames

    ## 'replace':True overwrites an existing table of the same name, so this cell can be
    ## re-run in the same session.
    castbl.deduplicate(noDupkeys = True, 
                       casout = {'name':'no_dup_rows_with', 
                                 'caslib':'casuser', 
                                 'replace':True})

NOTE: There were 5 rows read from the table DUP_TBL.
NOTE: The table no_dup_rows_with has 4 rows and 3 columns.


In [51]:
## List the tables in the casuser caslib to confirm the action created its output table
## NOTE(review): the saved output was produced by In [51], before the `with` cell (In [54]) ran,
## so NO_DUP_ROWS_WITH is not listed in it.
conn.tableInfo(caslib = 'casuser')

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,DUP_TBL,5,3,0,utf-8,2023-07-27T13:00:53+00:00,2023-07-27T13:00:53+00:00,2023-07-27T13:20:30+00:00,UTF8,2006082000.0,2006082000.0,2006083000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-07-27T13:00:53+00:00,2006082000.0
1,DROP_DUPS,4,3,0,utf-8,2023-07-27T13:00:55+00:00,2023-07-27T13:00:55+00:00,2023-07-27T13:01:12+00:00,UTF8,2006082000.0,2006082000.0,2006082000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
2,DROP_DUPS_SUBSET,3,3,0,utf-8,2023-07-27T13:02:57+00:00,2023-07-27T13:02:57+00:00,2023-07-27T13:02:57+00:00,UTF8,2006082000.0,2006082000.0,2006082000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
3,NO_DUP_ROWS_ACTION,4,3,0,utf-8,2023-07-27T13:20:30+00:00,2023-07-27T13:20:30+00:00,2023-07-27T13:20:30+00:00,UTF8,2006083000.0,2006083000.0,2006083000.0,0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


In [53]:
## Create a reference to the table written by the deduplicate action
no_dup_rows = conn.CASTable('no_dup_rows_action', caslib = 'casuser')
## Preview the de-duplicated rows
no_dup_rows.head()

Unnamed: 0,brand,style,rating
0,Indomie,pack,15.0
1,Indomie,pack,5.0
2,Yum Yum,cup,4.0
3,Indomie,cup,3.5


In [56]:
## Terminate the CAS session now that the demonstration is complete
conn.terminate()