# Loading a Client-Side File into Memory
Downloading data from the SAS Viya Example Data Sets Website: https://support.sas.com/documentation/onlinedoc/viya/examples.htm

In [3]:
## Packages
import swat
import pandas as pd
from casConnect import connect_to_cas ## custom personal module

##
## Connect to CAS
##

## General connection syntax
# conn = swat.CAS(host, port, username, password)

## Viya for Learners 3.5 connection
# hostValue = os.environ.get('CASHOST')
# portValue = os.environ.get('CASPORT')
# passwordToken=os.environ.get('SAS_VIYA_TOKEN')
# conn = swat.CAS(hostname=hostValue, port=portValue, password=passwordToken)

## Personal connection
conn = connect_to_cas()

## Documentation (copied from pandas drop_duplicates)

Consider dataset containing ramen rating.

In [2]:
#conn = swat.CAS() ## Enter connection information
df = pd.DataFrame({
    'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating': [4, 4, 3.5, 15, 5]
})
castbl = conn.upload_frame(df)
castbl.head()

NOTE: Cloud Analytic Services made the uploaded file available as table TMP1PWYYBVT in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: The table TMP1PWYYBVT has been created in caslib CASUSER(Peter.Styliadis@sas.com) from binary data uploaded to Cloud Analytic Services.


Unnamed: 0,brand,style,rating
0,Yum Yum,cup,4.0
1,Yum Yum,cup,4.0
2,Indomie,cup,3.5
3,Indomie,pack,15.0
4,Indomie,pack,5.0


In [None]:
castbl.head()

By default, it removes duplicate rows based on all columns.

In [33]:
castbl.drop_duplicates(casout = {'name':'remove_all_duplicates', 'caslib':'casuser'})

tbl = conn.CASTable('remove_all_duplicates')
tbl.head()

NOTE: There were 5 rows read from the table TMPTCD9OMT3.
NOTE: The table remove_duplicates has 4 rows and 3 columns.


Unnamed: 0,brand,style,rating
0,Indomie,pack,15.0
1,Indomie,pack,5.0
2,Yum Yum,cup,4.0
3,Indomie,cup,3.5


To remove duplicates on specific column(s), use subset. Order of rows is not guaranteed in the distributed CAS server.

In [37]:
castbl.drop_duplicates(subset=['brand'], casout = {'name':'remove_brand_duplicates', 'caslib':'casuser'})
tbl = conn.CASTable('remove_brand_duplicates')
tbl.head()

NOTE: There were 5 rows read from the table TMPTCD9OMT3.
NOTE: The table remove_brand_duplicates has 2 rows and 3 columns.


Unnamed: 0,brand,style,rating
0,Indomie,pack,15.0
1,Yum Yum,cup,4.0


## My tests with the cars.csv file

In [4]:
tbl = conn.read_csv('https://support.sas.com/documentation/onlinedoc/viya/exampledatasets/cars.csv', 
                    casout={'name':'cars_test', 'caslib':'casuser'})
carsdup = tbl.append(tbl, casout='cars_duplicate')
conn.tableInfo()

NOTE: Cloud Analytic Services made the uploaded file available as table CARS_TEST in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: The table CARS_TEST has been created in caslib CASUSER(Peter.Styliadis@sas.com) from binary data uploaded to Cloud Analytic Services.


Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,CARS_TEST,428,15,0,utf-8,2023-01-11T16:09:02+00:00,2023-01-11T16:09:02+00:00,2023-01-11T16:09:05+00:00,UTF8,1989073000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-01-11T16:09:02+00:00,1989073000.0
1,CARS_DUPLICATE,856,15,0,utf-8,2023-01-11T16:09:05+00:00,2023-01-11T16:09:05+00:00,2023-01-11T16:09:05+00:00,UTF8,1989073000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


In [6]:
carsdup.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945.0,33337.0,3.5,6.0,265.0,17.0,23.0,4451.0,106.0,189.0
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820.0,21761.0,2.0,4.0,200.0,24.0,31.0,2778.0,101.0,172.0
2,Acura,TSX 4dr,Sedan,Asia,Front,26990.0,24647.0,2.4,4.0,200.0,22.0,29.0,3230.0,105.0,183.0
3,Acura,TL 4dr,Sedan,Asia,Front,33195.0,30299.0,3.2,6.0,270.0,20.0,28.0,3575.0,108.0,186.0
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755.0,39014.0,3.5,6.0,225.0,18.0,24.0,3880.0,115.0,197.0


In [7]:
carsdup.drop_duplicates(casout = 'test')

NOTE: Added action set 'deduplication'.
NOTE: There were 856 rows read from the table CARS_DUPLICATE.
NOTE: The table test has 428 rows and 15 columns.


CASTable('test', caslib='CASUSER(Peter.Styliadis@sas.com)')

In [9]:
x = carsdup.drop_duplicates(casout = 'test', inplace=True)
x.head()

ERROR: The table test already exists in the session.
ERROR: The action stopped due to errors.


Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Mercedes-Benz,C240 4dr,Sedan,Europe,Rear,32280.0,30071.0,2.6,6.0,168.0,20.0,25.0,3360.0,107.0,178.0
1,Mercedes-Benz,C320 Sport 4dr,Sedan,Europe,Rear,35920.0,33456.0,3.2,6.0,215.0,19.0,26.0,3430.0,107.0,178.0
2,Mitsubishi,Galant GTS 4dr,Sedan,Asia,Front,25700.0,23883.0,3.8,6.0,230.0,18.0,26.0,3649.0,108.0,191.0
3,Chevrolet,Tahoe LT,SUV,USA,All,41465.0,36287.0,5.3,8.0,295.0,14.0,18.0,5050.0,116.0,197.0
4,Dodge,Intrepid SE 4dr,Sedan,USA,Front,22035.0,20502.0,2.7,6.0,200.0,21.0,29.0,3469.0,113.0,204.0


In [10]:
x = carsdup.drop_duplicates(casout = {'name':'test2'}, subset=['Make', 'Model'], inplace=True)
x.head()

NOTE: There were 856 rows read from the table CARS_DUPLICATE.
NOTE: The table test2 has 425 rows and 15 columns.


Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Chevrolet,Cavalier LS 2dr,Sedan,USA,Front,16385.0,15357.0,2.2,4.0,140.0,26.0,37.0,2617.0,104.0,183.0
1,Ford,F-150 Supercab Lariat,Truck,USA,All,33540.0,29405.0,5.4,8.0,300.0,14.0,18.0,5464.0,133.0,218.0
2,Honda,CR-V LX,SUV,Asia,All,19860.0,18419.0,2.4,4.0,160.0,21.0,25.0,3258.0,103.0,179.0
3,Saturn,Ion1 4dr,Sedan,USA,Front,10995.0,10319.0,2.2,4.0,140.0,26.0,35.0,2692.0,103.0,185.0
4,BMW,525i 4dr,Sedan,Europe,Rear,39995.0,36620.0,2.5,6.0,184.0,19.0,28.0,3428.0,114.0,191.0


## Test: No subset parameter specified (Default uses all columns)
If it's not specified all columns should be used to determine duplicates.

In [11]:
carsdup.drop_duplicates(casout={'name':'no_param_specified', 'caslib':'casuser'})

NOTE: There were 856 rows read from the table CARS_DUPLICATE.
NOTE: The table no_param_specified has 428 rows and 15 columns.


CASTable('no_param_specified', caslib='casuser')

Try the overwrite the table above.

In [5]:
x = carsdup.drop_duplicates(casout={'name':'no_param_specified', 'caslib':'casuser'}, inplace = True)

NOTE: Added action set 'deduplication'.
NOTE: There were 856 rows read from the table CARS_DUPLICATE.
NOTE: The table no_param_specified has 428 rows and 15 columns.


In [12]:
casout = {'name':'test','test':'test'}

In [29]:
if (type(casout)==str) or ()

True

In [35]:
type(x)

NoneType

In [11]:
conn.tableInfo()

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,CARS,428,15,0,utf-8,2023-01-09T14:06:39+00:00,2023-01-09T14:06:39+00:00,2023-01-09T14:06:44+00:00,UTF8,1988892000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-01-09T14:06:39+00:00,1988892000.0
1,CARSDUP,856,15,0,utf-8,2023-01-09T14:06:44+00:00,2023-01-09T14:06:44+00:00,2023-01-09T14:06:57+00:00,UTF8,1988892000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
2,NO_PARAM_SPECIFIED,428,15,0,utf-8,2023-01-09T14:06:57+00:00,2023-01-09T14:06:57+00:00,2023-01-09T14:06:57+00:00,UTF8,1988892000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


## Test: Specify a single column as a string in the subset parameter
The string should be converted to a list and used to determine duplicates.

In [12]:
carsdup.drop_duplicates(casout='single_column', subset='Make')

NOTE: There were 856 rows read from the table CARS_DUPLICATE.
NOTE: The table single_column has 38 rows and 15 columns.


CASTable('single_column', caslib='CASUSER(Peter.Styliadis@sas.com)')

Try to ovewrite the previous.

In [13]:
carsdup.drop_duplicates(casout='single_column', subset='Make', inplace = True)

NOTE: There were 856 rows read from the table CARS_DUPLICATE.
NOTE: The table single_column has 38 rows and 15 columns.


CASTable('single_column', caslib='CASUSER(Peter.Styliadis@sas.com)')

If the string is not a column in the CAS table, return an error.

In [14]:
carsdup.drop_duplicates(casout='test_bad_col_str', subset='Makeadfd')

ERROR: The column 'Makeadfd' does not exist in the table 'CARSDUP'.
ERROR: Failure opening table 'CARSDUP': A required column does not exist in the table.
ERROR: The action stopped due to errors.


## Test: Specify a list of columns in the subset parameter.
List should get unique of each combination. Just add the CAS table name.

In [17]:
carsdup.drop_duplicates(casout='make_model_list', subset=['Make','Model'])

ERROR: The table make_model_list already exists in the session.
ERROR: The action stopped due to errors.


Overwrite the existing table.

In [18]:
carsdup.drop_duplicates(casout='make_model_list', subset=['Make','model'], inplace = True)

NOTE: There were 856 rows read from the table CARSDUP.
NOTE: The table make_model_list has 425 rows and 15 columns.


Try using a dictionary in subset.

In [19]:
carsdup.drop_duplicates(casout={'name':'make_model_list', 'caslib':'casuser'}, subset=['Make','model'], inplace = True)

NOTE: There were 856 rows read from the table CARSDUP.
NOTE: The table make_model_list has 425 rows and 15 columns.


## Specify duplicate columns in the subset parameter

In [19]:
carsdup.drop_duplicates(casout='test_dup_columns', subset = ['Make','Model','Make','Weight'])

NOTE: There were 856 rows read from the table CARSDUP.
NOTE: The table test_dup_columns has 426 rows and 15 columns.


In [21]:
tbl1 = conn.CASTable('test_dup_columns')
tbl.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945.0,33337.0,3.5,6.0,265.0,17.0,23.0,4451.0,106.0,189.0
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820.0,21761.0,2.0,4.0,200.0,24.0,31.0,2778.0,101.0,172.0
2,Acura,TSX 4dr,Sedan,Asia,Front,26990.0,24647.0,2.4,4.0,200.0,22.0,29.0,3230.0,105.0,183.0
3,Acura,TL 4dr,Sedan,Asia,Front,33195.0,30299.0,3.2,6.0,270.0,20.0,28.0,3575.0,108.0,186.0
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755.0,39014.0,3.5,6.0,225.0,18.0,24.0,3880.0,115.0,197.0


In [24]:
carsdup.drop_duplicates(casout='test_dup_columns3', subset = ['Make','Model','Weight','Make','Make'])

NOTE: There were 856 rows read from the table CARSDUP.
NOTE: The table test_dup_columns3 has 426 rows and 15 columns.


In [25]:
tbl1 = conn.CASTable('test_dup_columns2')
tbl.head()

Unnamed: 0,Make,Model,Type,Origin,DriveTrain,MSRP,Invoice,EngineSize,Cylinders,Horsepower,MPG_City,MPG_Highway,Weight,Wheelbase,Length
0,Acura,MDX,SUV,Asia,All,36945.0,33337.0,3.5,6.0,265.0,17.0,23.0,4451.0,106.0,189.0
1,Acura,RSX Type S 2dr,Sedan,Asia,Front,23820.0,21761.0,2.0,4.0,200.0,24.0,31.0,2778.0,101.0,172.0
2,Acura,TSX 4dr,Sedan,Asia,Front,26990.0,24647.0,2.4,4.0,200.0,22.0,29.0,3230.0,105.0,183.0
3,Acura,TL 4dr,Sedan,Asia,Front,33195.0,30299.0,3.2,6.0,270.0,20.0,28.0,3575.0,108.0,186.0
4,Acura,3.5 RL 4dr,Sedan,Asia,Front,43755.0,39014.0,3.5,6.0,225.0,18.0,24.0,3880.0,115.0,197.0


In [None]:
deduplicate

In [None]:
conn.read_csv()

In [26]:
conn.tableInfo()

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,CARS,428,15,0,utf-8,2023-01-09T13:36:24+00:00,2023-01-09T13:36:24+00:00,2023-01-09T13:38:35+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,2023-01-09T13:36:24+00:00,1988891000.0
1,CARSDUP,856,15,0,utf-8,2023-01-09T13:36:27+00:00,2023-01-09T13:36:27+00:00,2023-01-09T13:38:34+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
2,TEST,428,15,0,utf-8,2023-01-09T13:36:31+00:00,2023-01-09T13:36:31+00:00,2023-01-09T13:36:31+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
3,TEST_MAKE,38,15,0,utf-8,2023-01-09T13:36:32+00:00,2023-01-09T13:36:32+00:00,2023-01-09T13:36:32+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
4,MAKE_MODEL_TEST,425,15,0,utf-8,2023-01-09T13:36:40+00:00,2023-01-09T13:36:40+00:00,2023-01-09T13:36:40+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
5,MAKE_MODEL_BAD_COL,425,15,0,utf-8,2023-01-09T13:36:40+00:00,2023-01-09T13:36:40+00:00,2023-01-09T13:36:40+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
6,TEST_DUP_COLUMNS,426,15,0,utf-8,2023-01-09T13:37:13+00:00,2023-01-09T13:37:13+00:00,2023-01-09T13:37:13+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
7,TEST_DUP_COLUMNS2,426,15,0,utf-8,2023-01-09T13:37:35+00:00,2023-01-09T13:37:35+00:00,2023-01-09T13:37:35+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,
8,TEST_DUP_COLUMNS3,426,15,0,utf-8,2023-01-09T13:38:34+00:00,2023-01-09T13:38:34+00:00,2023-01-09T13:38:34+00:00,UTF8,1988891000.0,0,0,0,,,0,Peter.Styliadis@sas.com,,,


## Specify a data type other than a list or string

In [14]:
carsdup.drop_duplicates(casout='test', subset=3)

NotImplementedError: Specify a single column or list of columns.

In [32]:
specifiedColumns=["Col1", "Col2","Col2"]
tableColumns=["Col1", "Col2", "Col3", "Col4"]

In [18]:
notintable = set(specifiedColumns).difference(set(carsdup.columns))
notintable

{'Col1', 'Col2', 'Colx'}

In [12]:
if (len(set(specifiedColumns)) == len(specifiedColumns)) == False:
    print('false')

false


In [46]:
columns = ['Make', 'Make', 'Model','Make' ]
column_list = []
duplicate_columns = set()
for column in columns:
    if column in column_list:
        duplicate_columns.add(column)
    else:
        column_list.append(column)

In [47]:
duplicate_columns

{'Make'}

In [48]:
column_list

['Make', 'Model']

In [36]:
duplicate_columns

['Make']

In [53]:
columns in ['a']

False

In [16]:
x =[1,2,3,4,5,1]

In [17]:
x

[1, 2, 3, 4, 5, 1]

In [18]:
set(x)

{1, 2, 3, 4, 5}

In [None]:
x.

## Terminate the CAS connection

In [38]:
conn.terminate()