# Read Multiple CSV Files

In [48]:
## Packages
import swat
import os
import pandas as pd
import numpy as np

## Options
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_colwidth', None)

## custom personal module to connect to my CAS environment
try:
    from casConnect import connect_to_cas
except ImportError:
    ## Only an unavailable module is expected here; any other error should surface
    print('casConnect package not available')

## Make a Connection to CAS (REQUIRED: MODIFY CONNECTION INFORMATION)

##### To connect to the CAS server you will need:
1. the host name, 
2. the port number, 
3. your user name, and your password.

Visit the documentation [Getting Started with SAS® Viya® for Python](https://go.documentation.sas.com/doc/en/pgmsascdc/default/caspg3/titlepage.htm) for more information about connecting to CAS.

**Be aware that connecting to the CAS server can be implemented in various ways, so you might need to see your system administrator about how to make a connection. Please follow company policy regarding authentication.**

In [49]:
##
## Connect to CAS
##

## General connection syntax
# conn = swat.CAS(host, port, username, password)

## SAS Viya for Learners 3.5 connection
# hostValue = os.environ.get('CASHOST')
# portValue = os.environ.get('CASPORT')
# passwordToken=os.environ.get('SAS_VIYA_TOKEN')
# conn = swat.CAS(hostname=hostValue, port=portValue, password=passwordToken)

## Personal connection
try:
    conn = connect_to_cas()
    print('CAS connection succesful')
    print(conn)
except Exception as err:
    ## Keep the notebook running, but report why the connection failed.
    ## The cells below use `conn`, so they cannot run until a connection is made.
    print('No connection')
    print(f'{type(err).__name__}: {err}')

CAS connection succesful
CAS('ssemonthly.demo.sas.com', 443, protocol='https', name='py-session-5', session='28d7782c-d9f0-c042-a005-c963e6d23def')


In [50]:
conn.caslibInfo()

Unnamed: 0,Name,Type,Description,Path,Definition,Subdirs,Local,Active,Personal,Hidden,Transient
0,CASUSER(Peter.Styliadis@sas.com),PATH,Personal File System Caslib,/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/,,1.0,0.0,1.0,1.0,0.0,1.0
1,cpgretl,PATH,,/cas/data/caslibs/cpgretail/,,1.0,0.0,0.0,0.0,0.0,0.0
2,CPSAppData,PATH,,/cas/data/caslibs/CPSAppData/,,0.0,0.0,0.0,0.0,0.0,0.0
3,EDUPub,PATH,,/cas/data/caslibs/edupub/,,0.0,0.0,0.0,0.0,0.0,0.0
4,EP_CommunityCollege,PATH,,/cas/data/caslibs/educationpractice/EP_CommunityCollege/,,0.0,0.0,0.0,0.0,0.0,0.0
5,EP_DOE,PATH,,/cas/data/caslibs/educationpractice/EP_DOE/,,0.0,0.0,0.0,0.0,0.0,0.0
6,EP_Forecasting,PATH,,/cas/data/caslibs/educationpractice/EP_Forecasting/,,0.0,0.0,0.0,0.0,0.0,0.0
7,EP_K12,PATH,,/cas/data/caslibs/educationpractice/EP_K12/,,0.0,0.0,0.0,0.0,0.0,0.0
8,EP_SAfE,PATH,,/cas/data/caslibs/educationpractice/EP_SAfE/,,0.0,0.0,0.0,0.0,0.0,0.0
9,EP_Shapes,PATH,,/cas/data/caslibs/educationpractice/EP_Shapes/,,0.0,0.0,0.0,0.0,0.0,0.0


## Enter your connection information to CAS below

In [51]:
## conn = swat.CAS()

## Prepare data

I created a script to load and prepare the CSV files in the CAS server for this post. This should work in your environment. 

The script does the following:

- Loads the WARRANTY_CLAIMS_0117.sashdat file from the Samples caslib into memory.
- Modifies the in-memory table by renaming and dropping columns.
- Adds a sub directory in the Casuser caslib named csv_file_blogs.
- Saves a CSV file for each distinct year in the csv_file_blogs folder (5 CSV files).
- Display files in the Casuser caslib and the csv_file_blogs folder.

In [52]:
def prep_data():
    """
    Load and prepare the WARRANTY_CLAIMS_0117.sashdat file in CAS.

    Uses the notebook-level ``conn`` CAS connection. The file is loaded from the
    samples caslib into the casuser caslib as the in-memory table
    ``warranty_claims``. Each column that has a label is renamed to that label
    (spaces replaced with underscores), and only the columns listed in
    ``keepColumns`` are kept.

    Returns
    -------
    The CASTable object that references casuser.warranty_claims.
    """
    ## Load the WARRANTY_CLAIMS_0117.sashdat from the Samples caslib into memory in Casuser
    conn.loadTable(path='WARRANTY_CLAIMS_0117.sashdat', caslib='samples',
                   casout={'name':'warranty_claims',
                           'caslib':'casuser',
                           'replace':True})
    ##
    ## DATA PREP
    ##

    ## Reference the CAS table in an object
    castbl = conn.CASTable('warranty_claims', caslib = 'casuser')

    ## Store the column names and labels in a dataframe
    df_col_names = castbl.columnInfo()['ColumnInfo'].loc[:,['Column','Label']]

    ## Create a list of dictionaries of how to rename each column using the column labels
    renameColumns = []
    for colName, label in zip(df_col_names['Column'], df_col_names['Label']):
        ## A column without a usable label (empty or missing) keeps its current name;
        ## renaming it would otherwise produce an empty column name or fail on a non-string label
        if not isinstance(label, str) or not label.strip():
            continue
        renameColumns.append(dict(name=colName, rename=label.replace(' ','_')))

    ## List of columns to keep in the CAS table (names as they are after the rename)
    keepColumns = {'Campaign_Type', 'Platform','Trim_Level','Make','Model_Year','Engine_Model',
                   'Vehicle_Assembly_Plant','Claim_Repair_Start_Date', 'Claim_Repair_End_Date'}

    ## Rename and drop columns to make the table easier to use
    castbl.alterTable(columns = renameColumns, keep = keepColumns)

    ## Return the CASTable object reference
    return castbl



def save_cas_table_as_csv_files(cas_table_reference):
    """
    Create a subdirectory in Casuser and save multiple CSV files in it.

    One CSV file is written for each distinct Model_Year value in the table,
    named csv_file_blogs/warranty_claims_<year>.csv in the casuser caslib.
    The in-memory table is dropped once the files are saved.
    Uses the notebook-level ``conn`` CAS connection.

    Parameters
    ----------
    cas_table_reference : CASTable reference to a table with a Model_Year column.
    """
    ## Create a subdirectory in the Casuser caslib named csv_file_blogs
    conn.addCaslibSubdir(name = 'casuser', path = 'csv_file_blogs')

    ## Create a CSV file for each year.
    ## The years come from the table passed in, not from a notebook-level variable.
    for year in list(cas_table_reference.Model_Year.unique()):
        (cas_table_reference
         .query(f"Model_Year ='{year}'")
         .save(name = f'csv_file_blogs/warranty_claims_{year}.csv',
               caslib = 'casuser',
               replace = True)
        )

    ## Drop the CAS Table
    cas_table_reference.dropTable()
    
## Create the CAS table
## (prep_data loads WARRANTY_CLAIMS_0117.sashdat into Casuser and returns a CASTable reference)
castbl = prep_data()

## Save the CAS table as a CSV file for each year
## NOTE: the function also drops the in-memory table when it finishes,
## so castbl no longer refers to a loaded table after this call
save_cas_table_as_csv_files(castbl)

NOTE: Cloud Analytic Services made the file WARRANTY_CLAIMS_0117.sashdat available as table WARRANTY_CLAIMS in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services saved the file csv_file_blogs/warranty_claims_2015.csv in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services saved the file csv_file_blogs/warranty_claims_2016.csv in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services saved the file csv_file_blogs/warranty_claims_2017.csv in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services saved the file csv_file_blogs/warranty_claims_2018.csv in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services saved the file csv_file_blogs/warranty_claims_2019.csv in caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services dropped table warranty_claims from caslib CASUSER(Peter.Styliadis@sas.com).


## View the new files

In [53]:
conn.fileInfo(includeDirectories = True, caslib = 'casuser')

Unnamed: 0,Permission,Owner,Group,Name,Size,Encryption,Time,ModTime
0,-rwxr-xr-x,sas,sas,cars.sas7bdat,139264,,2023-02-23T14:21:31+00:00,1992781000.0
1,-rwxr-xr-x,sas,sas,previoussales.sas7bdat,73728,,2023-04-26T20:22:48+00:00,1998160000.0
2,-rwxr-xr-x,sas,sas,VTI.sashdat,413080,NONE,2022-10-11T13:40:38+00:00,1981115000.0
3,-rwxr-xr-x,sas,sas,hmeq.sashdat,630384,NONE,2022-10-13T17:56:59+00:00,1981303000.0
4,-rwxr-xr-x,sas,sas,tsa_claims_raw.csv,34936205,,2023-01-16T13:13:53+00:00,1989494000.0
5,-rwxr-xr-x,sas,sas,warranty_demo.csv,53297896,,2023-03-02T12:05:27+00:00,1993378000.0
6,-rwxr-xr-x,sas,sas,warranty_final.sashdat,68666080,NONE,2023-05-09T18:16:05+00:00,1999275000.0
7,-rwxr-xr-x,sas,sas,cars.parquet,4096,NONE,2022-11-17T14:19:19+00:00,1984314000.0
8,-rwxr-xr-x,sas,sas,warranty.sashdat,104227048,NONE,2023-04-05T20:51:09+00:00,1996347000.0
9,-rwxr-xr-x,sas,sas,warranty.sashdatxfdas.sashdat,104227048,NONE,2023-04-05T20:42:54+00:00,1996347000.0


To view the files in a subdirectory in a caslib, specify the folder name in the path parameter. The above script created the subdirectory csv_file_blogs with 5 CSV files in it.

In [54]:
conn.fileInfo(path = 'csv_file_blogs', caslib = 'casuser')

Unnamed: 0,Permission,Owner,Group,Name,Size,Encryption,Time,ModTime
0,-rwxr-xr-x,sas,sas,warranty_claims_2015.csv,144481,,2023-05-12T12:50:59+00:00,1999515000.0
1,-rwxr-xr-x,sas,sas,warranty_claims_2016.csv,1853574,,2023-05-12T12:50:59+00:00,1999515000.0
2,-rwxr-xr-x,sas,sas,warranty_claims_2017.csv,3994542,,2023-05-12T12:51:00+00:00,1999515000.0
3,-rwxr-xr-x,sas,sas,warranty_claims_2018.csv,2485684,,2023-05-12T12:51:01+00:00,1999515000.0
4,-rwxr-xr-x,sas,sas,warranty_claims_2019.csv,197103,,2023-05-12T12:51:01+00:00,1999515000.0


## Load all of the CSV files

Check to see the available CAS tables in the Casuser caslib.

In [55]:
conn.tableInfo(caslib = 'casuser')

NOTE: No tables are available in caslib CASUSER(Peter.Styliadis@sas.com) of Cloud Analytic Services.


Load all the CSV files from the csv_file_blogs subdirectory in the Casuser caslib into memory in a single CAS table. Name the CAS table allCSVFiles.

In [56]:
## Load every CSV file in the csv_file_blogs subdirectory into one CAS table
conn.loadTable(
    path = 'csv_file_blogs',                 ## Subdirectory that holds the CSV files
    caslib = 'casuser',                      ## Input caslib
    importOptions = dict(                    ## Import options
        fileType = 'CSV',
        multiFile = True,
    ),
    casOut = dict(                           ## Output CAS table information
        name = 'allCSVFiles',
        caslib = 'casuser',
        replace = True,
    ),
)

NOTE: The file, '/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/csv_file_blogs/warranty_claims_2015.csv' was used to create the CAS Table column names.
NOTE: The CSV file table load for table, 'allCSVFiles' produced 153217 rows from 5 files.
NOTE: Cloud Analytic Services made the file csv_file_blogs available as table ALLCSVFILES in caslib CASUSER(Peter.Styliadis@sas.com).


Notice a single CAS table was created.

In [57]:
conn.tableInfo(caslib = 'casuser')

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,ALLCSVFILES,153217,9,0,utf-8,2023-05-12T12:51:02+00:00,2023-05-12T12:51:02+00:00,2023-05-12T12:51:02+00:00,UTF8,1999515000.0,1999515000.0,1999515000.0,0,0,0,0,csv_file_blogs,CASUSER(Peter.Styliadis@sas.com),0,Peter.Styliadis@sas.com,,,


Reference and preview the CAS table.

In [58]:
allcsvfilesTbl = conn.CASTable('allcsvfiles', caslib = 'casuser')
allcsvfilesTbl.head()

Unnamed: 0,Vehicle_Assembly_Plant,Make,Claim_Repair_End_Date,Campaign_Type,Claim_Repair_Start_Date,Engine_Model,Model_Year,Trim_Level,Platform
0,Pittsburgh,Zeus,20554.0,Type 6,20551.0,8 cylinder,2016.0,110.0,XE
1,Charlotte,Titan,21087.0,Type 6,21084.0,8 cylinder,2016.0,95.0,XE
2,Charlotte,Zeus,21099.0,Type 6,21096.0,8 cylinder,2016.0,122.0,XE
3,Pittsburgh,Zeus,20527.0,Type 6,20524.0,4 cylinder,2016.0,110.0,XE
4,Detroit,Zeus,21071.0,Type 6,21064.0,4 cylinder,2016.0,110.0,XE


Each CSV file contains data for a specific year. View the frequency of the Model_Year column.

In [30]:
(allcsvfilesTbl   ## CAS table
 .Model_Year      ## CAS column
 .value_counts()  ## SWAT value_counts method
)

2017.0    70479
2018.0    43975
2016.0    32707
2019.0     3510
2015.0     2546
dtype: int64

## Add input file name and path columns

When loading all the CSV files in a subdirectory into a single CAS table, you can add a column that indicates the file name and path.

In [68]:
## Load every CSV file in the csv_file_blogs subdirectory into one CAS table,
## with extra columns identifying the source file of each row
conn.loadTable(
    path = 'csv_file_blogs',                 ## Subdirectory that holds the CSV files
    caslib = 'casuser',                      ## Input caslib
    importOptions = dict(                    ## Import options
        fileType = 'CSV',
        multiFile = True,
        showFile = True,                     ## Adds the fileName column
        showPath = True,                     ## Adds the path column
    ),
    casOut = dict(                           ## Output CAS table information
        name = 'allCSVFiles_path_info',
        caslib = 'casuser',
        replace = True,
    ),
)

NOTE: The file, '/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/csv_file_blogs/warranty_claims_2015.csv' was used to create the CAS Table column names.
NOTE: The CSV file table load for table, 'allCSVFiles_path_info' produced 153217 rows from 5 files.
NOTE: Cloud Analytic Services made the file csv_file_blogs available as table ALLCSVFILES_PATH_INFO in caslib CASUSER(Peter.Styliadis@sas.com).


A second CAS table was created called ALLCSVFILES_PATH_INFO. Notice that this CAS table has two additional columns.

In [66]:
conn.tableInfo(caslib = 'casuser')

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,MultiPart,SourceName,SourceCaslib,Compressed,Creator,Modifier,SourceModTimeFormatted,SourceModTime
0,ALLCSVFILES,153217,9,0,utf-8,2023-05-12T12:51:02+00:00,2023-05-12T12:51:02+00:00,2023-05-12T12:51:12+00:00,UTF8,1999515000.0,1999515000.0,1999515000.0,0,0,0,0,csv_file_blogs,CASUSER(Peter.Styliadis@sas.com),0,Peter.Styliadis@sas.com,,,
1,ALLCSVFILES_PATH_INFO,153217,11,0,utf-8,2023-05-12T13:11:13+00:00,2023-05-12T13:11:13+00:00,2023-05-12T13:11:13+00:00,UTF8,1999516000.0,1999516000.0,1999516000.0,0,0,0,0,csv_file_blogs,CASUSER(Peter.Styliadis@sas.com),0,Peter.Styliadis@sas.com,,,


Reference and preview the new CAS table. Notice that the path and fileName columns were added to the CAS table.

In [67]:
allcsvfiles_path_infoTbl = conn.CASTable('allcsvfiles_path_info', caslib = 'casuser')
allcsvfiles_path_infoTbl.head()

Unnamed: 0,path,fileName,Vehicle_Assembly_Plant,Make,Claim_Repair_End_Date,Campaign_Type,Claim_Repair_Start_Date,Engine_Model,Model_Year,Trim_Level,Platform
0,/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/csv_file_blogs,warranty_claims_2017.csv,Pittsburgh,Zeus,20819.0,Type 6,20816.0,4 cylinder,2017.0,104.0,XE
1,/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/csv_file_blogs,warranty_claims_2017.csv,Charlotte,Zeus,20845.0,Type 6,20842.0,4 cylinder,2017.0,110.0,XE
2,/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/csv_file_blogs,warranty_claims_2017.csv,Detroit,Zeus,20815.0,Type 6,20812.0,4 cylinder,2017.0,110.0,XE
3,/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/csv_file_blogs,warranty_claims_2017.csv,Pittsburgh,Zeus,20858.0,Type 6,20855.0,4 cylinder,2017.0,110.0,XE
4,/cas/data/caslibs/casuserlibraries/peter.styliadis@sas.com/csv_file_blogs,warranty_claims_2017.csv,Pittsburgh,Zeus,20694.0,Type 6,20691.0,4 cylinder,2017.0,110.0,XE


View how many rows were in each CSV file.

In [69]:
(allcsvfiles_path_infoTbl
 .fileName
 .value_counts()
)

warranty_claims_2017.csv    70479
warranty_claims_2018.csv    43975
warranty_claims_2016.csv    32707
warranty_claims_2019.csv     3510
warranty_claims_2015.csv     2546
dtype: int64

## Delete all demo files

Get a list of all the files in the csv_file_blogs subdirectory.

In [40]:
## Take the Name column of the fileInfo result for the csv_file_blogs folder
csv_folder_info = conn.fileInfo(path = 'csv_file_blogs', caslib = 'casuser')
allFiles = csv_folder_info['FileInfo']['Name'].to_list()

allFiles

['warranty_claims_2015.csv',
 'warranty_claims_2016.csv',
 'warranty_claims_2017.csv',
 'warranty_claims_2018.csv',
 'warranty_claims_2019.csv']

Delete each CSV file.

In [41]:
## Remove each listed file from the csv_file_blogs folder in Casuser
for csv_name in allFiles:
    source_path = f'csv_file_blogs/{csv_name}'
    conn.deleteSource(source = source_path, caslib = 'casuser')

NOTE: Cloud Analytic Services removed the source data csv_file_blogs/warranty_claims_2015.csv from caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services removed the source data csv_file_blogs/warranty_claims_2016.csv from caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services removed the source data csv_file_blogs/warranty_claims_2017.csv from caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services removed the source data csv_file_blogs/warranty_claims_2018.csv from caslib CASUSER(Peter.Styliadis@sas.com).
NOTE: Cloud Analytic Services removed the source data csv_file_blogs/warranty_claims_2019.csv from caslib CASUSER(Peter.Styliadis@sas.com).


Delete the subdirectory csv_file_blogs.

In [42]:
conn.deleteSource(source = 'csv_file_blogs', caslib = 'casuser')

NOTE: Cloud Analytic Services removed the source data csv_file_blogs from caslib CASUSER(Peter.Styliadis@sas.com).


## Terminate the CAS session

In [17]:
conn.terminate()