# Read a PDF file into a Table

In [13]:
## Packages
import swat
import sys
import os
import pandas as pd
import numpy as np

## My custom package to connect to the CAS Server. Will not work in your environment.
## Only a missing/unimportable package is tolerated here; any other error raised
## while importing it is left to surface instead of being silently hidden.
try:
    from casauth import CASAuth
    print('Imported personal custom CAS auth package')
except ImportError:
    # CASAuth stays undefined in this case; use one of the swat.CAS(...) connection
    # templates in the connection cell instead.
    print('casauth package not available')


## Report the interpreter and library versions this notebook was run with.
python_build = sys.version.split("|")[0]  # keep only the text before the first "|", if any
version_report = [
    f'Python version:{python_build}',
    f'swat version:{swat.__version__}',
    f'pandas version:{pd.__version__}',
    f'numpy version:{np.__version__}',
]
print('\n'.join(version_report))

Imported personal custom CAS auth package
Python version:3.8.16 (default, Mar  2 2023, 03:18:16) [MSC v.1916 64 bit (AMD64)]
swat version:1.13.1
pandas version:1.5.3
numpy version:1.24.3


## Make a Connection to CAS (REQUIRED: MODIFY CONNECTION INFORMATION)

##### To connect to the CAS server you will need:
1. the host name, 
2. the port number, 
3. your user name, and your password.

Visit the documentation [Getting Started with SAS® Viya® for Python](https://go.documentation.sas.com/doc/en/pgmsascdc/default/caspg3/titlepage.htm) for more information about connecting to CAS.

**Be aware that connecting to the CAS server can be implemented in various ways, so you might need to see your system administrator about how to make a connection. Please follow company policy regarding authentication.**

In [14]:
##
## Connect to CAS
##

################################
## General connection syntax  ##
################################
# conn = swat.CAS(host, port, username, password)

############################################
## SAS Viya for Learners 3.5 connection   ##
############################################
# hostValue = os.environ.get('CASHOST')
# portValue = os.environ.get('CASPORT')
# passwordToken=os.environ.get('SAS_VIYA_TOKEN')
# conn = swat.CAS(hostname=hostValue, port=portValue, password=passwordToken)


##############################
## My Personal connection   ##
##############################
# Connection settings are read from environment variables rather than written
# into the notebook. NOTE(review): CAS_CREDENTIALS is presumably the path to a
# credentials file and CAS_CLIENT_SSL_CA_LIST the path to a CA bundle -- confirm
# against the casauth package.
try:
    path = os.getenv('CAS_CREDENTIALS')
    pem_file = os.getenv('CAS_CLIENT_SSL_CA_LIST')
    conn = CASAuth(path, ssl_ca_list = pem_file)
except Exception as err:
    # Best effort: the notebook keeps running, but the cause is reported so a
    # missing CASAuth import or a bad setting is not mistaken for something else.
    # `conn` is left undefined, so later cells need a connection created another way.
    print('No connection')
    print(f'Reason: {type(err).__name__}: {err}')

CAS Connection created


## Enter your connection information to CAS below

In [15]:
## conn = swat.CAS()

## Manually upload the PDF files to the CAS server (SAS Viya permissions required)
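You can do this by copying the PDF files into a directory on the server that CAS can access. The next cell adds that directory as a caslib.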

In [16]:
## Add the server-side folder holding the PDFs as the caslib 'my_pdfs'.
## The path is specific to the author's environment -- change it to your own folder.
pdf_caslib_settings = dict(
    name='my_pdfs',
    path='/greenmonthly-export/ssemonthly/homes/Peter.Styliadis@sas.com/Blog_PDFs/',
    subdirectories=True,
)
conn.addCaslib(**pdf_caslib_settings)

NOTE: 'my_pdfs' is now the active caslib.
NOTE: Cloud Analytic Services added the caslib 'my_pdfs'.


Unnamed: 0,Name,Type,Description,Path,Definition,Subdirs,Local,Active,Personal,Hidden,Transient
0,my_pdfs,DNFS,,/greenmonthly-export/ssemonthly/homes/Peter.St...,,1.0,1.0,1.0,0.0,0.0,0.0


In [17]:
## Show the files available through the 'my_pdfs' caslib.
file_listing_options = {'caslib': 'my_pdfs', 'allFiles': True}
conn.fileInfo(**file_listing_options)

Unnamed: 0,Permission,Owner,Group,Name,Size,Encryption,Time,ModTime
0,-rwxr-xr-x,,,PDF_Form_1.pdf,114965,,2023-10-01T13:15:03+00:00,2011785000.0


In [18]:
## Load the PDF documents from the 'my_pdfs' caslib into the table 'test' in
## caslib 'casuser', replacing any existing table of that name.
## NOTE(review): the empty path presumably means "everything in the caslib"
## and tikaConv presumably enables text extraction -- confirm in the loadTable docs.
document_import = dict(fileType='DOCUMENT', fileExtList='PDF', tikaConv=True)
output_table = dict(name='test', caslib='casuser', replace=True)

conn.loadTable(
    path='',
    caslib='my_pdfs',
    importOptions=document_import,
    casOut=output_table,
);


NOTE: Cloud Analytic Services made documents available as table TEST in caslib CASUSER(Peter.Styliadis@sas.com).


In [20]:
## Reference the table 'test' in caslib 'casuser' and preview its first rows.
table_location = {'caslib': 'casuser'}
tbl = conn.CASTable('test', **table_location)
tbl.head()

Unnamed: 0,path,fileName,fileType,fileSize,fileDate,content
0,/greenmonthly-export/ssemonthly/homes/Peter.St...,PDF_Form_1.pdf,pdf,114965,2023-10-01 13:15:03.125280,\nPeter and Mark’s Data Jedi Solutions \nPART ...


In [22]:
## Print the 'content' column of the previewed rows.
first_rows = tbl.head()
content_column = first_rows.loc[:, 'content']
print(content_column)

0    \nPeter and Mark’s Data Jedi Solutions \nPART ...
Name: Fetch, dtype: object


## Terminate the CAS session

In [12]:
## End the CAS session held by `conn`; run this once you are finished with the notebook.
conn.terminate()