In [1]:
# Change directory to VSCode workspace root so that relative path loads work correctly. Turn this addition off with the DataScience.changeDirOnImportExport setting
# ms-python.python added
import os
# Best-effort switch to the workspace root so the relative 'data/...' paths
# used in later cells resolve. A missing or inaccessible directory is ignored
# on purpose.
try:
    os.chdir(os.path.join(os.getcwd(), '../../python-tools'))
    print(os.getcwd())
except OSError:
    # Only filesystem errors are expected here. A bare except would also
    # swallow KeyboardInterrupt/SystemExit, which should be allowed to surface.
    pass


  ### 1. Read and write text data
  * Use the `open()` function with mode `rt` to read a text file.

In [2]:
# Read the entire file as a single string.
# The encoding is passed explicitly so the result does not depend on the
# platform's locale default (assumes the file is UTF-8 -- confirm).
with open('data/ai-wiki.txt', 'rt', encoding='utf-8') as f:
    data = f.read()


In [3]:
# Iterate over the lines of the file, one line at a time.
# The encoding is passed explicitly so the result does not depend on the
# platform's locale default (assumes the file is UTF-8 -- confirm).
with open('data/ai-wiki.txt', 'rt', encoding='utf-8') as f:
    for line in f:
        # process line (placeholder: the split result is discarded)
        line.split()


In [4]:
# Write chunks of text data
# with open('somefile.txt', 'wt') as f:
#     f.write(text1)
#     f.write(text2)


In [5]:
# Redirected print statement
# with open('somefile.txt', 'wt') as f:
#     print(line1, file=f)
#     print(line2, file=f)
#     ...


  ### 2. Printing to a file
  * Use the `file` keyword argument of `print()`.

In [6]:
# with open('somefile.txt', 'wt') as f:
#     print('Hello World!', file=f)


  ### 3. Printing with a different separator or line ending

In [7]:
# Default formatting: the arguments are separated by a single space
print('SBUX', 50, 91.5)


SBUX 50 91.5


In [8]:
# sep replaces the default space between the arguments
print('SBUX', 50, 91.5, sep=',')


SBUX,50,91.5


In [9]:
# end replaces the default trailing newline
print('SBUX', 50, 91.5, sep=',', end='!!\n')


SBUX,50,91.5!!


In [10]:
# By default print() ends every call with a newline, so each value lands on its own line.
# The end argument is how you suppress those newlines in the output.
for i in range(5):
    print(i)


0
1
2
3
4


In [11]:
# end=' ' replaces the newline, so all values share one line
for i in range(5):
    print(i, end=' ')


0 1 2 3 4

  ### 4. Performing i/o operations on a string
  * Use the `io.StringIO()` and `io.BytesIO()` classes to create file-like objects that operate on string data.

In [12]:
import io


In [13]:
# Create an empty in-memory text stream; write() returns the number of characters written
s = io.StringIO()
s.write('Hello World\n')


12

In [14]:
# print() can target the in-memory stream through its file argument
print('This is a test', file=s)


In [15]:
# Get all of the data written so far as one string:
# both the write() and the print() output are included
s.getvalue()


'Hello World\nThis is a test\n'

In [16]:
# Wrap a file interface around an existing string.
# read(4) returns only the first four characters.
s = io.StringIO('Hello\nWorld\n')
s.read(4)


'Hell'

In [17]:
# read() without a size returns everything after the part already consumed
s.read()


'o\nWorld\n'

In [18]:
# The io.StringIO class should only be used for text.
# If you are operating with binary data, use the io.BytesIO class instead.
# write() takes a bytes object and getvalue() returns the contents as bytes.
s_bit = io.BytesIO()
s_bit.write(b'binary data')
s_bit.getvalue()


b'binary data'

  ### 5. Reading and writing compressed data files
  *  Both `gzip` and `bz2` provide an alternative implementation of `open()` that can be used for this purpose.

In [19]:
import gzip
import bz2


In [20]:
# gzip compression

# with gzip.open('somefile.gz', 'rt') as f:
#     text = f.read()

# with gzip.open('somefile.gz', 'wt') as f:
#     f.write(text)


In [21]:
# bz2 compression

# with bz2.open('somefile.bz2', 'rt') as f:
#     text = f.read()

# with bz2.open('somefile.bz2', 'wt') as f:
#     f.write(text)


 ### 6. Iterate over fixed-sized records
 * use the `iter()` function and `functools.partial()`

In [22]:
from functools import partial


 ```
 RECORD_SIZE = 32

 with open('somefile.data', 'rb') as f:
     records = iter(partial(f.read, RECORD_SIZE), b'')
     for r in records:
         ...
 ```

 ### 7. Getting a directory listing
 * use the `os.listdir()` function to obtain a list of files in a directory

In [23]:
import os

# os.listdir() returns the name of every entry, hidden files such as .DS_Store included
names = os.listdir('data')
names


['billboard_ratings.csv',
 'pew.csv',
 'billboard.csv',
 'gapminder.tsv',
 '.DS_Store',
 'ai-wiki.txt',
 'corpus.txt',
 'country_timeseries.csv',
 'avocado.csv',
 'weather.csv',
 'billboard_songs.csv',
 'sample.txt']

In [24]:
import os.path


In [25]:
# keep only the entries of 'data' that are regular files
names = list(filter(
    lambda entry: os.path.isfile(os.path.join('data', entry)),
    os.listdir('data'),
))


In [26]:
# keep only the entries of 'data' that are directories
dirnames = list(filter(
    lambda entry: os.path.isdir(os.path.join('data', entry)),
    os.listdir('data'),
))


In [27]:
# the startswith() and endswith() string methods are handy for filtering a listing
txtfiles = list(filter(lambda entry: entry.endswith('.txt'), os.listdir('data')))
txtfiles


['ai-wiki.txt', 'corpus.txt', 'sample.txt']

In [28]:
from fnmatch import fnmatch
# shell-style wildcard matching applied to each directory entry
txtfiles = list(filter(lambda entry: fnmatch(entry, '*.txt'), os.listdir('data')))


In [29]:
# for filename matching, use the glob or fnmatch module instead.
# Note that glob results keep the 'data/' directory prefix (see the paths
# printed in the next cells), unlike the bare names from os.listdir().
import glob
txtfiles = glob.glob('data/*.txt')


In [30]:
# Collect (path, size, modification time) for every text file.
# txtfiles must hold paths that resolve from the working directory (the
# glob.glob() result), not bare names, or getsize/getmtime cannot find them.
name_sz_date = []
for name in txtfiles:
    name_sz_date.append((name, os.path.getsize(name), os.path.getmtime(name)))

for name, size, mtime in name_sz_date:
    print(name, size, mtime)


data/ai-wiki.txt 5764 1570306660.0374773
data/corpus.txt 4057920 1546981770.0
data/sample.txt 6 1571083270.1057572


In [31]:
# alternative: pair every path with its full os.stat() result
file_metadata = list(zip(txtfiles, map(os.stat, txtfiles)))
for name, meta in file_metadata:
    print(name, meta)


data/ai-wiki.txt os.stat_result(st_mode=33188, st_ino=8602359409, st_dev=16777220, st_nlink=1, st_uid=503, st_gid=20, st_size=5764, st_atime=1571083426, st_mtime=1570306660, st_ctime=1570306861)
data/corpus.txt os.stat_result(st_mode=33261, st_ino=8600501754, st_dev=16777220, st_nlink=1, st_uid=503, st_gid=20, st_size=4057920, st_atime=1566263353, st_mtime=1546981770, st_ctime=1566263032)
data/sample.txt os.stat_result(st_mode=33261, st_ino=8602665045, st_dev=16777220, st_nlink=1, st_uid=503, st_gid=20, st_size=6, st_atime=1571083270, st_mtime=1571083270, st_ctime=1571083270)


 ### 8. Add or change encoding of an opened file
 * Wrap the file with an `io.TextIOWrapper()` object.

In [32]:
import urllib.request
import io


In [33]:
# Fetch the page and decode the raw HTTP byte stream as UTF-8 text.
# The context managers release the connection even if reading or decoding
# fails; previously the response was never closed.
with urllib.request.urlopen('https://planetpython.org') as url, \
        io.TextIOWrapper(url, encoding='utf-8') as f:
    text = f.read()


In [34]:
# To change the encoding of an already open text-mode file, use its detach() method.
# First check which encoding stdout currently uses.
import sys
sys.stdout.encoding


'UTF-8'

In [35]:
# sys.stdout = io.TextIOWrapper(sys.stdout.detach(), encoding='latin-1')


In [36]:
# Opening in text mode returns a TextIOWrapper; no encoding is passed, so the default applies
f = open('data/sample.txt')
f


<_io.TextIOWrapper name='data/sample.txt' mode='r' encoding='UTF-8'>

In [37]:
# the buffered binary layer underneath the text wrapper
f.buffer


<_io.BufferedReader name='data/sample.txt'>

In [38]:
# the raw file object underneath the buffered layer
f.buffer.raw


<_io.FileIO name='data/sample.txt' mode='rb' closefd=True>

In [39]:
# detach() disconnects the topmost (text) layer of the file and returns the next lower layer.

b = f.detach()
b


<_io.BufferedReader name='data/sample.txt'>

In [40]:
# once detached, add a new top layer with the desired encoding to the returned result
f = io.TextIOWrapper(b, encoding='utf-8')
f


<_io.TextIOWrapper name='data/sample.txt' encoding='utf-8'>

In [41]:
# close the re-wrapped file when finished
f.close()


 ### 9. Wrapping existing file descriptor as a file object


In [42]:
# open a low-level file descriptor for writing
# O_TRUNC discards any previous content, so a write shorter than the existing
# file leaves no stale bytes behind. The explicit 0o644 mode keeps a newly
# created file from receiving the executable bits implied by os.open()'s
# default mode of 0o777.
fd = os.open('data/sample.txt', os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o644)


In [43]:
# turn into a proper file; the with-block closes the file object (and with it
# the wrapped descriptor) even if the write fails
with open(fd, 'wt') as f:
    f.write('hello\n')


 ### 10. Making temporary files and directories
 * The `tempfile` module has a variety of functions for performing this task.
 * make an unnamed temp file, use `tempfile.TemporaryFile`

In [44]:
from tempfile import TemporaryFile

with TemporaryFile('w+t') as f:
    # write both lines to the unnamed temp file in one call
    f.writelines(['hello world\n', 'testing\n'])

    # rewind to the start, then read back everything that was written
    f.seek(0)
    data = f.read()

# leaving the with-block closes f, at which point the temp file is destroyed


In [45]:
# alternative: manage the temporary file's lifetime by hand
f = TemporaryFile('w+t')
try:
    # use the temporary file
    # ...
    pass
finally:
    # closing destroys the temp file, even if the code above raised
    f.close()


 ### 11. Serializing python objects
 * The most common approach for serializing data is to use the `pickle` module.

In [46]:
import pickle


 * To dump an object to a file, do this:
 ```
 data = ... # Some Python object
 f = open('somefile', 'wb')
 pickle.dump(data, f)
 ```
 * To dump an object to a byte string, use `pickle.dumps()`:
 ```
 s = pickle.dumps(data)
 ```

 * To re-create an object from a byte stream, use either the `pickle.load()` or
 `pickle.loads()` functions.
 * Restore from a file
 ```
 f = open('somefile', 'rb')
 data = pickle.load(f)
 ```
 * Restore from a byte string
 ```
 data = pickle.loads(s)
 ```

In [47]:
# example: write three pickled objects back-to-back into one binary file
# (despite the .txt name, the content is pickle data, not text).
# The with-block closes the file even if one of the dumps fails.
with open('data/sample.txt', 'wb') as f:
    pickle.dump([1, 2, 3, 4], f)
    pickle.dump('hello', f)
    pickle.dump({'Apple', 'Pear', 'Banana'}, f)

In [48]:
# Each pickle.load() call reads the next object, in the order they were dumped.
# Only unpickle data you trust: pickle can execute arbitrary code while loading.
f = open('data/sample.txt', 'rb')
pickle.load(f)

[1, 2, 3, 4]

In [49]:
# second object from the same stream
pickle.load(f)

'hello'

In [50]:
# third object from the same stream
pickle.load(f)


{'Apple', 'Banana', 'Pear'}

In [51]:
# done reading: close the file that was opened for the loads
f.close()
