# 01 - interacting with the file system
welcome to the first workshop, which practises the use of the `os` module 

# problem statement
we'll start by describing the problem we want to solve. if you already know all you need to solve it, you can skip the rest of this session! 

imagine you found a nice dataset you need to analyse, but instead of consisting of a single file or a set of conveniently named files sitting together in a directory, the files are scattered about in folders, sub-folders, and sub-sub-folders. there are tens, or even hundreds, of them. also, not all of the files are data files; some of them are documentation files. 

you just want a list of the data files so you can iterate over it and process them all in some way. so you now need to create a python function that takes a path to a root directory (and the filename extension to look for) as its arguments, traverses the folder tree structure, collects all the files therein that have the given filename extension, and returns a list of the files found (path + filename). 

## bonus: 
filter the data files (assumed to have `.dat` filename ending) and return the list, ordered by **decreasing file size**.

we will start easy. the `os` module allows our python session to interact with the wider world of the operating system outside of it, including the file system.

In [1]:
import os

In [2]:
# find out where the root folder of the python session is (the current
# working directory), keep it in a variable for later, and show it:
start_here = os.getcwd() # get-current-working-directory
print(start_here)

/Users/oholm/work/python_exercises/exercises/01 intro and setup


In [3]:
# os.listdir returns a python list whose elements are the names of the
# entries in the given directory
for relative_path in ('.', '../', '../../'): # here, up one level, up two levels
    print(os.listdir(relative_path))
items = os.listdir('.') # store contents of current directory

['.DS_Store', 'first_exercise.md', 'list_my_files.ipynb', 'listmyfiles.py', 'exciting_data', '.ipynb_checkpoints', 'Python Cheat Sheet | OverAPI.com.pdf']
['05_joining_data', 'xx_anomalies', 'xx_graphs_dash', '.DS_Store', 'xx_randomness', 'xx_spark_and_python', 'xx_numerical_computation', 'xx_brandwatch_api', 'xx_dask', 'xx_read_pdf', 'xx_clustering', 'xx_machine_learning ', 'xx_keras', 'xx_data_cleaning', '06_web_scraping', 'xx_data_manipulation', '03 dates_and_times', '01 intro and setup', '07_advanced_dataframe_manipulation', '04_dataframes', '02 read_write_data', 'xx_using_google_translate']
['.DS_Store', 'solutions', 'code snippets', 'README.md', 'Pipfile', 'notes', '.gitignore', 'exercises', 'graphs', '.git', 'Pipfile.lock', 'using_apis']


In [4]:
# we can create a new folder from within the python session
# (os.mkdir raises FileExistsError if 'tempfolder' is already there)
os.mkdir('tempfolder')
os.listdir('.')      # see it? 'tempfolder' now shows up in the listing

['.DS_Store',
 'first_exercise.md',
 'list_my_files.ipynb',
 'tempfolder',
 'listmyfiles.py',
 'exciting_data',
 '.ipynb_checkpoints',
 'Python Cheat Sheet | OverAPI.com.pdf']

In [5]:
# we can even create a subfolder (its parent 'tempfolder' must already exist)
os.mkdir('tempfolder/sub')
print(os.listdir('tempfolder')) # now you see it?
os.rmdir('tempfolder/sub/')     # os.rmdir removes an (empty) directory
print(os.listdir('tempfolder')) # now you don't!

['sub']
[]


In [6]:
# to make a subfolder hierarchy in one go use os.makedirs, which also
# creates any missing intermediate folders ('sub1' here)
os.makedirs('tempfolder/sub1/sub2') # (python 3.2+ also accepts exist_ok=True)
print(os.listdir('tempfolder'))
print(os.listdir('tempfolder/sub1'))
print(os.listdir('tempfolder/sub1/sub2'))

['sub1']
['sub2']
[]


In [7]:
# we can traverse the folder structure by changing the working directory;
# relative paths such as '.' are resolved against wherever we currently are
os.chdir('tempfolder/sub1/')
print(os.listdir('.')) # sub2
os.chdir('sub2/')      # relative to the new working directory
print(os.listdir('.')) # nothing here yet.
os.chdir(start_here)   # going back 'home' (the directory saved in start_here)

['sub2']
[]


In [8]:
# os.rmdir can only remove empty folders; for anything else it raises an
# OSError. we pick the message to report depending on whether that happened.
path = 'tempfolder'
outcome = "successfully deleted the directory %s"
try:
    os.rmdir(path)
except OSError:
    outcome = "deletion of the directory %s failed"
print(outcome % path)

deletion of the directory tempfolder failed


In [9]:
# 'tempfolder' is still listed: it was not empty, so it could not be removed
print(os.listdir('.'))
# uncomment to look inside the innermost folder (it is empty):
#print(os.listdir('tempfolder/sub1/sub2'))

['.DS_Store', 'first_exercise.md', 'list_my_files.ipynb', 'tempfolder', 'listmyfiles.py', 'exciting_data', '.ipynb_checkpoints', 'Python Cheat Sheet | OverAPI.com.pdf']


In [10]:
# remove the folders one by one, innermost first, so that each folder is
# already empty by the time os.rmdir is called on it
os.rmdir('tempfolder/sub1/sub2')
os.rmdir('tempfolder/sub1/')
os.rmdir('tempfolder/')
print(os.listdir('.')) # all gone now!

['.DS_Store', 'first_exercise.md', 'list_my_files.ipynb', 'listmyfiles.py', 'exciting_data', '.ipynb_checkpoints', 'Python Cheat Sheet | OverAPI.com.pdf']


In [11]:
# there is a function for expanding the user home directory; save the result
USER_HOME = os.path.expanduser('~')
# note the home directory component of the cwd:
print(os.getcwd())
# ... and compare it with the home directory itself
print(USER_HOME)

/Users/oholm/work/python_exercises/exercises/01 intro and setup
/Users/oholm


In [12]:
# we can join folder paths together using os.path.join, which inserts the
# path separator that is right for the operating system we are running on
example_dir = os.path.join(USER_HOME, 'Desktop')
print(example_dir)

/Users/oholm/Desktop


In [13]:
# many unix-like systems have this file in the home directory; if yours does
# not, the first two checks below simply print False
example_file = os.path.join(USER_HOME, '.bash_profile')
print(example_file)
print(os.path.exists(example_file)) # True if the path exists at all
print(os.path.isfile(example_file)) # True if the path is an existing file
example_dir = os.path.join(USER_HOME, 'Desktop')
print(os.path.isdir(example_dir)) # True if the path is an existing directory

/Users/oholm/.bash_profile
True
True
True


In [14]:
# classify every entry of the current directory as a file or a directory
all_items = os.listdir('.') # save the list
for item in all_items:
    if os.path.isfile(item):
        print(item, 'is a file')
        continue
    if os.path.isdir(item):
        print(item, 'is a directory')
        continue
    # neither a file nor a directory
    print('what is', item, '??')

.DS_Store is a file
first_exercise.md is a file
list_my_files.ipynb is a file
listmyfiles.py is a file
exciting_data is a directory
.ipynb_checkpoints is a directory
Python Cheat Sheet | OverAPI.com.pdf is a file


# back to the problem at hand
how do we use all this to solve the issue mentioned earlier about data files in subdirectories? 

we could use all the commands above to build a function that *recursively* takes a directory, scans it, collects any files found, and then enters any directory found and scans that, collects files, enters directories, etc...

but someone has already done that for us. there is a function for that! `os.walk(somepath)`. try it:

In [15]:
# calling os.walk does not traverse anything yet: it hands back a generator
os.walk('.')

<generator object walk at 0x10ff1f390>

`os.walk` returns a generator that recursively yields three-tuples (triples) of: 
 - the current directory path
 - directories in the current path
 - files in the current path
then it traverses into the directories in turn and repeats for that path (yielding a three-tuple).


consider this output:

In [16]:
# show every triple that os.walk yields for the example tree, one labelled
# value per line, followed by a separator line
for current_dir, directories, files in os.walk("exciting_data"):
    labelled = (('root:', current_dir), ('dirs:', directories), ('files', files))
    for label, value in labelled:
        print(label, value)
    print('-------------')

root: exciting_data
dirs: ['sub_1', 'sub_3', 'sub_2']
files ['file2.dat', 'file1.dat']
-------------
root: exciting_data/sub_1
dirs: []
files ['file3.dat']
-------------
root: exciting_data/sub_3
dirs: []
files ['file5.dat']
-------------
root: exciting_data/sub_2
dirs: ['subsub_2', 'subsub_3', 'subsub_1']
files ['file4.dat']
-------------
root: exciting_data/sub_2/subsub_2
dirs: []
files []
-------------
root: exciting_data/sub_2/subsub_3
dirs: ['thisisenough']
files []
-------------
root: exciting_data/sub_2/subsub_3/thisisenough
dirs: []
files ['file7.dat', 'file8.doc', 'file9.cat']
-------------
root: exciting_data/sub_2/subsub_1
dirs: []
files ['file6.dat', 'evil_file.dat.doc']
-------------


# concluding remark
this has been an overview of some of the features of the `os` module. you should now have all you need to put together the function described above. remember, the output should be a list containing all the data files in the folder hierarchy (and nothing but the data files in the hierarchy!). good luck and remember to use the slack channel `#python_exercises` to get any and all help you need!

In [17]:
# a minimal solution
def scrape_directory_tree(starting_dir, file_ending):
    """Collect the files below `starting_dir` whose names end with `file_ending`.

    The folder tree is traversed with `os.walk`. Each match is returned as
    the walked directory joined with the filename, so the paths are relative
    whenever `starting_dir` is relative.

    `file_ending` is handed to `str.endswith`, so it may be a single string
    or a tuple of strings.

    Returns a list of paths, in the order in which `os.walk` visits them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(starting_dir)
        for name in names
        if name.endswith(file_ending)
    ]

# a possible solution
def scrape_directory_tree(starting_dir, file_ending):
    """Return the paths of all files below `starting_dir` ending in `file_ending`.

    The folder tree is traversed with `os.walk`; every filename that ends
    with `file_ending` is collected as the walked directory joined with the
    filename.

    Parameters:
        starting_dir: path of the directory at which the search starts.
        file_ending: the filename ending to look for. it is handed to
            `str.endswith`, so a tuple of endings works as well.

    Returns:
        a list of matching paths, in the order in which `os.walk` visits
        them. if `starting_dir` is non-empty but is not an existing
        directory (for example because a filename was passed), a message is
        printed and None is returned instead. an empty `starting_dir` skips
        that check and gives an empty list.
    """
    # validate with an explicit check rather than `assert` inside a bare
    # `except`: asserts are stripped when python runs with -O (the check
    # would silently vanish), and a bare except also swallows things like
    # KeyboardInterrupt.
    if starting_dir:
        try:
            is_directory = os.path.isdir(starting_dir)
        except (TypeError, ValueError):
            # not something that can be interpreted as a path at all
            is_directory = False
        if not is_directory:
            print(starting_dir, 'is not a directory?')
            return None
    matching_files = []
    for current_dir, _directories, files in os.walk(starting_dir):
        for filename in files:
            if filename.endswith(file_ending):
                matching_files.append(os.path.join(current_dir, filename))
    return matching_files

def list_files_by_size(filepathlist, decreasing=True):
    """Return a new list with the paths in `filepathlist` ordered by file size.

    The size of each file is looked up with `os.path.getsize`. With
    `decreasing=True` (the default) the largest file comes first, otherwise
    the smallest. The list passed in is left unchanged.
    """
    ranked = list(filepathlist)
    ranked.sort(key=os.path.getsize, reverse=decreasing)
    return ranked

In [18]:
# test the solution: what happens if a filename gets passed instead of a directory?
start = 'first_exercise.md' # a file, not a directory
name_ending = '.cat'
print(scrape_directory_tree(start, name_ending)) # prints the message, then None

first_exercise.md is not a directory?
None


In [19]:
# test on the example directory tree:
start = 'exciting_data'
name_ending = '.cat' # find the cat file (there is exactly one, file9.cat)
print(scrape_directory_tree(start, name_ending))

['exciting_data/sub_2/subsub_3/thisisenough/file9.cat']


In [20]:
name_ending = '.doc' # find the docs (note that 'evil_file.dat.doc' counts as a doc)
print(scrape_directory_tree(start, name_ending))

['exciting_data/sub_2/subsub_3/thisisenough/file8.doc', 'exciting_data/sub_2/subsub_1/evil_file.dat.doc']


In [21]:
# the bonus: collect the .dat files and list them by decreasing file size
name_ending = '.dat'
filelist = scrape_directory_tree(start, name_ending)
filelist_by_size = list_files_by_size(filelist) # decreasing=True is the default
for filepath in filelist_by_size:
    print(filepath, os.path.getsize(filepath)) # path and size in bytes
# passed the tests: 'evil_file.dat.doc' is not listed and the sizes never increase.

exciting_data/sub_2/subsub_3/thisisenough/file7.dat 130
exciting_data/sub_1/file3.dat 65
exciting_data/sub_3/file5.dat 26
exciting_data/file2.dat 13
exciting_data/sub_2/file4.dat 13
exciting_data/sub_2/subsub_1/file6.dat 13
exciting_data/file1.dat 12


In [22]:
# how about more general patterns?
import re
def scrape_directory_tree(starting_dir, file_pattern):
    """Return the paths of all files below `starting_dir` matching `file_pattern`.

    The folder tree is traversed with `os.walk`; every filename for which
    `file_pattern.match(filename)` succeeds is collected as the walked
    directory joined with the filename.

    Parameters:
        starting_dir: path of the directory at which the search starts.
        file_pattern: a compiled regular expression (anything with a
            `match` method works). `match` only anchors at the start of the
            filename, so add `$` to the pattern to pin the ending too.

    Returns:
        a list of matching paths, in the order in which `os.walk` visits them.

    Raises:
        AssertionError: if `starting_dir` is not an existing directory
            (for example because a filename was passed).
    """
    # raise explicitly instead of using an `assert` statement: asserts are
    # stripped when python runs with -O, which would silently drop the check.
    # the exception type is kept so existing callers see no difference.
    if not os.path.isdir(starting_dir):
        raise AssertionError("%r is not a directory" % (starting_dir,))
    matching_files = []
    for current_dir, _directories, files in os.walk(starting_dir):
        for filename in files:
            if file_pattern.match(filename):
                matching_files.append(os.path.join(current_dir, filename))
    return matching_files

start = 'exciting_data'
# a raw string, so that the backslash reaches the regex engine untouched
# ("\." in a normal string literal is an invalid escape sequence)
pattern = re.compile(r"file[13579]\.dat$") # only odd numbered files

filelist = scrape_directory_tree(start, pattern)
filelist_by_size = list_files_by_size(filelist, decreasing=False) # smallest first
for filepath in filelist_by_size:
    print(filepath)

exciting_data/file1.dat
exciting_data/sub_3/file5.dat
exciting_data/sub_1/file3.dat
exciting_data/sub_2/subsub_3/thisisenough/file7.dat
