# **Part 3: File management**

## os

In [None]:
# Mount Google Drive so that its content is reachable under /content/gdrive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Import
import os
import glob

In [None]:
# Path of the folder (on the mounted Drive) that the following cells explore
my_folder = '/content/gdrive/MyDrive/DSTI'

In [None]:
# List the files and subfolders in a given folder                                                     os.listdir
# (os.listdir returns the bare entry names, not their full paths)
my_files_and_folders = os.listdir(my_folder)
my_files_and_folders

['A21-Simuation_GradesFinalResults.xlsx',
 'colab_exercise_data.csv',
 'BreastCancer']

In [None]:
# Join the my_folder path to the name of each file and subfolder to get the full paths                os.path.join
# Iterate over the names directly; no index bookkeeping is needed.
[os.path.join(my_folder, name) for name in my_files_and_folders]

['/content/gdrive/MyDrive/DSTI/A21-Simuation_GradesFinalResults.xlsx',
 '/content/gdrive/MyDrive/DSTI/colab_exercise_data.csv',
 '/content/gdrive/MyDrive/DSTI/BreastCancer']

In [None]:
# os.scandir yields directory entries whose .path attribute is already the full path                  os.scandir
my_files_and_folders = [entry.path for entry in os.scandir(my_folder)]
my_files_and_folders

['/content/gdrive/MyDrive/DSTI/A21-Simuation_GradesFinalResults.xlsx',
 '/content/gdrive/MyDrive/DSTI/colab_exercise_data.csv',
 '/content/gdrive/MyDrive/DSTI/BreastCancer']

In [None]:
# Full paths of the subfolders only: keep the entries for which is_dir() is true                      is_dir()
subfolders = [entry.path for entry in os.scandir(my_folder) if entry.is_dir()]
subfolders

['/content/gdrive/MyDrive/DSTI/BreastCancer']

In [None]:
# Full paths of the files only: keep the entries for which is_file() is true                          is_file()
my_files = [entry.path for entry in os.scandir(my_folder) if entry.is_file()]
my_files

['/content/gdrive/MyDrive/DSTI/A21-Simuation_GradesFinalResults.xlsx',
 '/content/gdrive/MyDrive/DSTI/colab_exercise_data.csv']

In [None]:
# List the xlsx files only (paths that finish with ".xlsx")                                           endswith()
# my_files holds the full paths of the files found in my_folder
[path for path in my_files if path.endswith('.xlsx')]

['/content/gdrive/MyDrive/DSTI/A21-Simuation_GradesFinalResults.xlsx']

In [None]:
# Count the number of .xlsx files in my_folder
# (each True from endswith counts as 1 when summed)
sum(path.endswith('.xlsx') for path in my_files)

1

In [None]:
# A function that returns all the files in the given formats
def list_of_files(my_path, formats):
  """Return the full paths of the files in my_path that end with one of the given formats.

  Only the entries directly inside my_path for which is_file() is true are
  considered; subfolders are neither listed nor searched.

  Args:
    my_path: path of the folder to scan.
    formats: a single suffix such as '.csv', or an iterable of suffixes such
      as ('.json', '.txt', '.xlsx') or ['.json', '.txt', '.xlsx'].

  Returns:
    A list of full paths, in the order os.scandir yields them. The list is
    empty when no file matches or when formats is empty.
  """
  # A bare string must be wrapped, not split: tuple('.csv') would give
  # ('.', 'c', 's', 'v') and match almost every file name.
  if isinstance(formats, str):
    formats = (formats,)
  # str.endswith accepts a tuple of suffixes but not a list
  formats = tuple(formats)
  # The with-block closes the directory iterator even if an error is raised
  with os.scandir(my_path) as entries:
    return [
        entry.path
        for entry in entries
        if entry.is_file() and entry.path.endswith(formats)
    ]

In [None]:
# Call the previous function on the /content/sample_data folder,
# keeping only the files that end with .csv or .json
p = '/content/sample_data'
f = ['.csv', '.json']

list_of_files(p,f)

['/content/sample_data/anscombe.json',
 '/content/sample_data/california_housing_train.csv',
 '/content/sample_data/mnist_test.csv',
 '/content/sample_data/mnist_train_small.csv',
 '/content/sample_data/california_housing_test.csv']

## glob2

In [None]:
import glob2
import itertools

In [None]:
# Return all the csv files in the my_folder folder
# (the pattern "*.csv" is joined to the folder path and handed to glob2.glob)
extensions = "*.csv"
filenames = glob2.glob(os.path.join(my_folder, extensions))
filenames

['/content/gdrive/MyDrive/DSTI/colab_exercise_data.csv']

In [None]:
# Trying to do this with multiple extensions: one glob pattern per extension
extensions = ["*.csv", "*.json"]

In [None]:
# Run one glob per pattern; the result is a list of lists (one inner list per pattern)
filenames = [glob2.glob(os.path.join(p, pattern)) for pattern in extensions]
filenames

[['/content/sample_data/california_housing_train.csv',
  '/content/sample_data/mnist_test.csv',
  '/content/sample_data/mnist_train_small.csv',
  '/content/sample_data/california_housing_test.csv'],
 ['/content/sample_data/anscombe.json']]

In [None]:
# Flatten the list of lists into a single flat list of paths
# (filenames must be a list of lists here: chaining a flat list of strings splits them into characters)
list(itertools.chain.from_iterable(filenames))

['/content/sample_data/california_housing_train.csv',
 '/content/sample_data/mnist_test.csv',
 '/content/sample_data/mnist_train_small.csv',
 '/content/sample_data/california_housing_test.csv',
 '/content/sample_data/anscombe.json']