# FILE EXTRACTION & CATEGORIZATION
#### by Matthew Gordon

### Step 1: Ingest data

In [1]:
import zipfile
import shutil
import os
import pandas as pd
from os import listdir
from os.path import isfile, join


Check whether the UCI dataset has already been unzipped (i.e. the HMP_Dataset folder exists); if it has not, unzip it.

In [10]:
# If the unzipped folder doesn't exist, open the zip file and
# extract it to the current directory. The context manager closes the
# archive (and its underlying file) even if extraction fails part-way.
if not os.path.exists(r'HMP_Dataset'):
    with zipfile.ZipFile('../ADL_Dataset.zip') as z:
        z.extractall()

Check whether a folder named Raw_Data already exists; if it doesn't, create it so that the individual activity files can be collated in it.

In [11]:
# Create the Raw_Data folder if it doesn't exist yet.
# exist_ok=True makes the call a no-op when the folder is already there, and
# it raises FileExistsError if something that is not a folder occupies the
# path, instead of silently carrying on as a bare existence check would.
newpath = r'../Raw_Data'
os.makedirs(newpath, exist_ok=True)

Walk through a folder and its subfolders, build a list of every file with its full file path, and return that list.

In [12]:
def get_filepaths(directory):
    """
    Collect the path of every file found under a directory tree.

    The tree rooted at `directory` is traversed with os.walk, and each
    file name is joined onto the folder it was found in. The paths are
    returned as a list of strings; the list is empty when no files are
    found.
    """
    collected = []

    # os.walk hands back (folder, sub-folder names, file names) for every
    # folder in the tree; only the file names are of interest here.
    for folder, _subfolders, names in os.walk(directory):
        collected.extend(os.path.join(folder, name) for name in names)

    return collected

In [13]:
# Search through the extracted folder and create a list of all files.
# os.path.join builds the path with the platform's own separator instead of
# a hard-coded '/'.
full_file_paths = get_filepaths(os.path.join(os.getcwd(), 'HMP_Dataset'))

Loop through the list and move every .txt file into the Raw_Data folder, unless its path marks it as a MODEL file. Once complete, delete the HMP_Dataset folder and the remaining files.

In [14]:
# Loop through the file list and move .txt files to the Raw_Data folder;
# the MODEL folder contains duplicates so anything with MODEL in its path is ignored
dst = '../Raw_Data'
df = pd.DataFrame(list(full_file_paths),columns=['filename'])
for f in df.filename.unique():
    if 'MODEL' in f or not f.endswith(".txt"):
        continue
    # Name the destination file explicitly. shutil.move into a directory raises
    # shutil.Error when a file of the same name is already there, which would
    # abort this cell whenever the archive is extracted again while Raw_Data is
    # still populated. Files already collected are left untouched.
    target = os.path.join(dst, os.path.basename(f))
    if os.path.exists(target):
        continue
    shutil.move(f, target)

In [15]:
# Remove the folder that came out of the .zip archive, along with whatever is
# still inside it; what remains there is only duplicates
shutil.rmtree(path='HMP_Dataset')

Turn the list of files saved in the Raw_Data folder into a pandas DataFrame of file names, organized by activity (one column per activity).

In [18]:
# Create a list of filenames in the mypath directory (sub-folders are skipped).
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make everything derived from this list reproducible from run to run.
mypath = '../Raw_Data/'

txt_file_list = sorted(f for f in listdir(mypath) if isfile(join(mypath, f)))

In [19]:
# Create a pandas dataframe where each column is the list of dataset filenames for one activity.
# We'll use this to conduct some summary statistics on the number of files for each activity
# and then to create a randomised train and test dataset for each activity.
# NOTE(review): the UCI HMP dataset appears to also contain a climb_stairs
# activity that is not listed here -- confirm the omission is intentional.
Activity_List = ['standup_chair', 'sitdown_chair', 'comb_hair', 'walk', 'descend_stairs', 'drink_glass',
                 'eat_meat', 'eat_soup', 'pour_water', 'liedown_bed', 'getup_bed', 'use_telephone', 'brush_teeth']

# Only filenames that contain 'Accelerometer' are considered. The filter does
# not depend on the activity, so it is applied once up front.
accelerometer_files = [f for f in txt_file_list if 'Accelerometer' in f]

# Build one column per activity holding every filename that contains the
# activity name, in the same order as txt_file_list.
activity_columns = [
    pd.Series([f for f in accelerometer_files if activity in f], name=activity, dtype=object)
    for activity in Activity_List
]

# Join all columns side by side in a single concat. Activities have different
# numbers of files, so shorter columns are padded with NaN.
dataset = pd.concat(activity_columns, axis=1)


Save the dataframe to the file MasterFileDF.csv for future use, to avoid having to re-run this notebook every time.

In [20]:
# output the filename dataframe to a csv to be re-used by other analysis
# steps to avoid needing to run this notebook every time.
# to_csv does not create missing folders, so make sure ../Data exists first.
os.makedirs('../Data', exist_ok=True)
dataset.to_csv('../Data/MasterFileDF.csv',index=False)