In [1]:
import pandas as pd
import os
import numpy as np

In [None]:
"""Our goal is to build one big pandas data frame out of the information stored in three "ready to use" data files I have
arbitrarily made up.  The three "good ones" start with "demo" and a number in the filename and end with today's date,
to indicate when they were generated. This stupid example illustrates some tools for working with filenames and files that
I find helpful for organizing data files and creating a cleanish workflow."""

In [2]:
# ----- SET UP FILES TO WORK ON -----

#check that all metadata files are in order - write a function that will check this before you begin

# tell the computer where the files are hidden -- you will need to update this for your path
data_dir = os.path.join('C:\\','Users','nhogl','Documents','GitHub','ppt-lanes','file-handling')

# get a list of all the files in this directory, including undesirable ones
dir_list = os.listdir(data_dir)
print(dir_list)

# list the files you want
all_files = [] # make an empty string to hold all the paths

# This next command has a lot packed into it:
# -- 1. the += adds the result of the bracketed command to the list
# -- 2. the os.path.join(data_dir, each) joins the path to the file name, with each as a blind variable on this line
# -- 3. for each loops over the variable "each" which exists only on this line, this works like a for loop
# -- 4. os.listdir(data_dir) makes an iterable object for each file in the specified directory
# -- 5. if each.endswith does an if loop, assumes the objects represented by each are strings, and returns true if the
#       last characters in the object match the specified ones

# This single line is equivalent to code reading:
# for each in os.listdir(data_dir):
#    if each.endswith('20231023.csv'):
#        all_files.append(os.path.join(data_dir,each))

all_files += [os.path.join(data_dir,each) for each in os.listdir(data_dir) if each.endswith('20231023.csv')]
all_files

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\nhogl\\Documents\\GitHub\\ppt-lanes\\file-handling'

In [29]:
# ----- COLLATE DATA AND LABEL WITH INFO FROM FILENAME -----

# make empty lists to keep your records in -- requires you to know what you're loading up
all_days = []
all_months = []
all_dates = []
all_demos = []

# loop over the files and extract the information you care about
for file in all_files:
# this assumes that all_files exists and only has elements that you expect
# the code will of course break if all_files is something else, which you can test for yourself
    
    # break up the filename for metadata
    demo = int(file.split('_')[0][-1]) # use string split and your knowledge of the filename format
    
    # read the csv file as a pandas dataframe
    thisdf = pd.read_csv(os.path.join(data_dir,file))
    
    # collect data from dataframe and add to the empty lists we already made
    # this loops through all of the values in the current dataframe and adds them to our lists for subsequent conversion
    all_days += [each for each in thisdf.weekday]
    all_months += [each for each in thisdf.month]
    all_dates += [each for each in thisdf.day]
    all_demos += [int(each) for each in demo*np.ones(len(thisdf.weekday))]
    
# collate all the data into a dataframe for subsequent use
groupdata = {'day':all_days,'month':all_months,'date':all_dates,'demo':all_demos}
gd = pd.DataFrame(data=groupdata)
gd

Unnamed: 0,day,month,date,demo
0,Monday,October,23,1
1,Tuesday,October,24,1
2,Wednesday,October,25,1
3,Thursday,October,26,1
4,Friday,October,27,1
5,Saturday,October,28,1
6,Sunday,October,29,1
7,Monday,October,23,2
8,Tuesday,October,24,2
9,Wednesday,October,25,2


In [18]:
# ----- CODE TO GENERATE THE DEMO FILES -----
# do not run; I included it just in case seeing it ever comes in handy

# make a new subdirectory for our demo files
mkdir = './/file-handling'
if not os.path.exists(mkdir):
    os.makedirs(mkdir)

# make some demo data
days_of_week = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
month = ['October']*7
dates = 23*np.ones(7)+range(0,7)
dates = dates.astype(int)
d = {'weekday':days_of_week,'month':month,'day':dates}
df = pd.DataFrame(data=d)

# save files
fnames = ['demo_data_old.csv',
             'demo_data_old2.csv',
             'demodataold.csv',
             'demo1_20231023.csv',
             'demo2_20231023.csv',
             'demo3_20231023.csv']

for fn in fnames:
    df.to_csv(os.path.join(mkdir,fn))