# Load multiple `csv` files with Pandas

## Imports

In [1]:
from pathlib import Path

import pandas as pd

## Get paths to all `.csv` files

In [2]:
# Path.glob is lazy: this only builds a generator over the `.csv` entries
# matching the pattern inside datasets/data_1; nothing is listed until the
# generator is consumed.
paths = Path("datasets/data_1").glob("*.csv")

In [3]:
paths

<generator object Path.glob at 0x7df60559df10>

In [4]:
# Materialise the generator so the paths can be indexed and iterated more
# than once. NOTE: the resulting order is not sorted by file name (the
# listing below shows wk1, wk4, wk3, wk2).
paths = list(paths)

In [5]:
paths

[PosixPath('datasets/data_1/wk1.csv'),
 PosixPath('datasets/data_1/wk4.csv'),
 PosixPath('datasets/data_1/wk3.csv'),
 PosixPath('datasets/data_1/wk2.csv')]

## Read files

In [6]:
# Load every CSV into its own DataFrame: the first column becomes the index
# and is parsed as day-first dates.
temp = [
    pd.read_csv(csv_path, index_col=0, parse_dates=[0], dayfirst=True)
    for csv_path in paths
]

In [7]:
len(temp)

4

### Preview 1st file

In [8]:
temp[0]

Date,ebook_download,course_purchase,newsletter_signup
2024-03-04 12:00:00,19,11,9
2024-03-05 12:00:00,11,8,5
2024-03-06 12:00:00,12,9,4
2024-03-07 12:00:00,6,14,18
2024-03-08 12:00:00,18,2,12


## Merge files into one `DataFrame`

In [9]:
# Stack the per-file frames on top of each other (row-wise) into one table
df = pd.concat(objs=temp, axis="index")

In [10]:
df.columns

Index(['ebook_download', 'course_purchase', 'newsletter_signup'], dtype='object')

In [11]:
df

Date,ebook_download,course_purchase,newsletter_signup
2024-03-04 12:00:00,19,11,9
2024-03-05 12:00:00,11,8,5
2024-03-06 12:00:00,12,9,4
2024-03-07 12:00:00,6,14,18
2024-03-08 12:00:00,18,2,12
2024-03-25 12:00:00,17,19,5
2024-03-26 12:00:00,5,19,2
2024-03-27 12:00:00,13,3,6
2024-03-28 12:00:00,16,14,18
2024-03-29 12:00:00,2,12,12


## Add a *source* column

### Use the `Path.name` attribute to get a file name

In [12]:
paths

[PosixPath('datasets/data_1/wk1.csv'),
 PosixPath('datasets/data_1/wk4.csv'),
 PosixPath('datasets/data_1/wk3.csv'),
 PosixPath('datasets/data_1/wk2.csv')]

In [13]:
paths[0]

PosixPath('datasets/data_1/wk1.csv')

In [14]:
paths[0].name

'wk1.csv'

### Read and merge data

In [15]:
# Read each file and tag its rows with the name of the file they came from
temp = [
    pd.read_csv(
        csv_path,
        index_col=0,
        parse_dates=[0],
        dayfirst=True,
    ).assign(source=csv_path.name)
    for csv_path in paths
]

# Stack the per-file frames row-wise into a single DataFrame
df = pd.concat(temp, axis="index")

# Display the result
df

Date,ebook_download,course_purchase,newsletter_signup,source
2024-03-04 12:00:00,19,11,9,wk1.csv
2024-03-05 12:00:00,11,8,5,wk1.csv
2024-03-06 12:00:00,12,9,4,wk1.csv
2024-03-07 12:00:00,6,14,18,wk1.csv
2024-03-08 12:00:00,18,2,12,wk1.csv
2024-03-25 12:00:00,17,19,5,wk4.csv
2024-03-26 12:00:00,5,19,2,wk4.csv
2024-03-27 12:00:00,13,3,6,wk4.csv
2024-03-28 12:00:00,16,14,18,wk4.csv
2024-03-29 12:00:00,2,12,12,wk4.csv


## Control column names

### Use 2nd dataset

In [16]:
# Collect the `.csv` files of the 2nd dataset into a list, then display it
paths = list(Path("datasets/data_2").glob("*.csv"))
paths

[PosixPath('datasets/data_2/wk1.csv'), PosixPath('datasets/data_2/wk2.csv')]

### Read and merge data *"as is"*

In [17]:
temp = []

# Read the files exactly as they are: each keeps its own header names
for csv_path in paths:
    frame = pd.read_csv(csv_path, index_col=0, parse_dates=[0], dayfirst=True)

    # Remember which file the rows came from
    frame["source"] = csv_path.name

    temp.append(frame)

# Stack the frames row-wise; columns are aligned by name, so headers that
# differ between files end up as separate columns (see the output below)
df = pd.concat(temp, axis="index")

# Display the result
df

Date,ebook_download,course_purchase,newsletter_signup,source,ebook_downloads
2024-02-05 12:00:00,16.0,11,5,wk1.csv,
2024-02-06 12:00:00,13.0,16,9,wk1.csv,
2024-02-07 12:00:00,16.0,7,4,wk1.csv,
2024-02-08 12:00:00,13.0,8,14,wk1.csv,
2024-02-09 12:00:00,4.0,11,17,wk1.csv,
2024-02-12 12:00:00,,7,16,wk2.csv,10.0
2024-02-13 12:00:00,,17,3,wk2.csv,15.0
2024-02-14 12:00:00,,8,8,wk2.csv,1.0
2024-02-15 12:00:00,,17,5,wk2.csv,11.0
2024-02-16 12:00:00,,8,1,wk2.csv,17.0


### Read and merge data using common column names

In [18]:
# Read each file with one common set of column names and tag its rows with
# the name of the file they came from
temp = [
    pd.read_csv(
        csv_path,
        header=0,  # the file's own header row is replaced by `names`
        names=["dates", "ebooks", "courses", "newsletters"],
        index_col=0,
        parse_dates=[0],
        dayfirst=True,
    ).assign(source=csv_path.name)
    for csv_path in paths
]

# Stack the per-file frames row-wise into a single DataFrame
df = pd.concat(temp, axis="index")

# Display the result
df

dates,ebooks,courses,newsletters,source
2024-02-05 12:00:00,16,11,5,wk1.csv
2024-02-06 12:00:00,13,16,9,wk1.csv
2024-02-07 12:00:00,16,7,4,wk1.csv
2024-02-08 12:00:00,13,8,14,wk1.csv
2024-02-09 12:00:00,4,11,17,wk1.csv
2024-02-12 12:00:00,10,7,16,wk2.csv
2024-02-13 12:00:00,15,17,3,wk2.csv
2024-02-14 12:00:00,1,8,8,wk2.csv
2024-02-15 12:00:00,11,17,5,wk2.csv
2024-02-16 12:00:00,17,8,1,wk2.csv


## Read data from folders and subfolders

### Use 3rd dataset

In [19]:
# NOTE: despite the plural name, `paths` now holds a single directory Path
# (the root folder of the 3rd dataset), not a list of files.
paths = Path("datasets/data_3")
paths

PosixPath('datasets/data_3')

In [20]:
# rglob is lazy: it returns a generator, nothing is listed yet
paths.rglob("*")

<generator object Path.rglob at 0x7df5bf389e00>

In [21]:
# list all items recursively: files and sub-folders, including nested ones
list(paths.rglob("*"))

[PosixPath('datasets/data_3/more_data'),
 PosixPath('datasets/data_3/wk1.csv'),
 PosixPath('datasets/data_3/wk4.csv'),
 PosixPath('datasets/data_3/wk3.csv'),
 PosixPath('datasets/data_3/wk2.csv'),
 PosixPath('datasets/data_3/more_data/wk1.csv'),
 PosixPath('datasets/data_3/more_data/wk2.csv')]

In [22]:
# list all items ending with `.csv`, including those inside sub-folders
list(paths.rglob("*.csv"))

[PosixPath('datasets/data_3/wk1.csv'),
 PosixPath('datasets/data_3/wk4.csv'),
 PosixPath('datasets/data_3/wk3.csv'),
 PosixPath('datasets/data_3/wk2.csv'),
 PosixPath('datasets/data_3/more_data/wk1.csv'),
 PosixPath('datasets/data_3/more_data/wk2.csv')]