In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue Jan  09 14:28:35 2024

@author: trevizo

Read Excel file, multiple data sheets.
The first column in each data sheet has the date time
The second column has a value
The sheet names provide the name of each time series.
"""

'\nCreated on Tue Jan  09 14:28:35 2024\n\n@author: trevizo\n\nRead Excel file, multiple data sheets.\nThe first column in each data sheet has the date time\nThe second column has a value\nThe sheet names provide the name of each time series.\n'

# Load the libraries

In [2]:
import pandas as pd

# Load the data - Excel file containing multiple sheets with time series

Each sheet has a time series, two columns per sheet: The first column of the sheet is date time. The second column has numeric values.

In this case, we will use pd.read_excel() with sheet_name=None to return a dictionary. Each sheet name becomes a key of the dictionary, and the corresponding value is a dataframe holding the columns of that sheet.

In [3]:
# Read every sheet of the Excel file into a dictionary of dataframes keyed by
# sheet name; passing sheet_name=None is what makes read_excel return all sheets.
df_dict = pd.read_excel('../data/time_series_sheets.xlsx', sheet_name=None)
type(df_dict)

dict

# Build a dataframe

The sheet names will be the column names of our dataframe.

## Option 1: Init a dataframe and concat

In [4]:
# Collect the dictionary keys (the sheet names) in a list.
# They are used further down as the dataframe column names.
sheet_names = list(df_dict)
sheet_names

['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta']

In [5]:
# Initialize an empty dataframe; the sheets are concatenated onto it one at a time.
df = pd.DataFrame()

In [6]:
# Build df by concatenating the sheets one at a time.
# df is (re)initialized here so that re-running this cell rebuilds it from
# scratch instead of appending a second copy of every column.
df = pd.DataFrame()
for position, df_sheet in enumerate(df_dict.values()):
    # df_sheet is a dataframe with two columns: date and values.
    date_column = df_sheet.columns[0]

    # The dates are repeated as the first column of every sheet, so keep
    # them only from the first sheet and drop them from every later sheet.
    if position > 0:
        df_sheet = df_sheet.drop(columns=date_column)

    # Append the df_sheet dataframe to df, side by side.
    # NOTE: axis=1 aligns rows on the index, not on the dates, so this
    # assumes every sheet lists the same dates in the same order.
    df = pd.concat([df, df_sheet], axis=1)

In [7]:
df.head()

Unnamed: 0,day,counts,counts.1,counts.2,counts.3,counts.4,counts.5
0,2024-01-01,19692553,59502552,63580138,15151849,76677698,23673410
1,2024-01-02,22848115,71530775,75645548,17485422,89529332,25208023
2,2024-01-03,23298057,73475400,77714874,17328522,91287931,25731390
3,2024-01-04,23589747,74150311,78371008,17587027,92548950,26463460
4,2024-01-05,24015623,75702008,79805329,17576636,94881956,27281075


In [8]:
# Turn the dataframe into a time series by making its first column
# (the dates) the index.
df = df.set_index(df.columns[0])

In [9]:
df.head()

Unnamed: 0_level_0,counts,counts,counts,counts,counts,counts
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01,19692553,59502552,63580138,15151849,76677698,23673410
2024-01-02,22848115,71530775,75645548,17485422,89529332,25208023
2024-01-03,23298057,73475400,77714874,17328522,91287931,25731390
2024-01-04,23589747,74150311,78371008,17587027,92548950,26463460
2024-01-05,24015623,75702008,79805329,17576636,94881956,27281075


In [10]:
sheet_names

['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta']

In [11]:
# Assign sheet_names to the dataframe columns.
# The value columns were concatenated in the dictionary's sheet order, which is
# the same order as sheet_names, so the names line up by position.
df.columns = sheet_names


In [12]:
df.head()

Unnamed: 0_level_0,alpha,beta,gamma,delta,epsilon,zeta
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01,19692553,59502552,63580138,15151849,76677698,23673410
2024-01-02,22848115,71530775,75645548,17485422,89529332,25208023
2024-01-03,23298057,73475400,77714874,17328522,91287931,25731390
2024-01-04,23589747,74150311,78371008,17587027,92548950,26463460
2024-01-05,24015623,75702008,79805329,17576636,94881956,27281075


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 31 entries, 2024-01-01 to 2024-01-31
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   alpha    31 non-null     int64
 1   beta     31 non-null     int64
 2   gamma    31 non-null     int64
 3   delta    31 non-null     int64
 4   epsilon  31 non-null     int64
 5   zeta     31 non-null     int64
dtypes: int64(6)
memory usage: 1.7 KB


## Option 2: Init a dictionary and store dataframes in it

In [14]:
# Initialize an empty dictionary to store the dataframes that are built from each sheet.
# Each key is a sheet name and each value is the dataframe for that sheet.
sheet_dict = {}

In [15]:
# Iterate over each sheet in the dictionary that came from the Excel file
for position, (sheet_name, df_sheet) in enumerate(df_dict.items()):
    # df_sheet is a dataframe with two columns: date and values.
    date_column = df_sheet.columns[0]

    # The dates are repeated as the first column of every sheet, so keep
    # them only from the first sheet and drop them from every later sheet.
    if position > 0:
        df_sheet = df_sheet.drop(columns=date_column)

    # Store the dataframe with sheet name as key
    sheet_dict[sheet_name] = df_sheet

In [16]:
sheet_dict.keys()

dict_keys(['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta'])

In [17]:
# Place the per-sheet dataframes stored in the dictionary side by side
# in a single dataframe.
df = pd.concat(list(sheet_dict.values()), axis="columns")

In [18]:
df.head()

Unnamed: 0,day,counts,counts.1,counts.2,counts.3,counts.4,counts.5
0,2024-01-01,19692553,59502552,63580138,15151849,76677698,23673410
1,2024-01-02,22848115,71530775,75645548,17485422,89529332,25208023
2,2024-01-03,23298057,73475400,77714874,17328522,91287931,25731390
3,2024-01-04,23589747,74150311,78371008,17587027,92548950,26463460
4,2024-01-05,24015623,75702008,79805329,17576636,94881956,27281075


In [19]:
# Turn the dataframe into a time series by making its first column
# (the dates) the index.
df = df.set_index(df.columns[0])

In [20]:
df.head()

Unnamed: 0_level_0,counts,counts,counts,counts,counts,counts
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01,19692553,59502552,63580138,15151849,76677698,23673410
2024-01-02,22848115,71530775,75645548,17485422,89529332,25208023
2024-01-03,23298057,73475400,77714874,17328522,91287931,25731390
2024-01-04,23589747,74150311,78371008,17587027,92548950,26463460
2024-01-05,24015623,75702008,79805329,17576636,94881956,27281075


In [21]:
# Recall the sheet_names from above
sheet_names

['alpha', 'beta', 'gamma', 'delta', 'epsilon', 'zeta']

In [22]:
# Assign sheet_names to the dataframe columns.
# The dictionary was filled and concatenated in sheet order, which is the same
# order as sheet_names, so the names line up by position.
df.columns = sheet_names

In [23]:
# Inspect the results
df.head()

Unnamed: 0_level_0,alpha,beta,gamma,delta,epsilon,zeta
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-01-01,19692553,59502552,63580138,15151849,76677698,23673410
2024-01-02,22848115,71530775,75645548,17485422,89529332,25208023
2024-01-03,23298057,73475400,77714874,17328522,91287931,25731390
2024-01-04,23589747,74150311,78371008,17587027,92548950,26463460
2024-01-05,24015623,75702008,79805329,17576636,94881956,27281075


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 31 entries, 2024-01-01 to 2024-01-31
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   alpha    31 non-null     int64
 1   beta     31 non-null     int64
 2   gamma    31 non-null     int64
 3   delta    31 non-null     int64
 4   epsilon  31 non-null     int64
 5   zeta     31 non-null     int64
dtypes: int64(6)
memory usage: 1.7 KB
