# Pandas - Working with Time Series Date Data

In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

# Build a DatetimeIndex with one timestamp per hour from 1 Nov 2019 to 7 Nov 2019
# (both end points are included). ISO 8601 date strings avoid any day/month
# ambiguity, and 'h' is the hourly alias that replaces 'H', deprecated in pandas 2.2.
df = pd.date_range(start='2019-11-01', end='2019-11-07', freq='h')
df

DatetimeIndex(['2019-11-01 00:00:00', '2019-11-01 01:00:00',
               '2019-11-01 02:00:00', '2019-11-01 03:00:00',
               '2019-11-01 04:00:00', '2019-11-01 05:00:00',
               '2019-11-01 06:00:00', '2019-11-01 07:00:00',
               '2019-11-01 08:00:00', '2019-11-01 09:00:00',
               ...
               '2019-11-06 15:00:00', '2019-11-06 16:00:00',
               '2019-11-06 17:00:00', '2019-11-06 18:00:00',
               '2019-11-06 19:00:00', '2019-11-06 20:00:00',
               '2019-11-06 21:00:00', '2019-11-06 22:00:00',
               '2019-11-06 23:00:00', '2019-11-07 00:00:00'],
              dtype='datetime64[ns]', length=145, freq='H')

**NOTE:** pandas displays timestamps in ISO 8601 format
- yyyy-mm-dd hh:mm:ss

In [2]:
# Number of timestamps: 6 full days x 24 hours, plus the inclusive end point
len(df)

145

In [3]:
# date_range returned a DatetimeIndex, not a DataFrame or a Series
type(df)

pandas.core.indexes.datetimes.DatetimeIndex

In [4]:
# Wrap the DatetimeIndex in a DataFrame with a single 'date' column
df = pd.DataFrame(df, columns=['date'])

# And add a 'made up' column for sales data (integers from 0 to 999).
# A seeded Generator yields the same values on every run, so
# Restart & Run All reproduces the same table.
rng = np.random.default_rng(42)
df['sales'] = rng.integers(0, 1000, size=len(df))
df.head()

Unnamed: 0,date,sales
0,2019-11-01 00:00:00,911
1,2019-11-01 01:00:00,559
2,2019-11-01 02:00:00,939
3,2019-11-01 03:00:00,517
4,2019-11-01 04:00:00,89


# Selecting using dates

In [5]:
# Set the 'date' column as the index so rows can be selected by timestamp
df = df.set_index('date')
df.head()

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2019-11-01 00:00:00,911
2019-11-01 01:00:00,559
2019-11-01 02:00:00,939
2019-11-01 03:00:00,517
2019-11-01 04:00:00,89


## Using .loc[] to index specific dates

In [6]:
# Selecting a single cell: row label (timestamp string) plus column name
df.loc['2019-11-01 03:00:00', 'sales']

517

In [7]:
# Selecting by a full timestamp alone returns the matching row as a Series
df.loc['2019-11-01 03:00:00']

sales    517
Name: 2019-11-01 03:00:00, dtype: int64

In [8]:
# Selecting an entire day: a date-only string matches the rows on that date
df.loc['2019-11-01']

# Similarly, you can use df.loc['2019-11'] to select an entire month

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2019-11-01 00:00:00,911
2019-11-01 01:00:00,559
2019-11-01 02:00:00,939
2019-11-01 03:00:00,517
2019-11-01 04:00:00,89
2019-11-01 05:00:00,987
2019-11-01 06:00:00,135
2019-11-01 07:00:00,509
2019-11-01 08:00:00,687
2019-11-01 09:00:00,182


In [9]:
# Selecting a range of dates with a label slice
# NOTE(review): .loc label slices are documented as including the end label,
# so the rows for 2019-11-02 should be part of the result - confirm on the full output
df.loc['2019-11-01':'2019-11-02']

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2019-11-01 00:00:00,911
2019-11-01 01:00:00,559
2019-11-01 02:00:00,939
2019-11-01 03:00:00,517
2019-11-01 04:00:00,89
2019-11-01 05:00:00,987
2019-11-01 06:00:00,135
2019-11-01 07:00:00,509
2019-11-01 08:00:00,687
2019-11-01 09:00:00,182


# Resampling

**Summary statistics** - we can apply statistical methods over different time intervals
- mean(), sum(), count(), min(), max()

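**Down-sampling**
- reduce the number of datetime rows by moving to a lower frequency (longer interval, e.g. hourly to daily)

**Up-sampling**
- increase the number of datetime rows by moving to a higher frequency (shorter interval, e.g. hourly to minutely)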

In [10]:
# Down-sample the hourly data to one row per day, averaging the sales within each day

df.resample('D').mean()
# Swap .mean() for .sum() to get daily totals instead: df.resample('D').sum()

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2019-11-01,510.75
2019-11-02,528.375
2019-11-03,470.5
2019-11-04,429.791667
2019-11-05,501.208333
2019-11-06,467.083333
2019-11-07,448.0


In [11]:
# Up-sample from hourly to one row per minute. Only timestamps present in the
# hourly data carry a value; the newly created minutes have no data and show NaN.
# 'min' is used instead of the alias 'T', which is deprecated since pandas 2.2.

df.resample('min').mean().head()

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2019-11-01 00:00:00,911.0
2019-11-01 00:01:00,
2019-11-01 00:02:00,
2019-11-01 00:03:00,
2019-11-01 00:04:00,


## Resampling frequencies

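- 'min' - minute ('T' is an older alias, deprecated since pandas 2.2)
- 'H' - hour (spelled 'h' from pandas 2.2 onwards)
- 'D' - day
- 'B' - business day
- 'W' - week
- 'M' - month end ('ME' from pandas 2.2 onwards)
- 'Q' - quarter end ('QE' from pandas 2.2 onwards)
- 'A' - year end ('YE' from pandas 2.2 onwards)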

# Parsing dates

In [12]:
# Two example dates, with each component held in its own integer column
df = pd.DataFrame(dict(year=[2015, 2016], month=[2, 3], day=[4, 5]))
df

Unnamed: 0,year,month,day
0,2015,2,4
1,2016,3,5


In [13]:
# Summarise the frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   year    2 non-null      int64
 1   month   2 non-null      int64
 2   day     2 non-null      int64
dtypes: int64(3)
memory usage: 176.0 bytes


In [14]:
# Assemble the 'year', 'month' and 'day' columns into a single datetime Series
pd.to_datetime(df)

0   2015-02-04
1   2016-03-05
dtype: datetime64[ns]

In [15]:
# Parse one ISO 8601 date string using an explicit format.
# errors='ignore' is dropped: it is deprecated since pandas 2.2 and, on a bad
# input, silently handed back the unparsed value. The default (errors='raise')
# makes a parsing failure visible instead.
parsed_date = pd.to_datetime('2019-01-01', format='%Y-%m-%d')
parsed_date

Timestamp('2019-01-01 00:00:00')