In [1]:
#7.1 Converting Strings to Dates

# Load libraries
import numpy as np
import pandas as pd 

# Raw date strings written day-month-year with a 12-hour clock
data_string = np.array([
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
])

# Parse every string into a Timestamp; the format string states that the
# day comes before the month and that the hour is followed by AM/PM
list(map(lambda raw: pd.to_datetime(raw, format='%d-%m-%Y %I:%M %p'), data_string))

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

In [2]:
# Same per-string conversion, additionally passing errors='coerce'
# (per the pandas docs, unparseable input then becomes NaT instead of raising)
list(map(
    lambda raw: pd.to_datetime(raw, format='%d-%m-%Y %I:%M %p', errors='coerce'),
    data_string,
))

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

In [3]:
#7.2 Handling Time Zones

# Create datetime
pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')

# Create datetime
data = pd.Timestamp('2017-05-01 06:00:00')

# Set time zone
data_in_london = data.tz_localize('Europe/London')

# Show datetime
print(data_in_london)

# Change time zone
data_in_london.tz_convert('Africa/Abidjan')

# Create three dates
datas = pd.Series(pd.date_range('2/2/2002', periods=3, freq='ME'))

datas.dt.tz_localize('Africa/Abidjan')

2017-05-01 06:00:00+01:00


0   2002-02-28 00:00:00+00:00
1   2002-03-31 00:00:00+00:00
2   2002-04-30 00:00:00+00:00
dtype: datetime64[ns, Africa/Abidjan]

In [4]:
# Bring in pytz's list of time zone names
from pytz import all_timezones

# Peek at the first two entries
all_timezones[:2]

['Africa/Abidjan', 'Africa/Accra']

In [5]:
# 7.3 Selecting Dates and Times

# One-column frame holding 100,000 hourly datetimes starting 2001-01-01
df = pd.DataFrame({'data': pd.date_range('1/1/2001', periods=100000, freq="h")})

# Boolean masks: strictly after 01:00, and up to and including 04:00
after_start = df['data'] > '2002-1-1 01:00:00'
up_to_end = df['data'] <= '2002-1-1 04:00:00'

# Keep only the rows that satisfy both conditions
df[after_start & up_to_end]

Unnamed: 0,data
8762,2002-01-01 02:00:00
8763,2002-01-01 03:00:00
8764,2002-01-01 04:00:00


In [6]:
# Use the datetime values as the index (the 'data' column itself is kept)
df = df.set_index(keys=df['data'])

# Slice by index label; with .loc both endpoints are part of the result
window_start = '2002-1-1 01:00:00'
window_end = '2002-1-1 04:00:00'
df.loc[window_start:window_end]

Unnamed: 0_level_0,data
data,Unnamed: 1_level_1
2002-01-01 01:00:00,2002-01-01 01:00:00
2002-01-01 02:00:00,2002-01-01 02:00:00
2002-01-01 03:00:00,2002-01-01 03:00:00
2002-01-01 04:00:00,2002-01-01 04:00:00


In [7]:
# 7.4 Breaking Up Date Data into Multiple Features

# Start from an empty frame
df = pd.DataFrame()

# Create 150 weekly dates
df['data'] = pd.date_range(start='1/1/2201', periods=150, freq="W")

# Add one column per date component, read from the .dt accessor
for part in ('year', 'month', 'day', 'hour', 'minute'):
    df[part] = getattr(df['data'].dt, part)

# Show the first three rows
df.head(n=3)

Unnamed: 0,data,year,month,day,hour,minute
0,2201-01-04,2201,1,4,0,0
1,2201-01-11,2201,1,11,0,0
2,2201-01-18,2201,1,18,0,0


In [8]:
# 7.5 Calculating the Difference Between Dates

# Frame with two datetime features, built in one step
df = pd.DataFrame({
    'Arrived': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')],
    'Left': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')],
})

# Row-wise duration between departure and arrival
df['Left'].sub(df['Arrived'])

0   0 days
1   2 days
dtype: timedelta64[ns]

In [9]:
#7.6 Encoding Days of the Week

# Create dates
dates = pd.Series(pd.date_range('2/2/2002', periods=3, freq='ME'))

# Show days of the week
dates.dt.day_name()

0    Thursday
1      Sunday
2     Tuesday
dtype: object

In [10]:
# Days of the week as integers, counted from Monday = 0
dates.dt.dayofweek

0    3
1    6
2    1
dtype: int32

In [11]:
# 7.7 Creating a Lagged Feature

# Five consecutive days with one price per day
df = pd.DataFrame({
    'dates': pd.date_range(start='1/1/2201', periods=5, freq='D'),
    'stock_price': [1.1, 2.2, 3.3, 4.4, 5.5],
})

# Shift prices down one row so each row also carries the prior row's price;
# the first row has no predecessor and is left missing
df['previous_days_stock_price'] = df['stock_price'].shift(periods=1)

# Display the full frame
df

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2201-01-01,1.1,
1,2201-01-02,2.2,1.1
2,2201-01-03,3.3,2.2
3,2201-01-04,4.4,3.3
4,2201-01-05,5.5,4.4


In [12]:
#7.8 Using Rolling Time Windows

# Create datetimes
time_index = pd.date_range('01/01/2010', periods=5, freq='ME')

# Create data frame, set index
df = pd.DataFrame(index=time_index)

# Create feature
df['Stock_Price'] = [1,2,3,4,5]

# Calculate rolling mean
df.rolling(window=2).mean()

Unnamed: 0,Stock_Price
2010-01-31,
2010-02-28,1.5
2010-03-31,2.5
2010-04-30,3.5
2010-05-31,4.5


In [13]:
#7.9 Handling Missing Data in Time Series

# Create date
time_index = pd.date_range('01/01/2010', periods=5, freq='ME')

# Create data frame, set index
df = pd.DataFrame(index=time_index)

# Create feature with a gap of missing values
df['Sales'] = [1.0, 2.0, np.nan, np.nan, 5.0]

# Interpolate missing values
df.interpolate()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


In [14]:
# Forward fill: carry the last known value down the rows into the gap
df.ffill(axis=0)

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


In [15]:
# Backfill: pull the next known value up the rows into the gap
df.bfill(axis=0)

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


In [16]:
# Interpolate the gap with a quadratic curve instead of a straight line
df.interpolate(method='quadratic', axis=0)

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.059808
2010-04-30,4.038069
2010-05-31,5.0


In [17]:
# Linear interpolation that fills at most one consecutive missing value,
# working forward; the rest of the gap stays missing
df.interpolate(method='linear', limit_direction='forward', limit=1)

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,
2010-05-31,5.0
