In [1]:
## Converting Strings to Dates

# Load libraries
import numpy as np
import pandas as pd

# Date strings written day-month-year with a 12-hour clock
date_strings = np.array([
    '03-04-2005 11:35 PM',
    '23-05-2010 12:01 AM',
    '04-09-2009 09:09 PM',
])

# Parse every string into a Timestamp using an explicit format
[pd.to_datetime(raw_date, format='%d-%m-%Y %I:%M %p') for raw_date in date_strings]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

In [2]:
# Convert to datetimes; errors="coerce" turns any string that does not match
# the format into NaT instead of raising an exception
[
    pd.to_datetime(raw_date, format="%d-%m-%Y %I:%M %p", errors="coerce")
    for raw_date in date_strings
]

[Timestamp('2005-04-03 23:35:00'),
 Timestamp('2010-05-23 00:01:00'),
 Timestamp('2009-09-04 21:09:00')]

In [4]:
### Handling Time Zones
# Load library
import pandas as pd
# Create a time-zone-aware datetime by passing tz at construction time
pd.Timestamp('2017-05-01 06:00:00', tz='Europe/London')


Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [5]:
## A time zone can be attached to an existing datetime using tz_localize:

# Datetime created without any time zone information
date = pd.Timestamp('2017-05-01 06:00:00')

# Attach the London time zone; the wall-clock time stays 06:00
date_in_london = date.tz_localize(tz='Europe/London')

# Show datetime
date_in_london

Timestamp('2017-05-01 06:00:00+0100', tz='Europe/London')

In [6]:
# Change time zone: same instant, expressed in the target zone's local time
date_in_london.tz_convert('Africa/Abidjan')

Timestamp('2017-05-01 05:00:00+0000', tz='Africa/Abidjan')

In [7]:
## Breaking Up Date Data into Multiple Features

# Load library
import pandas as pd

# Create a data frame holding 150 weekly dates. freq='W' anchors each date on
# a Sunday, so the range starts on the first Sunday on or after 1/1/2001.
dataframe = pd.DataFrame({'date': pd.date_range('1/1/2001', periods=150, freq='W')})

dataframe

Unnamed: 0,date
0,2001-01-07
1,2001-01-14
2,2001-01-21
3,2001-01-28
4,2001-02-04
...,...
145,2003-10-19
146,2003-10-26
147,2003-11-02
148,2003-11-09


In [8]:
# Derive year, month, day, hour, and minute features from the date column,
# reusing a single .dt accessor for all five lookups
date_parts = dataframe['date'].dt
dataframe['year'] = date_parts.year
dataframe['month'] = date_parts.month
dataframe['day'] = date_parts.day
dataframe['hour'] = date_parts.hour
dataframe['minute'] = date_parts.minute


In [9]:
# Show the first eight rows
dataframe.head(8)

Unnamed: 0,date,year,month,day,hour,minute
0,2001-01-07,2001,1,7,0,0
1,2001-01-14,2001,1,14,0,0
2,2001-01-21,2001,1,21,0,0
3,2001-01-28,2001,1,28,0,0
4,2001-02-04,2001,2,4,0,0
5,2001-02-11,2001,2,11,0,0
6,2001-02-18,2001,2,18,0,0
7,2001-02-25,2001,2,25,0,0


In [10]:
## Calculating the Difference Between Dates

# Load library
import pandas as pd

# Build the data frame in one step with its two datetime features
dataframe = pd.DataFrame({
    'Arrived': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-04-2017')],
    'Left': [pd.Timestamp('01-01-2017'), pd.Timestamp('01-06-2017')],
})
dataframe

Unnamed: 0,Arrived,Left
0,2017-01-01,2017-01-01
1,2017-01-04,2017-01-06


In [11]:
# Calculate duration between features (element-wise; yields timedelta values)
dataframe['Left'] - dataframe['Arrived']

0   0 days
1   2 days
dtype: timedelta64[ns]

In [12]:
## Often we will want to remove the "days" unit and keep only the numerical value:

# Take the whole-day component of each duration with the vectorized .dt
# accessor instead of looping over the timedeltas in Python; this also keeps
# the original row index on the result
(dataframe['Left'] - dataframe['Arrived']).dt.days

0    0
1    2
dtype: int64

In [14]:
# Encoding Days of the Week

# Load library
import pandas as pd

# Create three month-end dates. The frequency is given as an offset object
# because the "M" string alias is deprecated since pandas 2.2 (in favour of
# "ME"), whereas pd.offsets.MonthEnd() is accepted by old and new releases alike.
dates = pd.Series(pd.date_range("2/2/2002", periods=3, freq=pd.offsets.MonthEnd()))
dates


0   2002-02-28
1   2002-03-31
2   2002-04-30
dtype: datetime64[ns]

In [16]:
 # Show days of the week as integers (Monday=0 ... Sunday=6)
dates.dt.weekday

0    3
1    6
2    1
dtype: int64

In [17]:
#### Creating a Lagged Feature

# Load library
import pandas as pd

# Create an empty data frame; its columns are added in a later cell
dataframe = pd.DataFrame()

In [19]:
# Create data: five consecutive daily dates and a matching price series
dataframe["dates"] = pd.date_range(start="1/1/2001", periods=5, freq="D")
dataframe["stock_price"] = [1.1, 2.2, 3.3, 4.4, 5.5]
dataframe

Unnamed: 0,dates,stock_price
0,2001-01-01,1.1
1,2001-01-02,2.2
2,2001-01-03,3.3
3,2001-01-04,4.4
4,2001-01-05,5.5


In [20]:
# Lag the price by one row; the first row has no predecessor, so it is left missing
price_series = dataframe["stock_price"]
dataframe["previous_days_stock_price"] = price_series.shift(periods=1)

# Show data frame
dataframe

Unnamed: 0,dates,stock_price,previous_days_stock_price
0,2001-01-01,1.1,
1,2001-01-02,2.2,1.1
2,2001-01-03,3.3,2.2
3,2001-01-04,4.4,3.3
4,2001-01-05,5.5,4.4


In [21]:
### Handling Missing Data in Time Series

# Load libraries
import pandas as pd
import numpy as np

# Create five month-end dates. The frequency is given as an offset object
# because the "M" string alias is deprecated since pandas 2.2 (in favour of
# "ME"), whereas pd.offsets.MonthEnd() is accepted by old and new releases alike.
time_index = pd.date_range("01/01/2010", periods=5, freq=pd.offsets.MonthEnd())
time_index

DatetimeIndex(['2010-01-31', '2010-02-28', '2010-03-31', '2010-04-30',
               '2010-05-31'],
              dtype='datetime64[ns]', freq='M')

In [23]:
# Create an empty data frame that uses the dates in time_index as its row index
dataframe = pd.DataFrame(index=time_index)
dataframe

2010-01-31
2010-02-28
2010-03-31
2010-04-30
2010-05-31


In [25]:
# Create feature with a gap of two consecutive missing values
dataframe["Sales"] = [1.0,2.0,np.nan,np.nan,5.0]
dataframe

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,
2010-04-30,
2010-05-31,5.0


In [26]:
# Interpolate missing values with the default method, which draws a straight
# line between the known values on either side of the gap
dataframe.interpolate()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,4.0
2010-05-31,5.0


In [27]:
# Forward-fill: carry the last known value forward into the gap
dataframe.ffill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,2.0
2010-04-30,2.0
2010-05-31,5.0


In [28]:
# Back-fill: pull the next known value backward into the gap
dataframe.bfill()

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,5.0
2010-04-30,5.0
2010-05-31,5.0


In [29]:
# Interpolate missing values when we assume the line is quadratic
# NOTE(review): pandas documents non-linear methods such as "quadratic" as
# SciPy-backed — confirm SciPy is installed in the target environment
dataframe.interpolate(method="quadratic")

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.059808
2010-04-30,4.038069
2010-05-31,5.0


In [32]:
# Interpolate at most one consecutive missing value, filling in the forward
# direction; the rest of the gap is left missing
dataframe.interpolate(limit=1, limit_direction="forward")

Unnamed: 0,Sales
2010-01-31,1.0
2010-02-28,2.0
2010-03-31,3.0
2010-04-30,
2010-05-31,5.0
