In [None]:
#import modules
import numpy as np
import pandas as pd

# The Time Series
In this lecture, we'll talk about 4 things:
* What is a time series?
* How does pandas deal with dates/times?
* The Special Time Series Functions (that technically can be used elsewhere)
* The multiindex (which is especially useful with time series)

# What is a time series?


In [None]:
# An example: read the rainfall CSV with default settings and display the result
pd.read_csv('Rainfall.csv')

In [None]:
# Exercise: make 'Date' the index column and read the file in as `data`
# `data` is the frame the rainfall cells below keep working on
data = pd.read_csv('Rainfall.csv',index_col='Date')
data

In [None]:
# Exercise: what is the data type of the index column?
# Hint: similar to numpy notation
data.index.dtype

# Making pandas understand we have a date/time

Our index is not of the datetime type, so we don't get access to the special datetime operators.
Here is how to convert it.

In [None]:
# Parse the index labels as datetimes; this only displays the converted index, `data` is not modified
pd.to_datetime(data.index)

In [None]:
# Exercise: set your index to be the datetime version of your index
# Assigning to data.index swaps the labels on the existing frame, so `data` changes from this cell on
data.index = pd.to_datetime(data.index)
data

## Working with date times
Why would we care about doing this? Python/pandas makes math operations with dates very easy. Check these out.

In [None]:
# The index itself, now holding datetimes rather than plain labels
data.index

In [None]:
# Earliest timestamp in the index
data.index.min()

In [None]:
# Day-of-month number of every timestamp in the index
data.index.day

In [None]:
# Month number of every timestamp in the index
data.index.month

In [None]:
# Format every timestamp with a strftime pattern; '%A' is the full weekday name
data.index.strftime('%A')

https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior

## Math With Dates

In [None]:
# Subtract the first timestamp from every timestamp in the index
data.index - data.index[0]
# yields time deltas
# This could be more useful if you have some experiment and it's the number of days running that matters

In [None]:
# Can do arithmetic with this object: here every elapsed-time value is doubled
(data.index - data.index[0])*2

## Range for dates


In [None]:
# Inclusive on both sides: the start date and the end date both appear in the result
pd.date_range('2015-07-03', '2015-07-10')

In [None]:
# Day by default: with no freq given, periods=8 produces 8 consecutive days from the start date
pd.date_range('2015-07-03', periods=8)

In [None]:
# What if we wanted to do every hour?
# Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is deprecated since pandas 2.2
pd.date_range('2015-07-03',periods=8,freq='h')

In [None]:
# Exercise: get me the number of minutes the government shutdown has been going on for
# Hint: it started on December 22 at midnight and is going on until now
# Hint2: you can google around to see how to get the current time in python
import datetime

# Midnight at the start of December 22, 2018
shutdown_start = pd.Timestamp('2018-12-22')
# pd.datetime was removed in pandas 2.0, so read the clock from the stdlib module directly
currentDT = datetime.datetime.now()
# Whole minutes elapsed. Dividing the time delta counts intervals; the previous
# len(pd.date_range(...)) counted timestamps (one more than the intervals between them)
# and had to build one timestamp per minute just to count them.
minutes_elapsed = (pd.Timestamp(currentDT) - shutdown_start) // pd.Timedelta(minutes=1)
minutes_elapsed

# Special Datetime Functions

### diff

What if we were interested in the change in rainfall day over day?

In [None]:
# Each row minus the row before it (default periods=1); the first row has nothing to subtract, so it is NaN
data.diff()

In [None]:
# Exercise: look into the documentation of diff to get the difference of today and a week before today
# periods=7 subtracts the value 7 rows earlier (a week only if there is one row per day — TODO confirm)
data.diff(7)

In [None]:
# Exercise: use diff to get the difference of today and a week after today (i.e. looking into the future)
# A negative period subtracts a later row instead of an earlier one
data.diff(-7)

### Note: We can think of diff (with periods=1) as a discretized form of the derivative

In [None]:
# Exercise: how could we get a discretized form of the second derivative?
# Differencing the differences gives the second-order difference
data.diff().diff()

### shift

In [None]:
# Move every value down one row (default periods=1); the index stays put and the first row becomes NaN
data.shift()

In [None]:
# Move every value down two rows; the first two rows become NaN
data.shift(2)

In [None]:
# Why might you want to use shift?
# Exercise: find a 3 day rolling mean using shift
# Hint: you may need to use a negative number in your shift function
# Averages each row with the row before it (shift()) and the row after it (shift(-1))
(data + data.shift() + data.shift(-1))/3

## Cumulative Functions

How would we get the total amount of rain so far each day?

In [None]:
# Running total down the rows
data.cumsum()

### Other cumulative functions

In [None]:
# Running product down the rows
data.cumprod()

In [None]:
# Running minimum: the smallest value seen so far at each row
data.cummin()

In [None]:
# Running maximum: the largest value seen so far at each row
data.cummax()

# Changing windows
# 2 options

In [None]:
# The rainfall frame as it currently stands, for comparison with the re-indexed versions
data

In [None]:
# Re-index the frame onto a daily ('D') frequency
data.asfreq('D')
# acts as if there's a value each day

In [None]:
# Re-index the frame onto a month-end ('M') frequency
# NOTE(review): pandas >= 2.2 deprecates 'M' in favour of 'ME' — confirm the target pandas version
data.asfreq('M')
# takes value at end of each month

What did this do?

In [None]:
# Re-index the frame onto a weekly ('W') frequency
data.asfreq('W')
# takes value at end of each week

There are some obvious downsides to using `asfreq`.
Let's talk about something more complicated.

In [None]:
# resample on its own only sets up the weekly bins; nothing is computed until an aggregation is called on it
data.resample('W')

In [None]:
# Mean of the rows that fall into each weekly bin
data.resample('W').mean()
# What did this do?

In [None]:
# Same weekly bins, aggregated with numpy's median passed in as a function
data.resample('W').apply(np.median)
# We can take any aggregation function

## In essence, resample is just a groupby for dates

What if we resampled up, i.e. used a frequency finer than the one we have data for?

In [None]:
# Resample onto daily bins and take the median of each bin
data.resample('D').median()
# The same as if you called asfreq

# Let's move to a new dataset

In [None]:
# Load lebron.csv as `data2` (default integer index) and display it
data2 = pd.read_csv('lebron.csv')
data2

In [None]:
# Try to parse the Season strings directly as dates
pd.to_datetime(data2.Season)
# pandas isn't sure what to do

In [None]:
# Exercise: use the str split function to only get the second year in each date, make sure your year looks like '2018'
# Then drop the old season column and make your index the years you just got (as a datetime)
# Take the part after the '-' and prefix the century so it reads as a four-digit year
years = '20' + data2.Season.str.split('-',expand=True)[1]
years = pd.to_datetime(years)
# columns= replaces the positional axis argument (drop('Season', 1)), which pandas 2.0 removed
data2 = data2.drop(columns='Season')
data2.index = years

In [None]:
# We can rename an index
# Sets the index's name to 'Season'; the row labels themselves are untouched
data2.index.name='Season'
data2

# Rolling windows

In [None]:
# The PTS column on its own (a Series)
data2.PTS

In [None]:
# 3-row centred window: each value is averaged with the row before and the row after it
data2.PTS.rolling(3,center=True).mean()
# What happened here?

In [None]:
# Again, we can apply any aggregation function
# Here numpy's median is applied to each 3-row centred window
data2.PTS.rolling(3,center=True).apply(np.median)
# What happened here?

In [None]:
# What about different window sizes? Same centred mean, over 5 rows instead of 3
data2.PTS.rolling(5,center=True).mean()

In [None]:
# What about an even window size? A 2-row window cannot be centred symmetrically
data2.PTS.rolling(2,center=True).mean()
# defaults to taking value and value before

In [None]:
# How to get rid of NaNs and just use what we have?
# min_periods=0 lets a window produce a result even when it is not full
data2.PTS.rolling(3,center=True,min_periods=0).mean()


In [None]:
# The full frame again, text columns included
data2

In [None]:
# Rolling mean over every column of the frame at once
data2.rolling(3).mean()
# won't work because of text features

In [None]:
# Rolling 3-row maximum computed separately inside each 'Pos' group
data2.groupby('Pos').rolling(3).max()

In [None]:
# Something fancy

# What is this doing?
# Step by step: drop the 'Lg' and 'Pos' columns, take a centred 3-row rolling mean
# inside each 'Tm' group, then keep the largest rolling mean per 'Tm' group.
# columns= replaces the positional axis argument (drop([...], 1)), which pandas 2.0 removed.
data2.drop(columns=['Lg','Pos']).groupby('Tm').rolling(3,center=True,min_periods=0).mean().groupby('Tm').max()

## Missing data in a time series

In [None]:
# Let's look back at data (the rainfall frame)
data

In [None]:
# Just for fun: overwrite the row at position 4 with NaN so there is a gap to fill
# This changes `data` itself; the original values are only recoverable by re-reading the CSV
data.iloc[4]=np.nan
data

In [None]:
# Backward fill: each missing value is replaced by the next valid value below it.
# .bfill() is the direct spelling of fillna(method='bfill'), which pandas 2.1 deprecated.
data.bfill()
# What does this do?

In [None]:
# Forward fill: each missing value is replaced by the last valid value above it.
# .ffill() is the direct spelling of fillna(method='ffill'), which pandas 2.1 deprecated.
data.ffill()
# What does this do?

In [None]:
# Average of the backward-filled and forward-filled frames: a gap ends up halfway
# between the valid values on either side of it.
# .bfill()/.ffill() replace fillna(method=...), which pandas 2.1 deprecated.
(data.bfill()+data.ffill())/2
# Can do both

# The multi index

In [None]:
# Load jordan.csv as `data_j` (default integer index) and display it
data_j = pd.read_csv('jordan.csv')
data_j

In [None]:
# Exercise: do the same exercise for seasons that we did for lebron and set it to index
# Derive the end year from the four-digit start year instead of prefixing '19' to the
# two-digit suffix: a fixed '19' turns a season such as '2001-02' into 1902.
season_start = data_j.Season.str.split('-',expand=True)[0]
# errors='coerce' yields NaT for any row whose Season does not start with a year
# (presumably there are none — TODO confirm against jordan.csv)
years = pd.to_datetime(season_start, format='%Y', errors='coerce') + pd.DateOffset(years=1)
# columns= replaces the positional axis argument (drop('Season', 1)), which pandas 2.0 removed
data_j = data_j.drop(columns='Season')
data_j.index=years
# Exercise: rename your index as 'Season'
data_j.index.name = 'Season'

In [None]:
# Exercise: add a column called 'Name' to the jordan data where every entry is 'Jordan'
# Exercise: add a column called 'Name' to the lebron data where every entry is 'Lebron'
# Exercise: concat lebron below jordan (as he should be) into one dataframe called bball
# Tag each frame first so the rows can still be told apart once they are stacked
data_j['Name'] = 'Jordan'
data2['Name'] = 'Lebron'
# Row-wise stack: Jordan's rows first, Lebron's underneath
bball = pd.concat([data_j, data2], axis=0)
bball

We have a problem, the dataframe is 2 dimensional but we require 3 dimensions to analyze this dataset properly.
Why?

In [None]:
# How to index by both name and date (in that order)
# First, reset the index: the current index moves into an ordinary column
bball = bball.reset_index()
bball

In [None]:
# Index by player name first, then season. This statement was left commented out, which
# meant the .loc lookups on 'Jordan' / 'Lebron' further down had no name index to search.
# The old .drop(['level_0','index'], 1) is gone: a single reset_index creates neither column.
bball = bball.set_index(['Name','Season'])
bball

Voilà, a fake 3-d array in pandas

In [None]:
# How to index
# Gets us everything about Jordan (relies on 'Name' being the first index level)
bball.loc['Jordan']

In [None]:
# What if we just wanted Lebron, 2004?
# A tuple supplies one key per index level: (Name, Season)
bball.loc[('Lebron','2004-01-01')]

In [None]:
# Exercise: just get Lebron 2004, 3P% using just one call of `.loc` and a comma
# Row key first (the index tuple), then the column label after the comma
bball.loc[('Lebron','2004-01-01'),'3P%']

### Note: We can have more than 2 things in our multiindex

In [None]:
# Rebuild the index with three levels (Name, Season, Tm); only displayed, `bball` is not reassigned
bball.reset_index().set_index(['Name','Season','Tm'])

# The Power of the Multiindex

In [None]:
# Not what we want! This averages over every row, so both players are mixed together.
# numeric_only=True skips the text columns explicitly; pandas 2.0 raises on them otherwise.
bball.mean(numeric_only=True)

In [None]:
# One row of averages per player.
# numeric_only=True skips the text columns explicitly; pandas 2.0 raises on them otherwise.
bball.groupby('Name').mean(numeric_only=True)

In [None]:
# One row of averages per age, pooling whichever players have a season at that age.
# numeric_only=True skips the text columns explicitly; pandas 2.0 raises on them otherwise.
bball.groupby('Age').mean(numeric_only=True)

In [None]:
# Bad! The running total carries straight on from one player's rows into the next player's.
# columns= replaces the positional axis argument (drop([...], 1)), which pandas 2.0 removed.
bball.drop(columns=['Tm','Pos','Lg']).cumsum()

In [None]:
# Good! Grouping by 'Name' restarts the running total for each player.
# columns= replaces the positional axis argument (drop([...], 1)), which pandas 2.0 removed.
bball.drop(columns=['Tm','Pos','Lg']).groupby('Name').cumsum()

In [None]:
# rolling won't understand the player boundary: look at Lebron's first age, whose
# window reaches back into the last Jordan row.
# columns= replaces the positional axis argument (drop([...], 1)), which pandas 2.0 removed.
bball.drop(columns=['Tm','Pos','Lg']).rolling(3,center=True).mean()

In [None]:
# Rolling inside each 'Name' group keeps every window within a single player's rows.
# columns= replaces the positional axis argument (drop([...], 1)), which pandas 2.0 removed.
bball.drop(columns=['Tm','Pos','Lg']).groupby('Name').rolling(3,center=True).mean()
# Weird: 'Name' shows up twice in the index, but we can deal with this

In [None]:
# Odd behavior: how do we get rid of a 'level' of our index?
# Keep the grouped rolling result in `temp` so its index can be edited in the next cells.
# columns= replaces the positional axis argument (drop([...], 1)), which pandas 2.0 removed.
temp = bball.drop(columns=['Tm','Pos','Lg']).groupby('Name').rolling(3,center=True).mean()
temp

In [None]:
# droplevel() with no argument removes the first (outermost) index level
temp.index = temp.index.droplevel()
temp

In [None]:
# Remove the outermost remaining index level
temp.index = temp.index.droplevel()

In [None]:
# Show `temp` after the droplevel calls
temp
# drops from left to right