# Series

In [1]:
from datetime import datetime

import numpy as np
import pandas as pd

# Display settings applied to every output in this notebook.
# Cap the number of rows shown when a Series/DataFrame is printed.
pd.set_option('display.max_rows', 10)
# Render floats with three decimals
# (alternative: pd.set_option('display.precision', 2)).
pd.set_option('display.float_format', '{:.3f}'.format)

## Overview

In [2]:
# Five standard-normal draws labelled 'a' through 'e'
series = pd.Series(data=np.random.randn(5), index=list('abcde'))
series

a    0.467
b   -1.403
c   -1.650
d   -1.101
e    0.438
dtype: float64

In [3]:
series.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [4]:
# Create Series from numpy
pd.Series(np.random.randn(5))

0    1.465
1   -0.390
2    0.525
3   -0.111
4    0.619
dtype: float64

In [5]:
# Positional slicing works as on a NumPy array: this takes the first three elements
series[:3]

a    0.467
b   -1.403
c   -1.650
dtype: float64

In [6]:
# Create Series from dict: the keys become the index labels.
# The int 4444 is stored as a float alongside the other float values (see dtype below).
pd.Series({'a' : 45., 'b' : -19.5, 'c' : 4444})

a     45.000
b    -19.500
c   4444.000
dtype: float64

In [7]:
# Access like dict methods
series['b']

-1.403200049787414

In [8]:
np.nan == None  # note that np.nan and None are not the same!

False

In [9]:
# With string data, None is kept as-is (not turned into NaN) and the dtype is object
pd.Series(['Tiger', 'Bear', None])

0    Tiger
1     Bear
2     None
dtype: object

In [10]:
# An explicit index selects and reorders the dict entries;
# 'd' is not a key of the dict, so its value is NaN
pd.Series({'a' : 45., 'b' : -19.5, 'c' : 4444}, index=['b', 'c', 'd', 'a'])

b    -19.500
c   4444.000
d        nan
a     45.000
dtype: float64

In [11]:
series.get('a')

0.4674312253736693

In [12]:
series.get('f')   # No output: there is no label 'f', so get() returns None (see the type check in the next cell)

In [13]:
type(series.get('f'))

NoneType

In [14]:
# If data is a scalar value, an index must be provided. The value will be repeated to match the length of index
pd.Series(5., index=['a', 'b', 'c', 'd', 'e'])

a   5.000
b   5.000
c   5.000
d   5.000
e   5.000
dtype: float64

**Querying a Series**

In [15]:
series

a    0.467
b   -1.403
c   -1.650
d   -1.101
e    0.438
dtype: float64

In [16]:
# Integer key on a label-indexed Series: resolved by position here (4th element, same value as 'd')
# NOTE(review): newer pandas releases warn about positional lookup through [] — prefer series.iloc[3]; confirm for the pandas version in use
series[3]

-1.10054275933568

In [17]:
series['d']

-1.10054275933568

In [18]:
series.d

-1.10054275933568

In [19]:
series.iloc[3]

-1.10054275933568

In [20]:
series.loc['d']

-1.10054275933568

## Vector Ops

In [21]:
%%timeit 
s = pd.Series(np.random.randint(0, 1000, size=10000))
# Deliberately slow baseline: update one element at a time through .loc.
# Series.items() replaces iteritems(), which pandas 2.0 removed.
for label, value in s.items():    # iterate over (label, value) pairs
    s.loc[label] = value + 2

6.48 s ± 132 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
%%timeit 
s = pd.Series(np.random.randint(0, 1000, size=10000))
s += 2       # Uses broadcasting      

330 µs ± 4.02 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [23]:
%%timeit 
s = pd.Series(np.random.randint(0, 1000, size=10000))
# Element-wise update through the scalar accessor .at, which replaces the
# removed Series.set_value(); iteritems() is likewise replaced by items().
for label, value in s.items():
    s.at[label] = value + 2

  This is separate from the ipykernel package so we can avoid doing imports until


66.6 ms ± 1.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [27]:
# Assigning to a label that does not exist yet adds a new entry (this mutates `series` in place)
# NOTE(review): the recorded output reports dtype object although every value is numeric;
# execution counts jump from 23 to 27, so an unseen cell may have changed the dtype — re-run on a fresh kernel to confirm
series['f'] = 5.
series

a    0.467
b   -1.403
c   -1.650
d   -1.101
e    0.438
f    5.000
dtype: object

In [28]:
del series['f']   # Delete a row using del

In [29]:
# National sports, keyed by the name of the sport
original_sports = pd.Series(
    data=['Bhutan', 'Scotland', 'Japan', 'South Korea'],
    index=['Archery', 'Golf', 'Sumo', 'Taekwondo'],
)

# Four countries that all share the same (duplicated) index label
cricket_loving_countries = pd.Series(
    ['Australia', 'Barbados', 'Pakistan', 'England'],
    index=['Cricket'] * 4,
)

In [30]:
# Series.append was removed in pandas 2.0; pd.concat is the supported way to
# stack Series. It returns a new Series and leaves both inputs unchanged.
all_countries = pd.concat([original_sports, cricket_loving_countries])
all_countries

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [31]:
original_sports

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
dtype: object

In [32]:
cricket_loving_countries

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [33]:
all_countries

Archery           Bhutan
Golf            Scotland
Sumo               Japan
Taekwondo    South Korea
Cricket        Australia
Cricket         Barbados
Cricket         Pakistan
Cricket          England
dtype: object

In [34]:
all_countries.loc['Cricket']

Cricket    Australia
Cricket     Barbados
Cricket     Pakistan
Cricket      England
dtype: object

In [35]:
# add Series without loop
series + series

a    0.935
b   -2.806
c   -3.300
d   -2.201
e    0.876
dtype: object

In [36]:
# Series within arithmetic expression
series + 5

a   5.467
b   3.597
c   3.350
d   3.899
e   5.438
dtype: object

In [38]:
# Series used as argument to NumPy function
# The int32 cast truncates every value to 0 or -1 first, which is why the result
# below only contains exp(0) = 1.000 and exp(-1) = 0.368.
# NOTE(review): if exp of the actual values is intended, cast to float instead — confirm
np.exp(series.astype('int32'))

a   1.000
b   0.368
c   0.368
d   0.368
e   1.000
dtype: float64

A key difference between Series and ndarray is that operations between Series automatically align the data based on
label. Thus, you can write computations without giving consideration to whether the Series involved have the same labels.

In [39]:
series[1:]

b   -1.403
c   -1.650
d   -1.101
e    0.438
dtype: object

In [40]:
series[:-1]

a    0.467
b   -1.403
c   -1.650
d   -1.101
dtype: object

In [41]:
series[1:] + series[:-1]

a      NaN
b   -2.806
c   -3.300
d   -2.201
e      NaN
dtype: object

In [42]:
# Apply Python functions on an element-by-element basis
def multiply_by_ten(i):
    """Return ``i`` multiplied by the float factor 10.0."""
    factor = 10.0
    return i * factor

series.map(multiply_by_ten)

a     4.674
b   -14.032
c   -16.501
d   -11.005
e     4.382
dtype: float64

Series is equipped with a set of string processing methods that make it easy to operate on each element of the array. Perhaps most importantly, these methods exclude missing/NA values automatically. 

In [43]:
# Vectorized string methods: .str applies the operation to every element
# and leaves the missing value (NaN) untouched
series_of_strings = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
)
series_of_strings.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

## Date Arithmetic


| Type      | Description                                                       |  
|-----------|-------------------------------------------------------------------|  
| date      | Store calendar date (year, month, day) using a Gregorian Calendar |  
| datetime  | Store both date and time                                          |  
| timedelta | Difference between two datetime values                            |  

**Common Date Arithmetic Operations**
- calculate differences between dates
- generate sequences of dates and time spans
- convert time series to a particular frequency


| Function                                          | Description                                                                 |   |
|---------------------------------------------------|-----------------------------------------------------------------------------|---|
| to_datetime(*args, **kwargs)                      | Convert argument to datetime.                                               |   |
| to_timedelta(*args, **kwargs)                     | Convert argument to timedelta                                               |   |
| date_range([start, end, periods, freq, tz, ...])  | Return a fixed frequency datetime index, with day (calendar) as the default |   |
| bdate_range([start, end, periods, freq, tz, ...]) | Return a fixed frequency datetime index, with business day as the default   |   |
| period_range([start, end, periods, freq, name])   | Return a fixed frequency PeriodIndex, with day (calendar) as the default    |   |
| timedelta_range([start, end, periods, freq, ...]) | Return a fixed frequency timedelta index, with day as the default           |   |
| infer_freq(index[, warn])                         | Infer the most likely frequency given the input index.                      |   |


In [44]:
# `datetime` is imported in the import cell at the top of the notebook
# Current local date and time
now = datetime.now()
now

datetime.datetime(2019, 2, 20, 22, 43, 30, 773473)

In [45]:
now.year, now.month, now.day

(2019, 2, 20)

In [46]:
# delta
delta = now - datetime(2001, 1, 1)
delta

datetime.timedelta(days=6624, seconds=81810, microseconds=773473)

In [47]:
delta.days

6624

In [48]:
# parsing timedelta from string
pd.Timedelta('4 days 7 hours')

Timedelta('4 days 07:00:00')

In [49]:
# parsing timedelta from named keyword arguments
pd.Timedelta(days=1, seconds=1)

Timedelta('1 days 00:00:01')

In [50]:
# integers with a unit
pd.Timedelta(1, unit='d')

Timedelta('1 days 00:00:00')

In [51]:
# Start date for the date range that is built a few cells further down
rand_date = datetime(2019, 1, 20)
rand_date

datetime.datetime(2019, 1, 20, 0, 0)

In [52]:
rand_date2 = datetime(2019, 9, 10)
rand_date2

datetime.datetime(2019, 9, 10, 0, 0)

In [53]:
rand_date - rand_date2

datetime.timedelta(days=-233)

In [54]:
type(rand_date - rand_date2)

datetime.timedelta

In [56]:
# Daily index starting at rand_date with one entry per day of the gap between the two dates (233);
# the range therefore ends on 2019-09-09, the day before rand_date2
idx = pd.date_range(rand_date, periods=(rand_date2 - rand_date).days, freq='D')
idx

DatetimeIndex(['2019-01-20', '2019-01-21', '2019-01-22', '2019-01-23',
               '2019-01-24', '2019-01-25', '2019-01-26', '2019-01-27',
               '2019-01-28', '2019-01-29',
               ...
               '2019-08-31', '2019-09-01', '2019-09-02', '2019-09-03',
               '2019-09-04', '2019-09-05', '2019-09-06', '2019-09-07',
               '2019-09-08', '2019-09-09'],
              dtype='datetime64[ns]', length=233, freq='D')

In [57]:
# One random value per timestamp; sizing the data from the index itself keeps
# the two lengths in sync without repeating the day-count expression
time_series = pd.Series(np.random.randn(len(idx)), index=idx)
time_series.tail()

2019-09-05    1.942
2019-09-06    0.135
2019-09-07    0.117
2019-09-08    0.021
2019-09-09   -0.929
Freq: D, dtype: float64