In [370]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

#### 5.1 Introduction to pandas Data Structures

In [371]:
import pandas as pd
import numpy as np

In [372]:
# 1. The simplest Series is built from a plain sequence of values.
#    No index is passed here, so pandas assigns a default integer index.
pd_series = pd.Series([4, 7, -5, 3])

In [373]:
# 1.1 String representation of a Series: index labels on the left, values on the right
pd_series

0    4
1    7
2   -5
3    3
dtype: int64

In [374]:
# 1.2 Array representation of a Series: the values as a NumPy array, without the index
pd_series.values

array([ 4,  7, -5,  3], dtype=int64)

In [375]:
# 1.3 Index object of a Series (a default RangeIndex here, since no index was supplied)
pd_series.index

RangeIndex(start=0, stop=4, step=1)

In [376]:
# 2. Create a Series with an explicit index that labels each data point.
#    The i-th label in `index` is paired with the i-th value in `data`.
pd_series_ii = pd.Series(data = [4, 7, -5, 3], index = ['d', 'b', 'a', 'c'])

In [377]:
# 2.1 String representation: the custom labels appear in place of default integer positions
pd_series_ii

d    4
b    7
a   -5
c    3
dtype: int64

In [378]:
# 2.2 Array representation: the values only; the custom labels are not part of the array
pd_series_ii.values

array([ 4,  7, -5,  3], dtype=int64)

In [379]:
# 2.3 Index object: an Index holding the string labels passed at construction
pd_series_ii.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [380]:
# 3. A single value can be selected by its index label
pd_series_ii['a']

-5

In [381]:
# 4. Selecting a set of values with a list of labels; the result follows the order of the list
pd_series_ii[['c','a', 'd']]

c    3
a   -5
d    4
dtype: int64

In [382]:
# 5. Selecting a set of values using NumPy functions or NumPy-like operations
# 5.1 Filtering with a Boolean array: only the entries where the condition is True are kept,
#     and each kept value retains its index label
pd_series_ii[pd_series_ii > 0]

d    4
b    7
c    3
dtype: int64

In [383]:
# 5.2 Scalar multiplication is applied element-wise; the index labels are preserved
pd_series_ii * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [384]:
# 5.3 Applying math functions
# 5.3.1 A NumPy function applied to the Series returns a Series and keeps the index labels
np.exp(pd_series_ii)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [385]:
# 5.3.2 Applied to `.values`, the function returns a plain NumPy array with no index labels
np.exp(pd_series_ii.values)

array([5.45981500e+01, 1.09663316e+03, 6.73794700e-03, 2.00855369e+01])

In [386]:
# 5.4 A pandas Series can be used in many contexts where you might use a Python dict,
#     e.g. `in` tests whether a label is present in the index
'd' in pd_series_ii

True

In [387]:
# 5.5 You may create a pandas Series from a Python dictionary by passing it as the data.
# 5.5.1 When no index is provided, the dict's keys become the index, in the order they
#       appear in the dict (they are not sorted, as the output below shows)
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
pd_series_3 = pd.Series(sdata)
pd_series_3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [388]:
# 5.5.2 We may pass the labels separately, in the order we want them to appear in the result.
#       A label with no matching dict key ('California') gets NaN, and the values are
#       stored as float64 so that the missing value can be represented.
pd_series_4 = pd.Series(data = sdata, index = ['Utah', 'Oregon', 'Ohio', 'Texas', 'California' ])
pd_series_4

Utah           5000.0
Oregon        16000.0
Ohio          35000.0
Texas         71000.0
California        NaN
dtype: float64

In [389]:
# 5.6 The isnull and notnull functions detect missing values
# 5.6.1 isnull: True where the value is missing
pd.isnull(pd_series_4)

Utah          False
Oregon        False
Ohio          False
Texas         False
California     True
dtype: bool

In [390]:
# 5.6.2 notnull: True where the value is present (the inverse of isnull)
pd.notnull(pd_series_4)

Utah           True
Oregon         True
Ohio           True
Texas          True
California    False
dtype: bool

In [391]:
# 5.6.3 Series also provides these checks as instance methods
pd_series_4.isnull()

Utah          False
Oregon        False
Ohio          False
Texas         False
California     True
dtype: bool

In [392]:
# 5.7 Pandas Series object itself and its index have a name attribute, which
# integrates with other key areas of Pandas functionality.
pd_series_4.name = 'State'
pd_series_4.index.name = 'Population'

In [393]:
pd_series_4

Population
Utah           5000.0
Oregon        16000.0
Ohio          35000.0
Texas         71000.0
California        NaN
Name: State, dtype: float64

In [394]:
# 5.8 Pandas Series index can be altered in-place by assignment
pd_series_4.index = ['Utah_alt', 'Oregon_alt', 'Ohio_alt', 'Texas_alt', 'California_alt']
pd_series_4

Utah_alt           5000.0
Oregon_alt        16000.0
Ohio_alt          35000.0
Texas_alt         71000.0
California_alt        NaN
Name: State, dtype: float64