In [1]:
# Pandas contains data structures and data manipulation tools
# designed to make data cleaning and analysis fast and easy in Python. 
# While pandas adopts many coding idioms from NumPy, 
# the biggest difference is that pandas is designed
# for working with tabular or heterogeneous data.
# NumPy, by contrast, is best suited for working with homogeneous numerical array data.

In [2]:
# Introduction to pandas Data Structures.

In [21]:
import pandas as pd
import numpy as np

In [None]:
# Series
# A Series is a one-dimensional array-like object containing a sequence of values
# (of similar types to NumPy types) and an associated array of data labels, called its index.

In [4]:
# Build a Series from a plain list of values. No index is supplied here,
# so pandas assigns the default integer index.
obj = pd.Series(data=[4, 7, -5, 3])

In [6]:
# Display the Series: index labels are shown on the left, values on the right,
# followed by the dtype of the values.
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
# The index of a Series is available through its `index` attribute;
# obj was built without an explicit index, so this is the default RangeIndex.
obj.index

RangeIndex(start=0, stop=4, step=1)

In [8]:
# It is often useful to give each data point its own label by passing
# an explicit index when constructing the Series:
obj2 = pd.Series(
    data=[4, 7, -5, 3],
    index=['a', 'b', 'c', 'd'],
)

In [9]:
# Display obj2: the values are now labelled 'a'..'d' rather than by integer position.
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [10]:
# With explicit labels, the index is a generic Index object holding those labels.
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [11]:
# Unlike a NumPy array, a Series lets you use index labels to select
# a single value or a set of values:
obj2['a']

4

In [12]:
# obj carries the default integer index, so the key 1 is matched against
# the index label 1 (which here coincides with the second position).
obj[1]

7

In [14]:
# Passing a list selects several entries at once. The list is interpreted as
# index labels (strings here, not integers), and the result follows the order
# of the requested labels.
obj2[['c', 'a', 'd']]

c   -5
a    4
d    3
dtype: int64

In [16]:
# Comparing a Series with a scalar is applied element-wise and yields
# a boolean Series that keeps the original index.
obj > 3

0     True
1     True
2    False
3    False
dtype: bool

In [17]:
# Filter with a boolean Series: only the entries where the condition is True
# are kept, together with their index labels.
obj[obj > 3]

0    4
1    7
dtype: int64

In [18]:
# Scalar multiplication is applied element-wise; the index-value link is preserved.
obj * 2

0     8
1    14
2   -10
3     6
dtype: int64

In [19]:
# pandas does not provide an element-wise `exp`: looking the attribute up raises
# AttributeError. The error is caught and printed so the point is still shown
# without aborting a "Restart Kernel -> Run All"; NumPy's np.exp should be
# applied to the Series instead.
try:
    pd.exp
except AttributeError as exc:
    # Same text a traceback would end with, e.g. "AttributeError: module ...".
    exp_lookup_error = f"{type(exc).__name__}: {exc}"
    print(exp_lookup_error)

AttributeError: module 'pandas' has no attribute 'exp'

In [22]:
# NumPy functions such as np.exp can be applied directly to a Series: the
# function is evaluated element-wise and the result is again a Series with
# the same index (float64 values here).
np.exp(obj)

0      54.598150
1    1096.633158
2       0.006738
3      20.085537
dtype: float64

In [32]:
# A Series can also be created directly from a Python dict:
# the keys become the index labels and the values become the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(data=sdata)

In [33]:
# Display obj3: the dict keys appear as the index labels.
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [34]:
# Index labels to request when building the next Series from sdata.
states = [
    'California',
    'Ohio',
    'Oregon',
    'Texas',
]

In [35]:
# Build a Series from the same dict, but with an explicit list of index labels
# that determines which keys are used and in what order.
obj4 = pd.Series(data=sdata, index=states)

In [37]:
# Here, three values found in sdata were placed in the appropriate locations,
# but since no value for 'California' was found, it appears as NaN (not a number),
# which pandas uses to mark missing or NA values.
# Since 'Utah' was not included in states, it is excluded from the resulting object.
# Note that the values are now float64 rather than int64.
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [39]:
# The terms "missing" and "NA" are used interchangeably to refer to missing data.
# The isnull and notnull functions in pandas should be used to detect missing data;
# isnull returns a boolean Series that is True where a value is missing.
pd.isnull(obj4) # or obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [40]:
# notnull is the complement of isnull: True where a value is present.
pd.notnull(obj4) # or obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [42]:
# A useful Series feature for many applications is that arithmetic operations
# automatically align the operands by index label.
# First, recall the two Series involved, starting with obj3:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [43]:
# ...and obj4, which has a 'California' entry (NaN) but no 'Utah' entry.
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [44]:
# Addition matches entries by index label, not by position. The result covers
# every label found in either operand; a label that is missing (or NaN) in one
# operand, such as 'Utah' or 'California', yields NaN.
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [47]:
# Both the Series object itself and its index have a name attribute,
# which integrates with other key areas of pandas functionality.
# Setting it modifies obj4 in place:
obj4.name = 'population'

In [48]:
# Name the index as well; the name is shown as a header above the index labels
# when the Series is displayed.
obj4.index.name = 'state'

In [49]:
# Display obj4 again: the index name appears above the labels and the Series
# name is reported in the footer next to the dtype.
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64