< [Online Version Python Data Science Handbook](https://github.com/jakevdp/PythonDataScienceHandbook)| [Menu](https://)>
# 3. Data Manipulation with Pandas 
* Pandas is a newer package built on top of NumPy, and provides an efficient implementation of a DataFrame.
* In this chapter, we will focus on the mechanics of using **Series**, **DataFrame**, and related structures effectively.
* More detailed documentation, along with tutorials and other resources, can be found at http://pandas.pydata.org/.

## Data Indexing and Selection
* Data Selection in Series
* Data Selection in DataFrame

### Data Selection in Series

In [6]:
# A Series maps index labels to values, much like a dictionary
import pandas as pd
data = pd.Series({'a': 0.25, 'b': 0.5, 'c': 0.75, 'd': 1.0})
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [2]:
# Look up a single value by its index label, just like a dictionary key
data['b']

0.5

In [3]:
# `in` tests membership among the index labels (the "keys"), not the values
'a' in data

True

In [12]:
# Both expressions give the index labels; only the last one is displayed
data.index
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [14]:
# items() yields (label, value) pairs, mirroring dict.items()
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [17]:
# Assigning to a label that does not exist yet appends a new entry,
# the same way a dictionary grows when a new key is assigned
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [95]:
# Select all values as a NumPy array (the index labels are dropped)
data.values

array([0.25, 0.5 , 0.75, 1.  , 1.25])

In [93]:
# Select rows by implicit (positional) index or explicit (label) index.
# Note: .loc slices include the end label, while positional slices exclude
# the end position, so the three slices below all return rows 'a'..'c'.
data.iloc[0]        # single value by position; plain data[0] is deprecated on a label-indexed Series
data[0:3]
data.loc['a':'c']
data.iloc[0:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [31]:
# Select several rows at once by passing a list of index labels
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [28]:
# Select rows by value using a boolean mask.
# Each comparison needs its own parentheses because & binds more tightly than > and <.
data[(data > 0.3) & (data < 0.8)] 

b    0.50
c    0.75
dtype: float64

In [53]:
# Select a single value by explicit label or by position
# (all three expressions refer to the same element)
data['c']       # dictionary-style label lookup
data.loc['c']   # explicit label lookup
data.iloc[2]    # positional lookup: 'c' is the third entry

0.75

### Data Selection in DataFrame

In [40]:
# Build two Series keyed by state name, then combine them as DataFrame columns
population_dict = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
population = pd.Series(population_dict)
area = pd.Series(area_dict)
# Both Series carry the same state labels, so the rows line up by state
states = pd.DataFrame(dict(population=population, area=area))
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [8]:
# Select a column as a Series.
# Attribute access is a shortcut for dictionary-style access; it only works
# when the column name is a valid identifier that does not clash with a
# DataFrame attribute or method, so states['area'] is the safer form.
states.area
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [9]:
# Both access styles return the same column. Compare contents with .equals():
# an identity check (`is`) relies on pandas caching the column object, which
# is an implementation detail and is not guaranteed across pandas versions.
states.area.equals(states['area'])

True

In [46]:
# Add a column: dictionary-style assignment of an element-wise computed Series
states['density'] = states['population'] / states['area']
states

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [11]:
# Select all values as a 2-D NumPy array (one row per state, one column per DataFrame column)
states.values

array([[3.83325210e+07, 4.23967000e+05, 9.04139261e+01],
       [2.64481930e+07, 6.95662000e+05, 3.80187404e+01],
       [1.96511270e+07, 1.41297000e+05, 1.39076746e+02],
       [1.95528600e+07, 1.70312000e+05, 1.14806121e+02],
       [1.28821350e+07, 1.49995000e+05, 8.58837628e+01]])

In [12]:
# Select rows by position from the underlying array > array
states.values[0]     # first row only
states.values[0:3]   # first three rows (this is the value displayed)

array([[3.83325210e+07, 4.23967000e+05, 9.04139261e+01],
       [2.64481930e+07, 6.95662000e+05, 3.80187404e+01],
       [1.96511270e+07, 1.41297000e+05, 1.39076746e+02]])

In [67]:
# Select the first three rows > dataframe.
# Every expression below returns the same rows (California, Texas, New York).
# Positional slices exclude the end position; label slices include the end label.
states[0:3]
states.iloc[0:3]
states.iloc[:3]
states.iloc[[0, 1, 2]]
states['California':'New York']
states.loc[:'New York']
states.loc['California':'New York']
states.loc[['California', 'Texas', 'New York']]

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
Texas,26448193,695662,38.01874
New York,19651127,141297,139.076746


In [80]:
# Select columns by position or by label > dataframe
# (every expression below returns the 'population' and 'area' columns;
# positional slices exclude the end position, label slices include the end label)
states[['population','area']]
states.iloc[:, 0:2]
states.iloc[:, :2]
states.iloc[:, [0,1]]
states.loc[:, :'area']
states.loc[:, 'population':'area']
states.loc[:, ['population', 'area']]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [38]:
# Select rows and columns together by position or by label > dataframe.
# Every expression below returns the first three rows (California, Texas,
# New York) restricted to the 'population' and 'area' columns.
states.iloc[0:3, 0:2]
states.iloc[:3, :2]
states.iloc[[0, 1, 2], [0, 1]]
states.loc[:'New York', :'area']
states.loc['California':'New York', 'population':'area']
states.loc[['California', 'Texas', 'New York'], ['population', 'area']]

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297


In [39]:
# Select rows filtering by column > dataframe
# Build the boolean mask once: density strictly between 90 and 140
in_range = (states.density > 90) & (states.density < 140)
# Each expression below keeps the matching rows with all three columns
states[in_range]
states.loc[in_range]
states.loc[in_range, ['population', 'area', 'density']]
states.loc[in_range, 'population':'density']
states.loc[in_range, :'density']

Unnamed: 0,population,area,density
California,38332521,423967,90.413926
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121


In [66]:
# Modify rows / columns > dataframe.
# The first line sets a single cell (row 0, column 2: California's density);
# the remaining lines are equivalent ways of setting the density of
# California and Texas, by position or by label.
states.iloc[0, 2] = 90
states.iloc[0:2, 2] = 90
states.iloc[:2, 2] = 90
states.iloc[[0,1], 2] = 90
states.loc[:'Texas', 'density'] = 90
states.loc['California':'Texas', 'density'] = 90
states.loc[['California', 'Texas'], 'density'] = 90
# Display the frame so the effect of the assignments is visible
states

Unnamed: 0,population,area,density
California,38332521,423967,90.0
Texas,26448193,695662,90.0
New York,19651127,141297,139.076746
Florida,19552860,170312,114.806121
Illinois,12882135,149995,85.883763


In [16]:
# Transpose: swap rows and columns (.T is a transpose, not a pivot table)
states.T

Unnamed: 0,California,Texas,New York,Florida,Illinois
population,38332520.0,26448190.0,19651130.0,19552860.0,12882140.0
area,423967.0,695662.0,141297.0,170312.0,149995.0
density,90.41393,38.01874,139.0767,114.8061,85.88376
