# Pandas

In [1]:
# Standard library first, then third-party packages.
import sys

import numpy as np
import pandas as pd

# Report the library and interpreter versions this notebook was run with.
for label, version in (("Pandas version:", pd.__version__),
                       ("Numpy version:", np.__version__),
                       ("Python version:", sys.version)):
    print(label, version)

Pandas version: 0.19.2
Numpy version: 1.11.3
Python version: 3.6.0 |Anaconda custom (x86_64)| (default, Dec 23 2016, 13:19:00) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]


### Pandas Series objects

In [2]:
# Wrap a plain list in a Series; no index is given, so pandas numbers the rows itself.
values = [5, 10, 15, 20, 25]
data = pd.Series(values)
data

0     5
1    10
2    15
3    20
4    25
dtype: int64

In [3]:
# .values exposes the data held by the Series as a NumPy array
data.values

array([ 5, 10, 15, 20, 25])

In [4]:
# .index holds the row labels; no index was passed at creation, so this is the default RangeIndex
data.index

RangeIndex(start=0, stop=5, step=1)

In [5]:
# Access data through the associated index.
# Single element: look it up by its index label.
print(data.loc[1])
print('--------')
# Range of elements: positional slice, end position excluded.
print(data.iloc[3:5])

10
--------
3    20
4    25
dtype: int64


In [6]:
# The index does not have to be integers -- string labels work as well.
dt = pd.Series(data=[1, 2, 3, 4, 5], index=list('abcde'))
dt

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [7]:
# look a value up by its string label, much like a dictionary key
dt['c']

3

In [8]:
# A Series can be seen as a specialised dictionary: the keys become the index labels.
state_names = ['California', 'Texas', 'New York', 'Florida', 'Illinois']
head_counts = [38332521, 26448193, 19651127, 19552860, 12882135]
population_dict = dict(zip(state_names, head_counts))
population = pd.Series(population_dict)
population

California    38332521
Florida       19552860
Illinois      12882135
New York      19651127
Texas         26448193
dtype: int64

In [9]:
# dictionary-style lookup by key
population['California']

38332521

In [10]:
# Unlike a plain dictionary, a Series supports slicing by label, and the end
# label is included in the result. Both bounds are spelled exactly as they
# appear in the index: a misspelled bound only happens to work while the index
# is sorted, and raises KeyError on an unsorted index.
population['California':'Illinois']

California    38332521
Florida       19552860
Illinois      12882135
dtype: int64

#### Constructing Series objects (pd.Series(data, index=index))

In [11]:
# 1- from a list or NumPy array (no index given, so a default integer index is generated)
pd.Series([1, 2, 4])

0    1
1    2
2    4
dtype: int64

In [12]:
# 2a- from a dictionary with integer keys; the explicit `index` list sets the row order
pd.Series({1:'a', 5:'c' , 2:'g'}, index=[1,5,2])

1    a
5    c
2    g
dtype: object

In [13]:
# 2b- from a dictionary with string keys; rows follow the order of `index`, not of the dict
pd.Series({'a': 1, 'c':5, 'b':67}, index=['a','b','c'])

a     1
b    67
c     5
dtype: int64

### Pandas DataFrame object

In [14]:
# Create a second Series, `area`, keyed by the same state names as `population`.
area_dict = dict([
    ('California', 423967),
    ('Texas', 695662),
    ('New York', 141297),
    ('Florida', 170312),
    ('Illinois', 149995),
])
area = pd.Series(area_dict)
area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [15]:
# Combine the two Series into one DataFrame: the dict maps each column name
# to the Series that supplies that column.
columns_by_name = {'population': population, 'area': area}
states = pd.DataFrame(columns_by_name)
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [16]:
# row labels of the DataFrame (the state names)
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [17]:
# column labels of the DataFrame; these are also held in an Index object
states.columns

Index(['area', 'population'], dtype='object')

In [18]:
# the underlying data as a 2D NumPy array (one row per state, one column per DataFrame column)
states.values

array([[  423967, 38332521],
       [  170312, 19552860],
       [  149995, 12882135],
       [  141297, 19651127],
       [  695662, 26448193]])

In [19]:
# indexing with a column name returns that column as a Series
states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

#### Constructing DataFrame objects

In [20]:
# 1- from a single Series object: its index becomes the row index and
#    `columns` names the one resulting column
pd.DataFrame(population, columns=['population'])

Unnamed: 0,population
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [21]:
# 2- from a list of dictionaries: every dict is one row, its keys are the column names
row_template = {'a': 1, 'b': 2}
data = [dict(row_template) for _ in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,1,2
1,1,2
2,1,2


In [22]:
# 3- from a dictionary of Series objects: each key names a column, each Series supplies its values
pd.DataFrame({'population':population , 'area':area})

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [23]:
# 4- from a 2D NumPy array, with explicit column names and row labels.
# A locally seeded generator makes the random values reproducible on every
# re-run; the table shown below this cell came from an earlier unseeded run,
# so its numbers will differ.
random_values = np.random.RandomState(0).rand(3, 2)
pd.DataFrame(random_values,
             columns=['aaa', 'bbb'],
             index=['a', 'b', 'c'])

Unnamed: 0,aaa,bbb
a,0.366499,0.479924
b,0.210873,0.069685
c,0.866151,0.17063


### Pandas Index object

In [24]:
# An Index can be built directly from a list of labels.
index_labels = [1, 2, 5, 16]
ind = pd.Index(index_labels)
ind

Int64Index([1, 2, 5, 16], dtype='int64')

In [25]:
# an Index supports array-style positional access to its values
ind[3]

16

In [26]:
# An Index exposes the familiar array attributes: size, shape, ndim and dtype.
index_attributes = [ind.size, ind.shape, ind.ndim, ind.dtype]
print(*index_attributes)

4 (4,) 1 int64


In [27]:
# Index objects are immutable: item assignment is rejected with a TypeError.
# The error is caught and printed so the demonstration is visible while the
# notebook still runs from top to bottom.
try:
    ind[3] = 0
except TypeError as err:
    print('TypeError:', err)

## Data Indexing and Selection

In [28]:
# Small float Series with string labels, used for the indexing examples.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [29]:
# dictionary-style access: map a label to its value
data['b']

0.5

In [30]:
# `in` tests membership against the index labels, as it does against the keys of a dict
'a' in data

True

In [31]:
# ...and not against the values: 1.0 is stored in the Series but is not a label, so this is False
1.0 in data

False

In [32]:
# keys() returns the index, mirroring dict.keys()
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [33]:
# items() yields (label, value) pairs, mirroring dict.items()
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [34]:
# assigning to a label that is not in the index yet adds a new entry,
# just like assigning to a new dictionary key
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [35]:
# slicing by explicit index labels: the end label 'e' is included in the result
data['b':'e']

b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [36]:
# slicing by implicit integer position: the end position 2 is excluded from the result
data[0:2]

a    0.25
b    0.50
dtype: float64

In [37]:
# masking: keep only the entries whose value lies strictly between 0.3 and 0.8
data[(data > 0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [38]:
# fancy indexing: pass a list of labels to pick several entries at once
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [39]:
# .loc indexes by explicit label
data.loc['a']

0.25

In [40]:
# .loc with a list of labels selects those entries
data.loc[['a', 'c']]

a    0.25
c    0.75
dtype: float64

In [41]:
# .loc with a label slice: the end label 'd' is included
data.loc['a':'d']

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [42]:
# .iloc indexes by integer position (0-based), regardless of the labels
data.iloc[1]

0.5

In [43]:
# .iloc with a positional slice: the end position 3 is excluded
data.iloc[1:3]

b    0.50
c    0.75
dtype: float64

### Data Selection in DataFrames

In [44]:
# Rebuild the per-state Series and combine them into the frame used in this section.
area_by_state = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
}
population_by_state = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
}
area = pd.Series(area_by_state)
pop = pd.Series(population_by_state)
data = pd.DataFrame({'area': area, 'population': pop})
data

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [45]:
# dictionary-style access returns the individual Series that makes up a column
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [46]:
# attribute-style access to the same column
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [47]:
# identity check between the two access styles (True in the run shown here)
# NOTE(review): presumably this relies on pandas handing back the same cached
# column object -- confirm before depending on it with other pandas versions
data.area is data['area']

True

In [48]:
# element-wise comparison of the two access styles: one boolean per row
data.area == data['area']

California    True
Florida       True
Illinois      True
New York      True
Texas         True
Name: area, dtype: bool

In [49]:
# add a derived column: element-wise division of two existing columns,
# stored in `data` under the new name 'density'
data['density'] = data['population'] / data['area']
data

Unnamed: 0,area,population,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [50]:
# the frame viewed as a 2D NumPy array; with the float 'density' column present
# every value is shown as a float
data.values

array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [51]:
# a DataFrame can be transposed like a 2D array: rows and columns swap places
data.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
population,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


In [52]:
# indexing the underlying array with a single integer returns a row (here the first one)
data.values[0]

array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [53]:
# the second row of the underlying array
data.values[1]

array([  1.70312000e+05,   1.95528600e+07,   1.14806121e+02])

In [54]:
# ...whereas indexing the DataFrame itself with a single key returns a column
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [55]:
# display the full frame for reference
data

Unnamed: 0,area,population,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [56]:
# iloc: select by integer position -- a single integer returns that row as a Series
data.iloc[1]

area          1.703120e+05
population    1.955286e+07
density       1.148061e+02
Name: Florida, dtype: float64

In [57]:
# positional row slice: rows 1 and 2 (end position 3 excluded)
data.iloc[1:3]

Unnamed: 0,area,population,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [58]:
# positional slices on both axes: rows 1-2 and the first two columns
data.iloc[1:3, 0:2]

Unnamed: 0,area,population
Florida,170312,19552860
Illinois,149995,12882135


In [59]:
# loc: select by label -- a single row label returns that row as a Series
data.loc['Florida']

area          1.703120e+05
population    1.955286e+07
density       1.148061e+02
Name: Florida, dtype: float64

In [60]:
# a list of row labels returns a DataFrame holding just those rows
data.loc[['Florida', 'Illinois']]

Unnamed: 0,area,population,density
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763


In [61]:
# Select rows and columns in a single .loc call instead of chaining two
# indexing operations; passing the list ['area'] (rather than the bare string)
# keeps the result a one-column DataFrame.
data.loc[['Florida', 'Illinois'], ['area']]

Unnamed: 0,area
Florida,170312
Illinois,149995


In [62]:
# label slices on both axes; with .loc the end labels ('Illinois', 'population') are included
data.loc['Florida':'Illinois', :'population']

Unnamed: 0,area,population
Florida,170312,19552860
Illinois,149995,12882135


In [63]:
# Hybrid selection: rows by position, columns by label.
# The `.ix` indexer that allowed this in one call was deprecated in pandas 0.20
# and removed in 1.0, so the two explicit indexers are chained instead:
# `.iloc` picks rows 1-2 by position, then `.loc` slices the columns by label
# up to and including 'population'.
data.iloc[1:3].loc[:, :'population']

Unnamed: 0,area,population
Florida,170312,19552860
Illinois,149995,12882135


In [64]:
# Rows 1-2 by position via `.iloc`, then an explicit list of column labels via
# `.loc`. This replaces the `.ix` indexer, which was deprecated in pandas 0.20
# and removed in 1.0.
data.iloc[1:3].loc[:, ['area', 'population']]

Unnamed: 0,area,population
Florida,170312,19552860
Illinois,149995,12882135


In [65]:
# combine masking and fancy indexing in one .loc call:
# rows where density exceeds 100, restricted to the 'area' and 'density' columns
data.loc[data['density'] > 100 , ['area', 'density']]

Unnamed: 0,area,density
Florida,170312,114.806121
New York,141297,139.076746


In [66]:
# more examples -- display the full frame again for reference
data

Unnamed: 0,area,population,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [67]:
# a pair of integer positions (row, column) returns a single cell: first row, first column
data.iloc[0,0]

423967.0

In [68]:
# State with the largest population: sort descending, keep the first row by
# position with `.iloc`, then pick the columns by label with `.loc`.
# (`.ix`, used for this previously, was deprecated in pandas 0.20 and removed in 1.0.)
data.sort_values('population', ascending=False).iloc[:1].loc[:, ['area', 'population']]

Unnamed: 0,area,population
California,423967,38332521


### Operating on Data in Pandas

In [69]:
# Create a Series of four random integers in [0, 10); the seeded generator
# makes the draw identical on every run.
rng = np.random.RandomState(seed=42)
random_ints = rng.randint(low=0, high=10, size=4)
ser = pd.Series(random_ints)
ser

0    6
1    3
2    7
3    4
dtype: int64

In [70]:
# Create a 3x4 DataFrame of random integers in [0, 10), drawn from the same `rng`.
column_names = list('ABCD')
df = pd.DataFrame(rng.randint(0, 10, size=(3, 4)), columns=column_names)
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [71]:
# a NumPy function applied to a Series works element-wise and the result keeps the index
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [72]:
# a NumPy reduction collapses the Series to a single scalar
np.max(ser)

7

In [73]:
# arithmetic with a scalar is applied to every element; row and column labels are kept
df + 1

Unnamed: 0,A,B,C,D
0,7,10,3,7
1,8,5,4,8
2,8,3,6,5


In [74]:
# Flip the sign of every element, then apply the sine element-wise;
# the result keeps the frame's row and column labels.
negated = -df
np.sin(negated)

Unnamed: 0,A,B,C,D
0,0.279415,-0.412118,-0.909297,0.279415
1,-0.656987,0.756802,-0.14112,-0.656987
2,-0.656987,-0.909297,0.958924,0.756802
