In [5]:
#The Pandas DataFrame Object
#Arrays or dictionaries? 

ImportError: No module named 'theano'

In [2]:
#DataFrame as generalized NumPy array
#If a Series is an analog of a one-dimensional array with 
#flexible indices, a DataFrame is an analog of a two-dimensional 
#array with both flexible row indices and flexible column names

import numpy as np
import pandas as pd

# Area per state, keyed by state name (pairs listed in the original insertion order).
area_dict = dict(zip(
    ('California', 'Texas', 'New York', 'Florida', 'Illinois'),
    (423967, 695662, 141297, 170312, 149995),
))

# The dict keys become the index labels of the Series.
area = pd.Series(data=area_dict)

In [3]:
area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
dtype: int64

In [4]:
area_dict

{'California': 423967,
 'Florida': 170312,
 'Illinois': 149995,
 'New York': 141297,
 'Texas': 695662}

In [10]:
# Population per state, keyed by the same state names used for the areas.
population_dict = dict([
    ('California', 38332521),
    ('Texas', 26448193),
    ('New York', 19651127),
    ('Florida', 19552860),
    ('Illinois', 12882135),
])
# As with area, the dict keys become the Series index.
population = pd.Series(data=population_dict)

In [11]:
states = pd.DataFrame({'population': population, 
                       'area': area})

In [12]:
states

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [13]:
states.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [14]:
states.columns

Index(['area', 'population'], dtype='object')

In [15]:
#DataFrame as specialized dictionary
# Where a dictionary maps a key to a value, a DataFrame 
#maps a column name to a Series of column data

states['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [17]:
#Constructing DataFrame objects
#From a single Series object: to_frame() labels the resulting column explicitly.
#Passing columns=[...] to the DataFrame constructor instead selects by the
#Series' name, so it yields an all-NaN column whenever `population` already
#carries a different name (it is re-created with name='population' further down,
#and the execution counts show cells were not run strictly top to bottom).
population.to_frame(name='populations')

Unnamed: 0,populations
California,38332521
Florida,19552860
Illinois,12882135
New York,19651127
Texas,26448193


In [18]:
#From a list of dicts
#Each dict is one row; its keys ('a', 'b') become the column names.
data = [dict(a=n, b=2 * n) for n in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [19]:

pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [20]:
#From a dictionary of Series objects
pd.DataFrame({'population': population,
             'area': area})

Unnamed: 0,area,population
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [21]:
#From a two-dimensional NumPy array
pd.DataFrame(np.random.rand(3,2),
            columns = ['foo', 'bar'],
            index = ['a', 'b', 'c'])

Unnamed: 0,foo,bar
a,0.940781,0.12506
b,0.301587,0.259845
c,0.642886,0.341834


In [23]:
#From a NumPy structured array
#A structured array has named fields; here 'A' holds 8-byte integers and
#'B' holds 8-byte floats, and all three records start out as zeros.
A = np.zeros(shape=3, dtype=np.dtype([('A', 'i8'), ('B', 'f8')]))
A

array([(0, 0.0), (0, 0.0), (0, 0.0)], 
      dtype=[('A', '<i8'), ('B', '<f8')])

In [24]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [25]:
#The Pandas Index Object
ind = pd.Index([2, 3, 5, 7, 11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

In [26]:
#Index as immutable array
ind[1]

3

In [27]:
ind[::2]

Int64Index([2, 5, 11], dtype='int64')

In [28]:
#Index objects also have many of the attributes familiar from NumPy arrays:
print(ind.size, ind.shape, ind.ndim, ind.dtype)

5 (5,) 1 int64


In [29]:
#One difference between Index objects and NumPy arrays is that
#indices are immutable - that is, they cannot be modified via the normal means.
#The failed assignment is caught and printed so that the demonstration does
#not abort a Restart-and-Run-All of the notebook.
try:
    ind[1] = 0
except TypeError as err:
    print('TypeError:', err)

TypeError: Index does not support mutable operations

In [30]:
#Index as Ordered Set
#Pandas objects are designed to facilitate operations such as 
#joins across datasets, which depend on many aspects of set arithmetic. 
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 6, 11])

In [31]:
#Use the named set method: on current pandas the `&` operator between Index
#objects is an element-wise logical operation, not a set intersection.
indA.intersection(indB) #intersection

Int64Index([3, 5], dtype='int64')

In [32]:
#Use the named set method: on current pandas the `|` operator between Index
#objects is an element-wise logical operation, not a set union.
indA.union(indB) #union

Int64Index([1, 2, 3, 5, 6, 7, 9, 11], dtype='int64')

In [33]:
#Use the named set method: on current pandas the `^` operator between Index
#objects is an element-wise logical operation, not a set operation.
indA.symmetric_difference(indB) #symmetric difference: labels found in exactly one of the two indices

Int64Index([1, 2, 6, 7, 9, 11], dtype='int64')

In [None]:
#Data Indexing and Selection

In [34]:
#Data Selection in Series
#As we saw in the previous section, a Series object acts in many ways like a one-dimensional NumPy array, 
#and in many ways like a standard Python dictionary.

In [36]:
#Series as dictionary
#Like a dict, the Series maps each index label ('a'..'d') to a value.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=list('abcd'))
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [37]:
data['b']

0.5

In [38]:
#We can also use dictionary-like Python expressions and methods 
#to examine the keys/indices and values:
'a' in data

True

In [39]:
data.keys()

Index(['a', 'b', 'c', 'd'], dtype='object')

In [40]:
list(data.items())

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

In [41]:
#Series objects can even be modified with a dictionary-like syntax.
data['e'] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [42]:
# Series as one-dimensional array
# A Series builds on this dictionary-like interface and 
#provides array-style item selection via the same 
#basic mechanisms as NumPy arrays – that is, slices, masking, and fancy indexing.

In [43]:
#slicing by explicit index
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [44]:
#slicing by implicit integer index
data[0:2]

a    0.25
b    0.50
dtype: float64

In [45]:
#masking
data[(data >0.3) & (data < 0.8)]

b    0.50
c    0.75
dtype: float64

In [46]:
# Fancy indexing
data[['a', 'e']]

a    0.25
e    1.25
dtype: float64

In [48]:
#Indexers: loc and iloc (the older hybrid indexer, ix, was removed in pandas 1.0)
#The integer labels (1, 3, 5) deliberately differ from the positions (0, 1, 2).
data = pd.Series(data=list('abc'), index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [49]:
#explicit index when indexing
data[1]

'a'

In [50]:
#implicit index when slicing
data[1:3]

3    b
5    c
dtype: object

In [51]:
# First, the loc attribute allows indexing and slicing that 
#always references the explicit index:
data.loc[1]

'a'

In [52]:
data.loc[1:3]

1    a
3    b
dtype: object

In [53]:
data.loc[3:5]

3    b
5    c
dtype: object

In [54]:
#The iloc attribute allows indexing and slicing that always references 
#the implicit Python-style index:
data.iloc[1]

'b'

In [55]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [56]:
#Data Selection in DataFrame

In [57]:
#DataFrame as dictionary
#Two Series sharing the same state labels become the 'area' and 'pop'
#columns of a single frame.
area = pd.Series(data={
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
    'Illinois': 149995,
})
pop = pd.Series(data={
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 19552860,
    'Illinois': 12882135,
})
data = pd.DataFrame(dict(area=area, pop=pop))
data

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135
New York,141297,19651127
Texas,695662,26448193


In [58]:
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [59]:
#Equivalently, we can use attribute-style 
#access with column names that are strings:
data.area

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [60]:
#This attribute-style column access actually accesses the exact same object 
#as the dictionary-style access:
data.area is data['area']

True

In [None]:
#In particular, you should avoid the temptation to try 
#column assignment via attribute

In [61]:
data['density'] = data['pop']/ data['area']

In [62]:
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.413926
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [63]:
#DataFrame as two dimensional array
data.values

array([[  4.23967000e+05,   3.83325210e+07,   9.04139261e+01],
       [  1.70312000e+05,   1.95528600e+07,   1.14806121e+02],
       [  1.49995000e+05,   1.28821350e+07,   8.58837628e+01],
       [  1.41297000e+05,   1.96511270e+07,   1.39076746e+02],
       [  6.95662000e+05,   2.64481930e+07,   3.80187404e+01]])

In [64]:
data.T

Unnamed: 0,California,Florida,Illinois,New York,Texas
area,423967.0,170312.0,149995.0,141297.0,695662.0
pop,38332520.0,19552860.0,12882140.0,19651130.0,26448190.0
density,90.41393,114.8061,85.88376,139.0767,38.01874


In [65]:
data.values[0]

array([  4.23967000e+05,   3.83325210e+07,   9.04139261e+01])

In [66]:
data['area']

California    423967
Florida       170312
Illinois      149995
New York      141297
Texas         695662
Name: area, dtype: int64

In [67]:
#use loc and iloc to index this two-dimensional array
data.iloc[:3, :2] #first three rows, first 2 columns

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [68]:
data.index

Index(['California', 'Florida', 'Illinois', 'New York', 'Texas'], dtype='object')

In [69]:
data.loc[:'Illinois', :'pop'] #same thing, just being more explicit

Unnamed: 0,area,pop
California,423967,38332521
Florida,170312,19552860
Illinois,149995,12882135


In [70]:
#The ix indexer used to allow a hybrid of these two approaches (row positions
#together with a column label). It was deprecated in pandas 0.20 and removed in
#1.0, so translate the row positions into labels and select with loc instead:
data.loc[data.index[:3], 'pop']

California    38332521
Florida       19552860
Illinois      12882135
Name: pop, dtype: int64

In [71]:
#combining masking and fancy indexing
data.loc[data.density > 100, ['pop', 'density']]

Unnamed: 0,pop,density
Florida,19552860,114.806121
New York,19651127,139.076746


In [72]:
data.loc[data.density > 100]

Unnamed: 0,area,pop,density
Florida,170312,19552860,114.806121
New York,141297,19651127,139.076746


In [73]:
#indexing can be used to modify values
data.iloc[0,2] = 90
data

Unnamed: 0,area,pop,density
California,423967,38332521,90.0
Florida,170312,19552860,114.806121
Illinois,149995,12882135,85.883763
New York,141297,19651127,139.076746
Texas,695662,26448193,38.01874


In [78]:
#Operating on Data in Pandas
#Ufuncs: Index Preservation
#A seeded generator, so the same numbers are drawn on every run.
rng = np.random.RandomState(seed=42)
#Four random integers from 0-9 (the upper bound 10 is exclusive).
ser = pd.Series(data=rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int32

In [79]:
rng

<mtrand.RandomState at 0x2910e5513f0>

In [82]:
#3x4 frame of random integers from 0-9, drawn from the shared seeded generator.
df = pd.DataFrame(data=rng.randint(low=0, high=10, size=(3, 4)),
                  columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,1,7,5,1
1,4,0,9,5
2,8,0,9,2


In [83]:
#If we apply a NumPy ufunc on either of these objects, 
#the result will be another Pandas object with the indices preserved:
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [84]:
np.sin(df* np.pi/4)

Unnamed: 0,A,B,C,D
0,0.7071068,-0.707107,-0.707107,0.707107
1,1.224647e-16,0.0,0.707107,-0.707107
2,-2.449294e-16,0.0,0.707107,1.0


In [87]:
#UFuncs: Index Alignment 

#The two Series only partly overlap: Alaska appears only in area,
#New York only in population.
area = pd.Series(
    data={'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    data={'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)

In [88]:
#Let's see what happens when we divide these to compute the population density:
population/ area

Alaska              NaN
California    90.413926
New York            NaN
Texas         38.018740
dtype: float64

In [89]:
#Union of the two label sets, i.e. the index of the division result above.
#Index.union() is used because `|` between Index objects is an element-wise
#logical operation on current pandas rather than a set union.
area.index.union(population.index)

Index(['Alaska', 'California', 'New York', 'Texas'], dtype='object')

In [90]:
#Labels 1 and 2 are shared; label 0 exists only in A and label 3 only in B.
A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [91]:
A.add(B, fill_value = 0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

In [93]:
#Index Alignment in DataFrame
#A similar type of alignment takes place for both columns and 
#indices when performing operations on DataFrames:
#2x2 frame of random integers from 0-19 with columns 'A' and 'B'.
A = pd.DataFrame(data=rng.randint(low=0, high=20, size=(2, 2)),
                 columns=['A', 'B'])
A

Unnamed: 0,A,B
0,18,6
1,8,6


In [95]:
#3x3 frame of random integers from 0-9; note the column order is B, A, C.
B = pd.DataFrame(data=rng.randint(low=0, high=10, size=(3, 3)),
                 columns=['B', 'A', 'C'])
B

Unnamed: 0,B,A,C
0,3,6,7
1,2,0,3
2,1,7,3


In [96]:
A+B

Unnamed: 0,A,B,C
0,24.0,9.0,
1,8.0,8.0,
2,,,


In [97]:
#Fill value: the mean of all values in A (stack() first collapses A's columns into a single Series).
fill = A.stack().mean()
#Cells that exist in only one of the two frames are combined with `fill` instead of producing NaN.
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,24.0,9.0,16.5
1,8.0,8.0,12.5
2,16.5,10.5,12.5


In [99]:
#Ufuncs: Operations Between DataFrame and Series
#Plain 3x4 NumPy array of random integers from 0-9 (upper bound 10 is exclusive).
A = rng.randint(low=0, high=10, size=(3, 4))
A

array([[6, 8, 7, 4],
       [1, 4, 7, 9],
       [8, 8, 0, 8]])

In [100]:
A- A[0] #subtracts row 0 from every row of the array
#According to NumPy's broadcasting rules,
#subtraction between a two-dimensional array and one of its
#rows is applied row-wise.

array([[ 0,  0,  0,  0],
       [-5, -4,  0,  5],
       [ 2,  0, -7,  4]])

In [102]:
#In Pandas, the convention similarly operates row-wise by default:
df = pd.DataFrame(A, columns = list('QRST'))
df

Unnamed: 0,Q,R,S,T
0,6,8,7,4
1,1,4,7,9
2,8,8,0,8


In [103]:
df - df.iloc[0]

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,-5,-4,0,5
2,2,0,-7,4


In [105]:
#operating column-wise
df.subtract(df['R'], axis = 0)

Unnamed: 0,Q,R,S,T
0,-2,0,-1,-4
1,-3,0,3,5
2,0,0,-8,0


In [109]:
#Note that these DataFrame/Series operations, like the operations 
#discussed above, will automatically align indices between the two elements:
halfrow = df.iloc[0, ::2]
halfrow

Q    6
S    7
Name: 0, dtype: int32

In [114]:
df.iloc[0, ::2]

Q    6
S    7
Name: 0, dtype: int32