In [68]:
import pandas as pd
import numpy as np

### Series and Dataframes

#### Series
s = pd.Series(np.random(randn(100))
s.index
s.values
pd.Series([1,2,3,4],index=['a','b','c','d'])
#### len(s) v. s.count
    len(s) = total number of elements
    s.count() = total number of non-NaN elements
    
#### Other useful Series Functions
- s.shape()
- s.unique()
- s.value_counts() = sorted-by-counts histogram of each value in the Series

### Data Frames

#### Creation from a NumPy arrray
pd.DataFrame(np.array([[10,11],[20,21]]))

#### Creation from Pandas Series
pd.DataFrame([pd.Series(np.arange(10,15)), pd.Series(np.arange(15, 20))])

#### Customizing Column and Index names
df = pd.DataFrame(
                 <br>np.array([[0, 1],[2, 3]]),
                 <br>columns=['c1', 'c2'],      
                 index=['r1', 'r2']
                 <br>)

#### Useful DataFrame functions
- df.shape - tuple of (# of rows, # of columns)
- df.columns - print column names
- df.columns = ['new col 1', 'new col 2'] - change column names
- df.values
- df.index

#### Column selection
- df[['col1', 'col2']]
- df.col1
- df.T.iloc[[1, 2]].T

#### Selecting rows using the Index
- **sp500[:3] or sp500['XYL':'YUM']**
 - Slicing is slow and easily confused as selecting columns.  
 - Use it sparingly for selection (outside of Time Series which will be discussed later)
 


- **sp500.loc['XYL']**
 - Is faster
 - The loc keyword distinguishes clearly from column selection
 
 
- **sp500.loc[['XYL', 'YUM']]**
 - Only the specific values passed are selected; not a range of values
 
 
- **sp500.at['MSFT', 'Price']**
 - Retrieve a single value
 - This does not generalize to passing a list like df.loc[]
 
#### Broadcasting

- DataFrame / Series interactions: the given operation is applied ("broadcast") row-by-row through the data frame
  - Each Series item is aligned with a DataFrame item at the same index label (i.e. row-by-row)


- DataFrame / DataFrame operations: align across both the column and index labels
~~~~
# Step 1: Create a subframe
# Step 2: Subtract it from the original frame
# Step 3: Note the NaN values where subframe was missing equal column and index values
subframe = df[1:4][['B','C']]
print subframe
print df - subframe
~~~~


- DataFrames arithmetic is best done with built-in DataFrame methods.
  - Those give access to specifications such as using different axes
  - Allows you to **Broadcast Across the Columns**
~~~~
print 'a_col is the A\'th column so has index values of 0, 1, 2, 3, 4'
a_col = df['A']
print a_col
print 'Above, df.iloc[0] had index values A, B, C, D which were used to align to the columns'
print 'If we naively subtract, we\'re saying try to find columns 0, 1, 2, 3, 4 in the df -- and those don\'t exist'
print  df - a_col
print 'DataFrame arithmetic methods allow us to specify the axis where the index alignment should take place'
print 'By saying axis=0, pandas will align along axis = 0 (i.e. the rows) and then broadcast across the columns'
~~~~
> **df.sub(a_col, axis=0)**

#### Reindexing

s.index = [my_list]
s.reindex

#### Use index and reindex to create alignment between Series and DataFrames


#### Replacing NaN's
- ffill
- bfill
- fill_value
~~~~
s2 = s.copy()
s2.reindex(['a','f'], fill_value = 0)
s3 = pd.Series(['red', 'green', 'blue'], index=[0, 3, 5])
pd.DataFrame([s3, s3.reindex(np.arange(0,7)).ffill(), s3.reindex(np.arange(0,7)).bfill()], index=['orig','ffill','bfill']).T
~~~~

In [69]:
np.random.seed(1)
s = pd.Series(np.random.randn(100))
s

0     1.624345
1    -0.611756
2    -0.528172
3    -1.072969
4     0.865408
5    -2.301539
6     1.744812
7    -0.761207
8     0.319039
9    -0.249370
10    1.462108
11   -2.060141
12   -0.322417
13   -0.384054
14    1.133769
15   -1.099891
16   -0.172428
17   -0.877858
18    0.042214
19    0.582815
20   -1.100619
21    1.144724
22    0.901591
23    0.502494
24    0.900856
25   -0.683728
26   -0.122890
27   -0.935769
28   -0.267888
29    0.530355
        ...   
70   -1.444114
71   -0.504466
72    0.160037
73    0.876169
74    0.315635
75   -2.022201
76   -0.306204
77    0.827975
78    0.230095
79    0.762011
80   -0.222328
81   -0.200758
82    0.186561
83    0.410052
84    0.198300
85    0.119009
86   -0.670662
87    0.377564
88    0.121821
89    1.129484
90    1.198918
91    0.185156
92   -0.375285
93   -0.638730
94    0.423494
95    0.077340
96   -0.343854
97    0.043597
98   -0.620001
99    0.698032
Length: 100, dtype: float64

In [70]:
print s[3]
print '---'
print 'End values are NOT inclusive'
print s[2:4]
print '---'
print s[[2,4,20]]

-1.07296862216
---
End values are NOT inclusive
2   -0.528172
3   -1.072969
dtype: float64
---
2    -0.528172
4     0.865408
20   -1.100619
dtype: float64


In [71]:
s.index

RangeIndex(start=0, stop=100, step=1)

In [72]:
s.values

array([ 1.62434536, -0.61175641, -0.52817175, -1.07296862,  0.86540763,
       -2.3015387 ,  1.74481176, -0.7612069 ,  0.3190391 , -0.24937038,
        1.46210794, -2.06014071, -0.3224172 , -0.38405435,  1.13376944,
       -1.09989127, -0.17242821, -0.87785842,  0.04221375,  0.58281521,
       -1.10061918,  1.14472371,  0.90159072,  0.50249434,  0.90085595,
       -0.68372786, -0.12289023, -0.93576943, -0.26788808,  0.53035547,
       -0.69166075, -0.39675353, -0.6871727 , -0.84520564, -0.67124613,
       -0.0126646 , -1.11731035,  0.2344157 ,  1.65980218,  0.74204416,
       -0.19183555, -0.88762896, -0.74715829,  1.6924546 ,  0.05080775,
       -0.63699565,  0.19091548,  2.10025514,  0.12015895,  0.61720311,
        0.30017032, -0.35224985, -1.1425182 , -0.34934272, -0.20889423,
        0.58662319,  0.83898341,  0.93110208,  0.28558733,  0.88514116,
       -0.75439794,  1.25286816,  0.51292982, -0.29809284,  0.48851815,
       -0.07557171,  1.13162939,  1.51981682,  2.18557541, -1.39

In [73]:
pd.Series([1,2,3,4],index=['a','b','c','d'])

a    1
b    2
c    3
d    4
dtype: int64

In [74]:
pd.Series({'a':1,'b':2,'c':3,'d':4,'e':5})

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [75]:
s = pd.Series([10, 0, 1, 1, 2, 3, 4, 5, 6, np.nan])
s

0    10.0
1     0.0
2     1.0
3     1.0
4     2.0
5     3.0
6     4.0
7     5.0
8     6.0
9     NaN
dtype: float64

In [76]:
print 'len(s): ' + str(len(s))
print 's.count(): ' + str(s.count())
print 'len(s.unique()): ' + str(len(s.unique()))
print '----'
print 's.value_counts(): \n' + str(s.value_counts())

len(s): 10
s.count(): 9
len(s.unique()): 9
----
s.value_counts(): 
1.0     2
6.0     1
5.0     1
4.0     1
3.0     1
2.0     1
0.0     1
10.0    1
dtype: int64


In [77]:
s.shape

(10,)

#### Alignment via index labels makes actions between Series objects super-intuitive


In [78]:
s3 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s3

a    1
b    2
c    3
d    4
dtype: int64

In [79]:
s4 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a'])
s4

d    4
c    3
b    2
a    1
dtype: int64

In [80]:
s3 + s4

a    2
b    4
c    6
d    8
dtype: int64

In [81]:
pd.DataFrame(np.array([[10,11],[20,21]]))

Unnamed: 0,0,1
0,10,11
1,20,21


In [82]:
pd.DataFrame()

In [83]:
df1 = pd.DataFrame([pd.Series(np.arange(10,15)), pd.Series(np.arange(15, 20))])
df1

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [84]:
df1.shape

(2, 5)

In [85]:
df = pd.DataFrame(np.array([[10, 11], [20, 21]]), columns=['a', 'b'])
df

Unnamed: 0,a,b
0,10,11
1,20,21


In [86]:
df.columns

Index([u'a', u'b'], dtype='object')

In [87]:
df.columns = ['new col 1', 'new col 2']
df

Unnamed: 0,new col 1,new col 2
0,10,11
1,20,21


#### Index labels

In [88]:
df = pd.DataFrame(np.array([[0, 1],[2, 3]]),
                 columns=['c1', 'c2'],
                 index=['r1', 'r2'])
df

Unnamed: 0,c1,c2
r1,0,1
r2,2,3


In [89]:
df.index

Index([u'r1', u'r2'], dtype='object')

In [90]:
df.values

array([[0, 1],
       [2, 3]])

In [91]:
s1 = pd.Series(np.arange(1, 6, step=1))
s2 = pd.Series(np.arange(6, 11, step=1))
print s1
print s2
pd.DataFrame({'c1': s1, 'c2': s2})

0    1
1    2
2    3
3    4
4    5
dtype: int64
0     6
1     7
2     8
3     9
4    10
dtype: int64


Unnamed: 0,c1,c2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


#### A DataFrame aligns itself based upon shared indexes

In [92]:
s3 = pd.Series(np.arange(12, 14), index=[1,2])
pd.DataFrame({'c1':s1, 'c2': s2, 'c3':s3})

Unnamed: 0,c1,c2,c3
0,1,6,
1,2,7,12.0
2,3,8,13.0
3,4,9,
4,5,10,


In [94]:
sp500 = pd.read_csv("data/Chapter02/sp500.csv", index_col='Symbol', usecols=[0, 2 , 3, 7])
sp500.head()

Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954
ACN,Information Technology,79.79,8.326
ACE,Financials,102.91,86.897


In [97]:
sp500.index

Index([u'MMM', u'ABT', u'ABBV', u'ACN', u'ACE', u'ACT', u'ADBE', u'AES',
       u'AET', u'AFL',
       ...
       u'XEL', u'XRX', u'XLNX', u'XL', u'XYL', u'YHOO', u'YUM', u'ZMH',
       u'ZION', u'ZTS'],
      dtype='object', name=u'Symbol', length=500)

#### Selecting specific columns

In [120]:
print 'By column number -- DO NOT DO THIS'
sp500.T.iloc[[1,2]].T.head(3)


By column number -- DO NOT DO THIS


Unnamed: 0_level_0,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1
MMM,141.14,26.668
ABT,39.6,15.573
ABBV,53.95,2.954


In [125]:
print 'By column name --- DO THIS'
sp500[['Price','Book Value']].head(3)

By column name --- DO THIS


Unnamed: 0_level_0,Price
Symbol,Unnamed: 1_level_1
MMM,141.14
ABT,39.6
ABBV,53.95


In [127]:
print 'As an attribute -- No good reason to do this unless you need a Series. Doesn\'t work with spaces'
sp500.Price.head(3)

As an attribute -- No good reason to do this unless you need a Series. Doesn't work with spaces


Symbol
MMM     141.14
ABT      39.60
ABBV     53.95
Name: Price, dtype: float64

#### Selecting rows using the Index
- **sp500[:3] or sp500['XYL':'YUM']**
 - Slicing is slow and easily confused as selecting columns.  
 - Use it sparingly for selection (outside of Time Series which will be discussed later)
 


- **sp500.loc['XYL']**
 - Is faster
 - The loc keyword distinguishes clearly from column selection
 
 
- **sp500.loc[['XYL', 'YUM']]**
 - Only the specific values passed are selected; not a range of values
 
 
- **sp500.at['MSFT', 'Price']**
 - Retrieve a single value
 - This does not generalize to passing a list like df.loc[]

In [128]:
print "Slicing with [] -- get the last 3 rows"
sp500[:3]

Slicing with []


Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,Industrials,141.14,26.668
ABT,Health Care,39.6,15.573
ABBV,Health Care,53.95,2.954


In [129]:
print "Slicing with [] -- get rows between two given rows"
sp500['XYL' : 'YUM']

Slicing with [] -- get rows between two given rows


Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XYL,Industrials,38.42,12.127
YHOO,Information Technology,35.02,12.768
YUM,Consumer Discretionary,74.77,5.147


In [131]:
print "Retrieve rows via the index label value"
sp500.loc['MMM']

Retrieve rows via the index label value


Sector        Industrials
Price              141.14
Book Value         26.668
Name: MMM, dtype: object

In [133]:
print 'Loc retrives only what you send it in the list'
sp500.loc[['XYL', 'YUM']]

Loc retrives only what you send it in the list


Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XYL,Industrials,38.42,12.127
YUM,Consumer Discretionary,74.77,5.147


In [135]:
print 'You can pass a list ot the Locs!'
pass_these = ['XYL', 'YUM']
sp500.loc[pass_these]

You can pass a list ot the Locs!


Unnamed: 0_level_0,Sector,Price,Book Value
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
XYL,Industrials,38.42,12.127
YUM,Consumer Discretionary,74.77,5.147


In [142]:
print 'Find single values using df.at[]'
print 'This does NOT generalize to multiple values'
sp500.loc['MMM','Price']

Find single values using df.at[]
This does NOT generalize to multiple values


141.13999999999999

#### Boolean Selection - Probably the most important thing about selection

> * Step 1: Boolean for prices such that 0 < price < 10
> * Step 2: Send that Boolean to the dataframe --> becomes a Boolean filter, returning the True rows
> * Step 3: Select just the Price column from the resulting rows
> > **sp500[(sp500['Price'] < 10) & (sp500['Price'] > 0)][['Price']]**


In [151]:
print 'Step 1: Boolean for prices such that 0 < price < 10'
print 'Step 2: Send that Boolean to the dataframe --> become a Boolean filter, returning the True rows'
print 'Step 3: Select just the Price column from the resulting rows'
sp500[(sp500['Price'] < 10) & (sp500['Price'] > 0)][['Price']]

Step 1: Boolean for prices such that 0 < price < 10
Step 2: Send that Boolean to the dataframe --> become a Boolean filter, returning the True rows
Step 3: Select just the Price column from the resulting rows


Unnamed: 0_level_0,Price
Symbol,Unnamed: 1_level_1
FTR,5.81
HCBK,9.8
HBAN,9.1
SLM,8.82
WIN,9.38


#### Arithmetic in a DF

In [153]:
np.random.seed(123456)
df = pd.DataFrame(np.random.randn(5, 4), columns=['A', 'B', 'C', 'D'])
df

Unnamed: 0,A,B,C,D
0,0.469112,-0.282863,-1.509059,-1.135632
1,1.212112,-0.173215,0.119209,-1.044236
2,-0.861849,-2.104569,-0.494929,1.071804
3,0.721555,-0.706771,-1.039575,0.27186
4,-0.424972,0.56702,0.276232,-1.087401


In [157]:
print 'Scalar operations apply to each row'
df * 2

Scalar operations apply to each row


Unnamed: 0,A,B,C,D
0,0.938225,-0.565727,-3.018117,-2.271265
1,2.424224,-0.346429,0.238417,-2.088472
2,-1.723698,-4.209138,-0.989859,2.143608
3,1.44311,-1.413542,-2.07915,0.54372
4,-0.849945,1.134041,0.552464,-2.174801


#### Broadcasting

- DataFrame / Series interactions: the given operation is applied ("broadcast") row-by-row through the data frame
  - Each Series item is aligned with a DataFrame item at the same index label (i.e. row-by-row)


- DataFrame / DataFrame operations: align across both the column and index labels
~~~~
# Step 1: Create a subframe
# Step 2: Subtract it from the original frame
# Step 3: Note the NaN values where subframe was missing equal column and index values
subframe = df[1:4][['B','C']]
print subframe
print df - subframe
~~~~


- DataFrames arithmetic is best done with built-in DataFrame methods.
  - Those give access to specifications such as using different axes
  - Allows you to **Broadcast Across the Columns**
~~~~
print 'a_col is the A\'th column so has index values of 0, 1, 2, 3, 4'
a_col = df['A']
print a_col
print 'Above, df.iloc[0] had index values A, B, C, D which were used to align to the columns'
print 'If we naively subtract, we\'re saying try to find columns 0, 1, 2, 3, 4 in the df -- and those don\'t exist'
print  df - a_col
print 'DataFrame arithmetic methods allow us to specify the axis where the index alignment should take place'
print 'By saying axis=0, pandas will align along axis = 0 (i.e. the rows) and then broadcast across the columns'
~~~~
> **df.sub(a_col, axis=0)**

In [187]:
df.iloc[0]

A    0.469112
B   -0.282863
C   -1.509059
D   -1.135632
Name: 0, dtype: float64

In [164]:
print 'The value of the 0th row is subtracted from each row'
df - df.iloc[0]

The value of the 0th row is subtracted from each row


Unnamed: 0,A,B,C,D
0,0.0,0.0,0.0,0.0
1,0.743,0.109649,1.628267,0.091396
2,-1.330961,-1.821706,1.014129,2.207436
3,0.252443,-0.423908,0.469484,1.407492
4,-0.894085,0.849884,1.785291,0.048232


In [170]:
print 'Step 1: Create a subframe'
print 'Step 2: Subtract it from the original frame'
print 'Step 3: Note the NaN values where subframe was missing equal column and index values'
subframe = df[1:4][['B','C']]
print subframe
print df - subframe

Step 1: Create a subframe
Step 2: Subtract it from the original frame
Step 3: Note the NaN values where subframe was missing equal column and index values
          B         C
1 -0.173215  0.119209
2 -2.104569 -0.494929
3 -0.706771 -1.039575
    A    B    C   D
0 NaN  NaN  NaN NaN
1 NaN  0.0  0.0 NaN
2 NaN  0.0  0.0 NaN
3 NaN  0.0  0.0 NaN
4 NaN  NaN  NaN NaN


In [188]:
print 'a_col is the A\'th column so has index values of 0, 1, 2, 3, 4'
a_col = df['A']
a_col

a_col is the A'th column so has index values of 0, 1, 2, 3, 4
Above, df.iloc[0] had index values A, B, C, D which were used to align to the columns
In this example, if we naively subtract


0    0.469112
1    1.212112
2   -0.861849
3    0.721555
4   -0.424972
Name: A, dtype: float64

In [189]:
print 'Above, df.iloc[0] had index values A, B, C, D which were used to align to the columns'
print 'If we naively subtract, we\'re saying try to find columns 0, 1, 2, 3, 4 in the df -- and those don\'t exist'
df - a_col

Above, df.iloc[0] had index values A, B, C, D which were used to align to the columns
In this example, if we naively subtract


Unnamed: 0,0,1,2,3,4,A,B,C,D
0,,,,,,,,,
1,,,,,,,,,
2,,,,,,,,,
3,,,,,,,,,
4,,,,,,,,,


In [190]:
print 'DataFrame arithmetic methods allow us to specify the axis where the index alignment should take place'
print 'By saying axis=0, pandas will align along axis = 0 (i.e. the rows) and then broadcast across the columns'
df.sub(a_col, axis=0)

DataFrame arithmetic methods allow us to specify the axis where the index alignment should take place
By saying axis=0, pandas will align along axis = 0 (i.e. the rows) and then broadcast across the columns


Unnamed: 0,A,B,C,D
0,0.0,-0.751976,-1.978171,-1.604745
1,0.0,-1.385327,-1.092903,-2.256348
2,0.0,-1.24272,0.36692,1.933653
3,0.0,-1.428326,-1.76113,-0.449695
4,0.0,0.991993,0.701204,-0.662428


In [185]:
a_col

0    0.469112
1    1.212112
2   -0.861849
3    0.721555
4   -0.424972
Name: A, dtype: float64

#### Reindexing

s.index = [my_list]
s.reindex

#### Use index and reindex to create alignment between Series and DataFrames


#### Replacing NaN's
- ffill
- bfill
- fill_value
~~~~
s2 = s.copy()
s2.reindex(['a','f'], fill_value = 0)
s3 = pd.Series(['red', 'green', 'blue'], index=[0, 3, 5])
pd.DataFrame([s3, s3.reindex(np.arange(0,7)).ffill(), s3.reindex(np.arange(0,7)).bfill()], index=['orig','ffill','bfill']).T
~~~~

In [194]:
print 'Add a new index'
np.random.seed(1)
s = pd.Series(np.random.randn(5))
s.index = ['a', 'b', 'c', 'd', 'e']
s

Add a new index


a    1.624345
b   -0.611756
c   -0.528172
d   -1.072969
e    0.865408
dtype: float64

In [203]:
'Step 1: Create a new series that takes as its index the value of s at those indices'
'Step 2: If s didn\'t have a value at that index, then at that index s2 is set to NaN'
'Step 3: s2 is actually a copy of s, so changes to s2 don\'t affect s'
s2 = s.reindex(['a', 'c', 'e', 'g'])
s2['a'] = 0
print s2
print s['a']

a    0.000000
c   -0.528172
e    0.865408
g         NaN
dtype: float64

In [213]:
print 'No alignment!  Sad!'
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0','1','2'])
s1 + s2

No alignment!  Sad!


0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [218]:
print 'Call the s2 index, select the values from that list, and cast them to int'
s1 + s2.index.values.astype(int)

Call the s2 index, select the values from that list, and cast them to int


0    0
1    2
2    4
dtype: int64

In [221]:
print 'Change the fill value from NaN to something else'
s2 = s.copy()
s2.reindex(['a','f'], fill_value = 0)

Change the fill value from NaN to something else


a    1.624345
f    0.000000
dtype: float64

In [244]:
s3 = pd.Series(['red', 'green', 'blue'], index=[0, 3, 5])
pd.DataFrame([s3, s3.reindex(np.arange(0,7)).ffill(), s3.reindex(np.arange(0,7)).bfill()], index=['orig','ffill','bfill']).T

Unnamed: 0,orig,ffill,bfill
0,red,red,red
1,,red,green
2,,red,green
3,green,green,green
4,,green,blue
5,blue,blue,blue
6,,blue,
