In [1]:
import pandas as pd
import numpy as np

### Series and Dataframes

#### Series
s = pd.Series(np.random.randn(100))
s.index
s.values
pd.Series([1,2,3,4],index=['a','b','c','d'])
#### len(s) vs. s.count()
    len(s) = total number of elements
    s.count() = total number of non-NaN elements
    
#### Other useful Series Functions
- s.shape = tuple with the number of elements, e.g. (10,) — an attribute, so no parentheses
- s.unique()
- s.value_counts() = sorted-by-counts histogram of each value in the Series

In [2]:
# Seed NumPy's global generator first so the 100 standard-normal draws
# (and therefore every value displayed below) are reproducible.
np.random.seed(1)
s = pd.Series(data=np.random.randn(100))
s

0     1.624345
1    -0.611756
2    -0.528172
3    -1.072969
4     0.865408
5    -2.301539
6     1.744812
7    -0.761207
8     0.319039
9    -0.249370
10    1.462108
11   -2.060141
12   -0.322417
13   -0.384054
14    1.133769
15   -1.099891
16   -0.172428
17   -0.877858
18    0.042214
19    0.582815
20   -1.100619
21    1.144724
22    0.901591
23    0.502494
24    0.900856
25   -0.683728
26   -0.122890
27   -0.935769
28   -0.267888
29    0.530355
        ...   
70   -1.444114
71   -0.504466
72    0.160037
73    0.876169
74    0.315635
75   -2.022201
76   -0.306204
77    0.827975
78    0.230095
79    0.762011
80   -0.222328
81   -0.200758
82    0.186561
83    0.410052
84    0.198300
85    0.119009
86   -0.670662
87    0.377564
88    0.121821
89    1.129484
90    1.198918
91    0.185156
92   -0.375285
93   -0.638730
94    0.423494
95    0.077340
96   -0.343854
97    0.043597
98   -0.620001
99    0.698032
Length: 100, dtype: float64

In [7]:
# Three ways to select from the Series `s` with [].
# print is called as a function with a single argument so the cell runs
# unchanged (and prints the same text) on both Python 2 and Python 3.
print(s[3])            # a single integer key -> one scalar value
print('---')
print('End values are NOT inclusive')
print(s[2:4])          # an integer slice stops before the end value (rows 2 and 3)
print('---')
print(s[[2, 4, 20]])   # a list of keys -> a Series holding just those entries

-1.07296862216
---
End values are NOT inclusive
2   -0.528172
3   -1.072969
dtype: float64
---
2    -0.528172
4     0.865408
20   -1.100619
dtype: float64


In [8]:
# Show the index (row labels) of the Series `s`.
s.index

RangeIndex(start=0, stop=100, step=1)

In [9]:
# Show the data of `s` as a plain NumPy array (the index labels are not included).
s.values

array([ 1.62434536, -0.61175641, -0.52817175, -1.07296862,  0.86540763,
       -2.3015387 ,  1.74481176, -0.7612069 ,  0.3190391 , -0.24937038,
        1.46210794, -2.06014071, -0.3224172 , -0.38405435,  1.13376944,
       -1.09989127, -0.17242821, -0.87785842,  0.04221375,  0.58281521,
       -1.10061918,  1.14472371,  0.90159072,  0.50249434,  0.90085595,
       -0.68372786, -0.12289023, -0.93576943, -0.26788808,  0.53035547,
       -0.69166075, -0.39675353, -0.6871727 , -0.84520564, -0.67124613,
       -0.0126646 , -1.11731035,  0.2344157 ,  1.65980218,  0.74204416,
       -0.19183555, -0.88762896, -0.74715829,  1.6924546 ,  0.05080775,
       -0.63699565,  0.19091548,  2.10025514,  0.12015895,  0.61720311,
        0.30017032, -0.35224985, -1.1425182 , -0.34934272, -0.20889423,
        0.58662319,  0.83898341,  0.93110208,  0.28558733,  0.88514116,
       -0.75439794,  1.25286816,  0.51292982, -0.29809284,  0.48851815,
       -0.07557171,  1.13162939,  1.51981682,  2.18557541, -1.39

In [10]:
# Supply explicit string labels 'a'..'d' instead of the default 0..3 index.
pd.Series([1, 2, 3, 4], index=list('abcd'))

a    1
b    2
c    3
d    4
dtype: int64

In [11]:
# Building a Series from a mapping: keys become the index labels, values the data.
pd.Series(dict(a=1, b=2, c=3, d=4, e=5))

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [13]:
# Sample data with one repeated value (1) and one missing value (NaN);
# it is used below to contrast len(), count(), unique() and value_counts().
s = pd.Series(
    data=[10, 0, 1, 1, 2, 3, 4, 5, 6, np.nan],
)
s

0    10.0
1     0.0
2     1.0
3     1.0
4     2.0
5     3.0
6     4.0
7     5.0
8     6.0
9     NaN
dtype: float64

In [25]:
# Compare the different ways of "counting" a Series that contains a NaN.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('len(s): ' + str(len(s)))                         # every element, NaN included
print('s.count(): ' + str(s.count()))                   # non-NaN elements only
print('len(s.unique()): ' + str(len(s.unique())))       # distinct values; NaN counts as one of them
print('----')
print('s.value_counts(): \n' + str(s.value_counts()))   # occurrences of each non-NaN value

len(s): 10
s.count(): 9
len(s.unique()): 9
----
s.value_counts(): 
1.0     2
6.0     1
5.0     1
4.0     1
3.0     1
2.0     1
0.0     1
10.0    1
dtype: int64


In [15]:
# .shape is an attribute (no parentheses); for a Series it is a one-element tuple.
s.shape

(10,)

#### Alignment via index labels makes actions between Series objects super-intuitive


In [26]:
# Four integers labelled 'a'..'d', listed in alphabetical label order.
s3 = pd.Series([1, 2, 3, 4], index=list('abcd'))
s3

a    1
b    2
c    3
d    4
dtype: int64

In [28]:
# The same label/value pairs as s3, but listed in reverse order.
s4 = pd.Series([4, 3, 2, 1], index=list('dcba'))
s4

d    4
c    3
b    2
a    1
dtype: int64

In [34]:
# Addition pairs values by index label, not by position: s3 and s4 list their
# labels in opposite order, yet each label's two values are added together.
s3 + s4

a    2
b    4
c    6
d    8
dtype: int64

### Data Frames

#### Creation from a NumPy array
pd.DataFrame(np.array([[10,11],[20,21]]))

#### Creation from Pandas Series
pd.DataFrame([pd.Series(np.arange(10,15)), pd.Series(np.arange(15, 20))])

#### Customizing Column and Index names
```python
df = pd.DataFrame(
                 np.array([[0, 1],[2, 3]]),
                 columns=['c1', 'c2'],
                 index=['r1', 'r2']
                 )
```

#### Useful DataFrame functions
- df.shape - tuple of (# of rows, # of columns)
- df.columns - print column names
- df.columns = ['new col 1', 'new col 2'] - change column names
- df.values
- df.index

In [35]:
# Build a DataFrame from a 2-D NumPy array; with no labels supplied, both the
# rows and the columns are numbered from 0.
pd.DataFrame(np.array([[10,11],[20,21]]))

Unnamed: 0,0,1
0,10,11
1,20,21


In [37]:
# NOTE(review): the output recorded under this cell is a two-element int64
# Series (values 10 and 15), which a bare pd.DataFrame() call does not
# produce — the code was presumably edited after it was last run. Re-run the
# cell or restore the original expression so code and output agree.
pd.DataFrame()

0    10
1    15
dtype: int64

In [41]:
# Passing a list of Series: each Series becomes one ROW of the frame,
# so two 5-element Series give a 2 x 5 result.
df1 = pd.DataFrame([
    pd.Series(np.arange(10, 15)),
    pd.Series(np.arange(15, 20)),
])
df1

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [42]:
# .shape on a DataFrame is the tuple (number of rows, number of columns).
df1.shape

(2, 5)

In [43]:
# Name the columns explicitly; the rows keep their default 0, 1 labels.
df = pd.DataFrame(
    np.array([[10, 11], [20, 21]]),
    columns=list('ab'),
)
df

Unnamed: 0,a,b
0,10,11
1,20,21


In [44]:
# Show the column labels of df.
df.columns

Index([u'a', u'b'], dtype='object')

In [46]:
# Assigning to .columns relabels the existing columns in place
# (this mutates df rather than creating a new frame).
df.columns = ['new col 1', 'new col 2']
df

Unnamed: 0,new col 1,new col 2
0,10,11
1,20,21


#### Index labels

In [57]:
# Label both axes: columns are c1/c2 and rows are r1/r2.
df = pd.DataFrame(
    data=np.array([[0, 1], [2, 3]]),
    index=['r1', 'r2'],
    columns=['c1', 'c2'],
)
df

Unnamed: 0,c1,c2
r1,0,1
r2,2,3


In [58]:
# Show the row labels of df.
df.index

Index([u'r1', u'r2'], dtype='object')

In [59]:
# Show df's data as a 2-D NumPy array, without the row/column labels.
df.values

array([[0, 1],
       [2, 3]])

In [64]:
# Two equally long Series; neither is given an index, so both use the
# default 0..4 labels.
s1 = pd.Series(np.arange(1, 6))
s2 = pd.Series(np.arange(6, 11))
# print(...) with a single argument works on both Python 2 and Python 3.
print(s1)
print(s2)
# A dict of Series becomes a DataFrame: the keys turn into column names and
# the values are lined up on the Series' index.
pd.DataFrame({'c1': s1, 'c2': s2})

0    1
1    2
2    3
3    4
4    5
dtype: int64
0     6
1     7
2     8
3     9
4    10
dtype: int64


Unnamed: 0,c1,c2
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


#### A DataFrame aligns itself based upon the indexes

In [67]:
# s3 carries only the labels 1 and 2, so when the three Series are combined
# the other rows of column c3 have nothing to align with and come out as NaN.
s3 = pd.Series(np.arange(12, 14), index=[1, 2])
pd.DataFrame(dict(c1=s1, c2=s2, c3=s3))

Unnamed: 0,c1,c2,c3
0,1,6,
1,2,7,12.0
2,3,8,13.0
3,4,9,
4,5,10,
