# Getting started with pandas

In [10]:
import numpy as np
import pandas as pd

## Playing around with Series

To start, create a series from the following dict:

In [11]:
# Map single-letter keys to distribution names. A dict literal is already a
# dict, so the extra dict(...) wrapper is unnecessary.
d = {'a': 'Poisson', 'b': 'Binomial', 'c': 'Geometric', 'd': 'Logistic'}
d

{'a': 'Poisson', 'b': 'Binomial', 'c': 'Geometric', 'd': 'Logistic'}

In [12]:
# Build a Series from the dict: keys become the index labels, values the data.
s = pd.Series(data=d)
s

a      Poisson
b     Binomial
c    Geometric
d     Logistic
dtype: object

Convert the distribution names to lower case:

In [13]:
# Lower-case every entry with the vectorized string accessor rather than a
# per-element lambda; unlike the lambda, .str also passes missing values through.
s = s.str.lower()

Now sort the series by value:

In [14]:
# Rebind to the sorted result instead of mutating in place: the outcome is the
# same, but the statement can be chained and the cell re-run safely.
s = s.sort_values()

In [15]:
# Display the Series after it has been sorted by value.
s

b     binomial
c    geometric
d     logistic
a      poisson
dtype: object

In [16]:
# NOTE(review): repeats the display of the previous cell; `s` is not modified in between.
s

b     binomial
c    geometric
d     logistic
a      poisson
dtype: object

## DataFrame: Indexing

Create a 2-dimensional DataFrame with values from a standard normal distribution, in 6 rows and 3 columns, the column names being 'col1', 'col2' and 'col3', and the rows indexed by month names. 

In [26]:
# 6 x 3 frame of standard-normal draws: rows labelled by month, columns by name.
df = pd.DataFrame(
    data=np.random.randn(6, 3),
    index=['jan', 'feb', 'march', 'april', 'mai', 'june'],
    columns=['col1', 'col2', 'col3'],
)
df

Unnamed: 0,col1,col2,col3
jan,-1.054131,-1.055594,-0.436361
feb,0.53845,0.416906,1.202706
march,-0.93219,0.369762,-2.05827
april,-3.041358,0.740488,-0.59747
mai,-1.108838,-1.341963,0.8674
june,-2.583482,-0.879425,0.957292


Now, display 
- col1 only
- march only
- col1 of march only
- row 2 only (i.e. position 1 with 0-based indexing)
- row 2, col2 only

In [35]:
# Select one column as a Series, using the label-based indexer.
df.loc[:, 'col1']
# or: df['col1']

jan     -1.054131
feb      0.538450
march   -0.932190
april   -3.041358
mai     -1.108838
june    -2.583482
Name: col1, dtype: float64

In [39]:
# Select the row labelled 'march' (all columns) as a Series.
df.loc['march', :]

col1   -0.932190
col2    0.369762
col3   -2.058270
Name: march, dtype: float64

In [41]:
# Single scalar by row label and column label; .at is the scalar accessor.
df.at['march', 'col1']
# or: df.loc['march', 'col1']

-0.93219045293299085

In [43]:
# Second row by integer position (0-based), all columns.
df.iloc[1, :]

col1    0.538450
col2    0.416906
col3    1.202706
Name: feb, dtype: float64

In [46]:
# `.ix` was deprecated in pandas 0.20 and removed in 1.0. To combine a row
# position with a column label, translate the position to its label first.
df.loc[df.index[1], 'col2']
# or, purely positional: df.iloc[1, 1]

0.41690593750084004

Now multiply march by 7, and then add 10 to col2:

In [49]:
# Scale every value in the 'march' row by 7 and write the row back.
df.loc['march'] = df.loc['march'].mul(7)
df

Unnamed: 0,col1,col2,col3
jan,-1.054131,-1.055594,-0.436361
feb,0.53845,0.416906,1.202706
march,-45.677332,18.118332,-100.855233
april,-3.041358,0.740488,-0.59747
mai,-1.108838,-1.341963,0.8674
june,-2.583482,-0.879425,0.957292


In [50]:
# Shift every value in column 'col2' up by 10 and write the column back.
df['col2'] = df['col2'].add(10)
df

Unnamed: 0,col1,col2,col3
jan,-1.054131,8.944406,-0.436361
feb,0.53845,10.416906,1.202706
march,-45.677332,28.118332,-100.855233
april,-3.041358,10.740488,-0.59747
mai,-1.108838,8.658037,0.8674
june,-2.583482,9.120575,0.957292


Now display statistical summary values:

In [52]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,col1,col2,col3
count,6.0,6.0,6.0
mean,-8.821115,12.666457,-16.476944
std,18.100282,7.616097,41.343688
min,-45.677332,8.658037,-100.855233
25%,-2.926889,8.988449,-0.557193
50%,-1.84616,9.768741,0.21552
75%,-1.067808,10.659592,0.934819
max,0.53845,28.118332,1.202706


TODO
- join
- row/col means
- groupby (open question: what kind of df is that?)