In [1]:
import numpy as np
import pandas as pd

In [3]:
# prepare df
dates = pd.date_range("20130101", periods=6)

df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.341393,-0.160869,-1.55738,0.373125
2013-01-02,0.549646,1.516624,-0.267676,-1.125485
2013-01-03,-1.174385,-0.314151,1.127541,0.145942
2013-01-04,-0.45956,-0.005213,1.02315,0.284798
2013-01-05,-0.63795,0.767955,-0.058566,0.75221
2013-01-06,-0.015644,-1.315343,-0.541528,1.20327


## Statistic
`df.corr()`  
`df.mean()`  
`df.median()`  
`df.mode()`  
`df.std()`  
`df.count()`  


In [4]:
df.corr() # calculate the relationship between each column

Unnamed: 0,A,B,C,D
A,1.0,0.253291,-0.762873,-0.359491
B,0.253291,1.0,0.035668,-0.775979
C,-0.762873,0.035668,1.0,-0.112289
D,-0.359491,-0.775979,-0.112289,1.0


In [13]:
df.mean()
df.median()
df.mode()
df.std()
df.count()


filter1 = df['D']>0
filter1.any() 

True

## Apply (Applying functions to the data)
`df.apply(function)`

In [16]:
a = np.array([[1,2,3], [4,5,6]])
np.cumsum(a)
np.cumsum(a, axis = 0) # sum over rows for each of the 3 columns
np.cumsum(a, axis = 1) # sum over columns for each of the 2 rows

array([[1, 2, 3],
       [4, 5, 6]])

In [20]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,0.341393,-0.160869,-1.55738,0.373125
2013-01-02,0.891039,1.355755,-1.825057,-0.75236
2013-01-03,-0.283346,1.041604,-0.697516,-0.606418
2013-01-04,-0.742906,1.036392,0.325633,-0.32162
2013-01-05,-1.380856,1.804346,0.267067,0.430589
2013-01-06,-1.3965,0.489004,-0.27446,1.63386


In [26]:
df.apply(lambda x: x.max() - x.min(), axis = 0) # applying self-defined function to df

A    1.724031
B    2.831967
C    2.684921
D    2.328755
dtype: float64

## Histogramming¶
`df.value_counts()`

In [29]:
df['A'].value_counts()

-0.459560    1
 0.341393    1
-0.015644    1
-0.637950    1
-1.174385    1
 0.549646    1
Name: A, dtype: int64

## String methods
`df[].str.lower()`  
`df[].str.upper()`  
`df[].str.split()`  
`df[].str.replace()`  
`df[].str.contains()`  
`df[].str.extract()`

In [35]:
df1 = pd.DataFrame({'a':['a_1','a_2','a_3'], 'b':['b(1)','b(2)','b(3)']})
df1

Unnamed: 0,a,b
0,a_1,b(1)
1,a_2,b(2)
2,a_3,b(3)


In [37]:
df1['a'].str.len()

0    3
1    3
2    3
Name: a, dtype: int64

In [43]:
#The strip() method removes any leading (spaces at the beginning) and trailing (spaces at the end) characters 
#(space is the default leading character to remove)
df1.columns.str.strip()

Index(['a', 'b'], dtype='object')

In [44]:
df1['a'].str.split('_', expand = True)

Unnamed: 0,0,1
0,a,1
1,a,2
2,a,3


In [45]:
df1['a'].str.replace('_', '+')

0    a+1
1    a+2
2    a+3
Name: a, dtype: object

In [47]:
df1['a'].str.contains('2')

0    False
1     True
2    False
Name: a, dtype: bool

In [48]:
df1['b'].str.extract('([a-z]\w{0,})')

Unnamed: 0,0
0,b
1,b
2,b


In [50]:
df1['b'].str.extract('(\d)') 

Unnamed: 0,0
0,1
1,2
2,3


In [51]:
df1['b'].str.extract('.*\((.*)\).*', expand = True)

Unnamed: 0,0
0,1
1,2
2,3
