In [1]:
%autosave 9999

Autosaving every 9999 seconds


In [2]:
import numpy as np
import pandas as pd

# Series

In [3]:
# Three standard-normal draws labelled 'a'..'c'; both the Series and its index get a name
s1 = pd.Series(np.random.randn(3), index=list('abc'), name='S1')
s1.index.name = 'Values'
s1

Values
a   -0.684337
b    0.572485
c    0.405541
Name: S1, dtype: float64

# Series: values, index, label-based selection, boolean masks, and element-wise operations

In [4]:
# Underlying ndarray and the index labels
print(s1.values, s1.index, sep='\n')
# Label-based access: one label gives a scalar, a list of labels gives a sub-Series
single_value = s1.loc['a']
labelled_pair = s1.loc[['b', 'c']]
print(single_value, labelled_pair, sep='\n')
# Boolean mask keeps only the positive entries
is_positive = s1 > 0
print(s1[is_positive])
# Element-wise functions return a Series with the same index and name
print(s1.abs(), np.exp(s1), sep='\n')

[-0.68433733  0.57248485  0.40554115]
Index(['a', 'b', 'c'], dtype='object', name='Values')
-0.684337326901
Values
b    0.572485
c    0.405541
Name: S1, dtype: float64
Values
b    0.572485
c    0.405541
Name: S1, dtype: float64
Values
a    0.684337
b    0.572485
c    0.405541
Name: S1, dtype: float64
Values
a    0.504424
b    1.772666
c    1.500114
Name: S1, dtype: float64


# DataFrame

In [5]:
# From a dict of equal-length arrays: the keys become the column labels
data_raw = {label: np.arange(start, start + 3) for label, start in zip('abc', (1, 4, 7))}
df1 = (
    pd.DataFrame(data_raw, index=['K', 'L', 1])
    .rename_axis('Index')
    .rename_axis('Columns', axis='columns')
)
print(df1)

# From a 2-D array with explicit row and column labels
df2 = (
    pd.DataFrame(data=np.arange(9).reshape(3, 3), index=['K', 'L', 1], columns=list('abc'))
    .rename_axis('Index')
    .rename_axis('Columns', axis='columns')
)
print(df2)

Columns  a  b  c
Index           
K        1  4  7
L        2  5  8
1        3  6  9
Columns  a  b  c
Index           
K        0  1  2
L        3  4  5
1        6  7  8


# DataFrame: values, index, columns, and selection by column label, row position, or row label

In [6]:
# Raw ndarray, row labels and column labels
print(df1.values,df1.index,df1.columns,sep='\n')
# Column selection: one label gives a Series, a list of labels gives a DataFrame
print(df1['a'],df1[['a','b']],sep='\n')
# iloc is positional and 0-based: iloc[1] is the SECOND row (labelled 'L');
# loc is label-based: loc[1] is the row whose index label is the integer 1
print(df1.iloc[1],df1.loc[1],sep='\n')

[[1 4 7]
 [2 5 8]
 [3 6 9]]
Index(['K', 'L', 1], dtype='object', name='Index')
Index(['a', 'b', 'c'], dtype='object', name='Columns')
Index
K    1
L    2
1    3
Name: a, dtype: int64
Columns  a  b
Index        
K        1  4
L        2  5
1        3  6
Columns
a    2
b    5
c    8
Name: L, dtype: int64
Columns
a    3
b    6
c    9
Name: 1, dtype: int64


# DataFrame: add columns

In [7]:
# Assigning a scalar broadcasts it to every row of the new column.
# Use np.nan: the np.NaN alias was removed in NumPy 2.0.
df1['d'] = np.nan
df1['e'] = 16.5
# Assigning an array needs one value per row
df1['f'] = np.arange(1,4)
df1

Columns,a,b,c,d,e,f
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
K,1,4,7,,16.5,1
L,2,5,8,,16.5,2
1,3,6,9,,16.5,3


# DataFrame: add Series as new columns

In [8]:
# This Series only has values for two of df1's three row labels
labels_with_value = ['K', 1]
s1 = pd.Series(['ssK', 'ss1'], index=labels_with_value)
# Column assignment aligns on the index, so the row without a match ('L') becomes NaN
df1['s1'] = s1
df1

Columns,a,b,c,d,e,f,s1
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
K,1,4,7,,16.5,1,ssK
L,2,5,8,,16.5,2,
1,3,6,9,,16.5,3,ss1


# DataFrame: create new columns based on old columns, and dataframe transpose

In [9]:
# Derive a boolean column flagging the rows where 's1' is missing
df1['s1_nan?'] = pd.isnull(df1['s1'])
# Print the frame followed by its transpose (rows and columns swapped)
print(df1, df1.transpose(), sep='\n')

Columns  a  b  c   d     e  f   s1  s1_nan?
Index                                      
K        1  4  7 NaN  16.5  1  ssK    False
L        2  5  8 NaN  16.5  2  NaN     True
1        3  6  9 NaN  16.5  3  ss1    False
Index        K     L      1
Columns                    
a            1     2      3
b            4     5      6
c            7     8      9
d          NaN   NaN    NaN
e         16.5  16.5   16.5
f            1     2      3
s1         ssK   NaN    ss1
s1_nan?  False  True  False


# Reindexing-Series: Change rows, add NaN or fill with values

In [10]:
s1 = pd.Series(range(1, 4), index=list('abc'))
# Existing labels in a new order: a pure reordering
print(s1.reindex(list('bca')))
# 'e' is not an existing label, so it shows up as NaN ...
partial_labels = ['b', 'e']
print(s1.reindex(partial_labels))
# ... unless an explicit fill_value is given
print(s1.reindex(partial_labels, fill_value=0))

s2 = pd.Series(range(1, 4), index=[1, 3, 5])
dense_index = range(6)
# Gaps are NaN by default, or taken from the previous (ffill) / next (bfill) label
print(s2.reindex(dense_index))
for fill_method in ('ffill', 'bfill'):
    print(s2.reindex(dense_index, method=fill_method))

b    2
c    3
a    1
dtype: int64
b    2.0
e    NaN
dtype: float64
b    2
e    0
dtype: int64
0    NaN
1    1.0
2    NaN
3    2.0
4    NaN
5    3.0
dtype: float64
0    NaN
1    1.0
2    1.0
3    2.0
4    2.0
5    3.0
dtype: float64
0    1
1    1
2    2
3    2
4    3
5    3
dtype: int64


# Reindexing-DataFrame: Change rows, columns add NaN or fill with values

In [11]:
df3 = pd.DataFrame(np.arange(9).reshape(3, 3), index=[1, 2, 3], columns=list('abc'))
# Row 4 and column 'e' do not exist in df3
new_rows = [2, 3, 1, 4]
new_cols = ['b', 'c', 'a', 'e']
# New labels are NaN by default
print(df3.reindex(index=new_rows, columns=new_cols))
# fill_value supplies the value for new labels at reindex time
print(df3.reindex(index=new_rows, columns=new_cols, fill_value=0))
# fillna replaces the NaNs after the reindex has already introduced them
print(df3.reindex(index=new_rows, columns=new_cols).fillna(0))

     b    c    a   e
2  4.0  5.0  3.0 NaN
3  7.0  8.0  6.0 NaN
1  1.0  2.0  0.0 NaN
4  NaN  NaN  NaN NaN
   b  c  a  e
2  4  5  3  0
3  7  8  6  0
1  1  2  0  0
4  0  0  0  0
     b    c    a    e
2  4.0  5.0  3.0  0.0
3  7.0  8.0  6.0  0.0
1  1.0  2.0  0.0  0.0
4  0.0  0.0  0.0  0.0


# Dropping-Series: drop elements and NaN

In [12]:
# Use np.nan: the np.NaN alias was removed in NumPy 2.0
s1 = pd.Series([1,2,3,np.nan,5],index = ['a','b','c','d','e'])
print(s1)
# drop removes entries by label
print(s1.drop(['a','e']))
# dropna removes the missing entries
print(s1.dropna())

a    1.0
b    2.0
c    3.0
d    NaN
e    5.0
dtype: float64
b    2.0
c    3.0
d    NaN
dtype: float64
a    1.0
b    2.0
c    3.0
e    5.0
dtype: float64


# Dropping-DataFrame: drop rows, columns and NaN

In [13]:
# Reindexing with an extra row (0) and an extra column ('d') creates an all-NaN row and column
df4 = df3.reindex(index=[0, 1, 2, 3], columns=list('abcd'))
print(df4)
# Drop by row label
print(df4.drop([1, 2], axis=0))
# Drop by column label
print(df4.drop(['a', 'b'], axis='columns'))
# Remove only the rows in which every value is NaN
print(df4.dropna(axis=0, how='all'))
# Remove only the columns in which every value is NaN
print(df4.dropna(axis='columns', how='all'))

     a    b    c   d
0  NaN  NaN  NaN NaN
1  0.0  1.0  2.0 NaN
2  3.0  4.0  5.0 NaN
3  6.0  7.0  8.0 NaN
     a    b    c   d
0  NaN  NaN  NaN NaN
3  6.0  7.0  8.0 NaN
     c   d
0  NaN NaN
1  2.0 NaN
2  5.0 NaN
3  8.0 NaN
     a    b    c   d
1  0.0  1.0  2.0 NaN
2  3.0  4.0  5.0 NaN
3  6.0  7.0  8.0 NaN
     a    b    c
0  NaN  NaN  NaN
1  0.0  1.0  2.0
2  3.0  4.0  5.0
3  6.0  7.0  8.0


# Mask operations-DataFrame: row filtering and element-wise assignment

In [14]:
df = pd.DataFrame(np.arange(9).reshape(3, 3), index=[1, 2, 3], columns=list('abc'))
print(df)
# A boolean Series selects whole rows
rows_to_keep = df['c'] > 2
print(df[rows_to_keep])
# A boolean DataFrame selects individual elements; here they are overwritten in place
below_five = df < 5
df[below_five] = 0
print(df)

   a  b  c
1  0  1  2
2  3  4  5
3  6  7  8
   a  b  c
2  3  4  5
3  6  7  8
   a  b  c
1  0  0  0
2  0  0  5
3  6  7  8


# Function apply: Series-map, DataFrame-applymap and apply

In [15]:
def zero2TF_element(x):
    """Return True when ``x`` compares equal to 0, otherwise False."""
    return bool(x == 0)
print(df)
# Series.map calls the function on every element of one column
print(df['b'].map(zero2TF_element))
# Element-wise over the whole frame. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map, so use the new name when it exists
# and fall back to applymap on older pandas.
elementwise = df.map if hasattr(df, 'map') else df.applymap
print(elementwise(zero2TF_element))
# DataFrame.apply passes whole columns (default) or whole rows (axis=1) to the function
print(df.apply(max))
print(df.apply(max,axis=1))

   a  b  c
1  0  0  0
2  0  0  5
3  6  7  8
1     True
2     True
3    False
Name: b, dtype: bool
       a      b      c
1   True   True   True
2   True   True  False
3  False  False  False
a    6
b    7
c    8
dtype: int64
1    0
2    5
3    8
dtype: int64


# DataFrame: row/column elements operations
By default, arithmetic between DataFrame and Series matches the index of the Series on the DataFrame's columns.

In [16]:
df = pd.DataFrame(np.arange(1,10).reshape(3,3),index=[1,2,3],columns=['a','b','c'])
s1 = pd.Series([1,2,3],index=[1,2,3])        # labelled like df's rows
s2 = pd.Series([1,2,3],index=['a','b','c'])  # labelled like df's columns
print(df,s1,sep='\n')
# Default: the Series index is matched against the DataFrame's columns.
# s1's labels (1, 2, 3) match no column, so the result is the union of labels, all NaN
print(df-s1)
# s2's labels match the columns, so s2 is subtracted from every row
print(df-s2)
# A bare ndarray carries no labels: it is broadcast across the columns by position
print(df-s1.values)

# Row-wise operation: axis=0 matches the Series index against the row labels,
# so s1 is subtracted down each column with label alignment kept
print(df.sub(s1,axis=0))



   a  b  c
1  1  2  3
2  4  5  6
3  7  8  9
1    1
2    2
3    3
dtype: int64
    a   b   c   1   2   3
1 NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN
   a  b  c
1  0  0  0
2  3  3  3
3  6  6  6
   a  b  c
1  0  0  0
2  3  3  3
3  6  6  6
   a  b  c
1  0  1  2
2  2  3  4
3  4  5  6


# Sorting-Series

In [17]:
s1 = pd.Series(range(4), index=list('bcda'))
print(s1)
# Order by label
by_label = s1.sort_index()
print(by_label)
# Order by value, starting from the label-sorted Series
print(by_label.sort_values())

b    0
c    1
d    2
a    3
dtype: int64
a    3
b    0
c    1
d    2
dtype: int64
b    0
c    1
d    2
a    3
dtype: int64


# Sorting-DataFrame

In [18]:
df = pd.DataFrame(np.arange(9, 0, -1).reshape(3, 3), index=[3, 2, 1], columns=['a', 'c', 'b'])
print(df)
# Order the rows by their index labels
print(df.sort_index(axis=0))
# Order the columns by their labels
print(df.sort_index(axis='columns'))
# Order the rows by the values in column 'c'
print(df.sort_values(by='c'))
# Order the columns by the values in the row labelled 1
print(df.sort_values(by=1, axis=1))

   a  c  b
3  9  8  7
2  6  5  4
1  3  2  1
   a  c  b
1  3  2  1
2  6  5  4
3  9  8  7
   a  b  c
3  9  7  8
2  6  4  5
1  3  1  2
   a  c  b
1  3  2  1
2  6  5  4
3  9  8  7
   b  c  a
3  7  8  9
2  4  5  6
1  1  2  3


# Computing descriptive statistics, correlation

In [19]:
df = pd.DataFrame(np.random.randn(10,10),index=range(10),columns=['a','b','c','d','e','f','g','h','i','j'])
# The whole right-hand tuple is evaluated before any column is assigned, so all
# four row statistics are computed from the ten original columns only
df['sum'],df['mean'],df['max'],df['min'] = (
    df.sum(axis=1),
    df.mean(axis=1),
    df.max(axis=1),
    df.min(axis=1),
)
# Only the last expression of a cell is displayed automatically, so the
# summary statistics must be printed explicitly or they are silently discarded
print(df.describe())
# Pairwise correlation between all columns
df.corr()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j,sum,mean,max,min
a,1.0,0.212924,-0.035569,0.294352,0.112748,-0.378507,-0.378807,0.287327,0.209175,-0.005804,0.449891,0.449891,0.349498,0.062234
b,0.212924,1.0,-0.699359,0.381586,-0.349282,-0.269271,-0.381273,-0.03794,-0.382261,0.077848,-0.102834,-0.102834,-0.546884,0.339736
c,-0.035569,-0.699359,1.0,-0.710829,-0.176514,0.513775,0.053662,0.489575,0.508858,0.26649,0.329405,0.329405,0.455182,-0.069089
d,0.294352,0.381586,-0.710829,1.0,0.366708,-0.632922,0.015975,-0.231023,-0.309765,0.056602,0.228516,0.228516,-0.106182,0.486669
e,0.112748,-0.349282,-0.176514,0.366708,1.0,-0.274663,0.231637,-0.247174,-0.07052,-0.322511,0.109336,0.109336,0.518991,-0.107652
f,-0.378507,-0.269271,0.513775,-0.632922,-0.274663,1.0,0.035187,0.010529,-0.067569,-0.27662,-0.247962,-0.247962,-0.278057,-0.17188
g,-0.378807,-0.381273,0.053662,0.015975,0.231637,0.035187,1.0,0.223614,-0.35521,0.148149,0.267293,0.267293,0.229693,0.037697
h,0.287327,-0.03794,0.489575,-0.231023,-0.247174,0.010529,0.223614,1.0,0.018944,0.729278,0.799248,0.799248,0.486501,0.409556
i,0.209175,-0.382261,0.508858,-0.309765,-0.07052,-0.067569,-0.35521,0.018944,1.0,-0.05575,0.140023,0.140023,0.490619,-0.259282
j,-0.005804,0.077848,0.26649,0.056602,-0.322511,-0.27662,0.148149,0.729278,-0.05575,1.0,0.66096,0.66096,0.182547,0.72593


# Hierarchical Indexing to be continued...