## DataFrames
*pd.DataFrame(**data=None**, **index=None**, **columns=None**, dtype=None, copy=False)*

A DataFrame is simply a ***bunch of Series that share the same index***.

#### *With `.loc` / `.iloc` you give the ROW first, then the COLUMN (axis 0, then axis 1)*

In [1]:
import numpy as np
import pandas as pd

In [3]:
from numpy.random import randn

In [15]:
np.random.seed(101)  # fix the seed so randn() below produces the same numbers on every run

In [16]:
# 5x4 frame of random draws: rows are labelled A-E, columns are labelled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=['A', 'B', 'C', 'D', 'E'],
    columns=['W', 'X', 'Y', 'Z'],
)

In [18]:
df  # W, X, Y, Z are the column labels (each column is a Series); A to E are the index (row) labels

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [21]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [22]:
type(df['W'])

pandas.core.series.Series

In [29]:
type(df)

pandas.core.frame.DataFrame

### Call Multiple Columns
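*Use double brackets, i.e. pass a list of column names: `df[['X', 'Z']]`. The result is a DataFrame, not a Series.*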

In [28]:
df[['X','Z']]

Unnamed: 0,X,Z
A,0.628133,0.503826
B,-0.319318,0.605965
C,0.740122,-0.589001
D,-0.758872,0.955057
E,1.978757,0.683509


### Create New Column

In [41]:
# Element-wise sum of two columns (aligned on the index); assigning to a new key adds the column to df itself.
df['newCol'] = df['X']+df['Z']

In [31]:
df

Unnamed: 0,W,X,Y,Z,newCol
A,2.70685,0.628133,0.907969,0.503826,1.131958
B,0.651118,-0.319318,-0.848077,0.605965,0.286647
C,-2.018168,0.740122,0.528813,-0.589001,0.151122
D,0.188695,-0.758872,-0.933237,0.955057,0.196184
E,0.190794,1.978757,2.605967,0.683509,2.662266


### Drop Column

In [32]:
df.drop('newCol',axis=1)  # axis=1 means "drop a column"; this returns a new DataFrame and leaves df itself unchanged


Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [36]:
df

Unnamed: 0,W,X,Y,Z,newCol
A,2.70685,0.628133,0.907969,0.503826,1.131958
B,0.651118,-0.319318,-0.848077,0.605965,0.286647
C,-2.018168,0.740122,0.528813,-0.589001,0.151122
D,0.188695,-0.758872,-0.933237,0.955057,0.196184
E,0.190794,1.978757,2.605967,0.683509,2.662266


### (Row) axis = 0; (Column) axis = 1

In [42]:
# inplace=True mutates df itself and returns None (so the cell shows no output).
# Re-running this cell raises KeyError because 'newCol' is already gone; re-create the column first.
df.drop('newCol',axis=1,inplace=True)

In [44]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Calling Rows
`loc` selects by label; `iloc` selects by integer position (0-based).

In [45]:
df.loc['E']

W    0.190794
X    1.978757
Y    2.605967
Z    0.683509
Name: E, dtype: float64

In [47]:
df.iloc[4]

W    0.190794
X    1.978757
Y    2.605967
Z    0.683509
Name: E, dtype: float64

In [48]:
df.loc['E','Z']

0.68350888553891453

In [49]:
df.loc[['D','E'],['X','Z']] # you call the ROW, then COLUMN

Unnamed: 0,X,Z
D,-0.758872,0.955057
E,1.978757,0.683509


In [50]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Conditional Selection

In [52]:
df > 0 #boolean result

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [53]:
booldf = df > 0

In [54]:
booldf

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [55]:
df [booldf]  # a boolean DataFrame as the mask keeps the shape; cells where the mask is False become NaN

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [56]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [59]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

### Filtering rows with a boolean Series (a condition on one column)

In [58]:
df[df['W'] > 0]  # a boolean Series as the mask keeps only the rows where it is True (row C, where W < 0, is dropped)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [62]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [69]:
resultdf = df[df['W'] > 0]

In [70]:
resultdf

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [71]:
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [89]:
df[df['W'] > 0]['X']  # filter rows, then pick a column, in one line; df.loc[df['W'] > 0, 'X'] is the single-call equivalent

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

#### Key: understand the conditional selection below (filter rows first, then select columns)

In [73]:
df[df['W'] > 0][['X','Y']]

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237
E,1.978757,2.605967


In [76]:
# Conditional selection split into named steps.
boolser = df['W'].gt(0)     # boolean mask: True where W is greater than 0
result = df.loc[boolser]    # keep only the rows where the mask is True
mycols = ['Y', 'X']         # columns to show, in this order
result[mycols]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


### Multiple Conditions Selection

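*and* is `&` (Python's `and` keyword does not work element-wise on a Series)

*or* is `|` (likewise, not the `or` keyword); wrap each condition in parentheses, because `&` and `|` bind more tightly than comparisons such as `>`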

In [78]:
df[(df['W'] > 0) & (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


In [79]:
df[(df['W'] > 0) | (df['Y']>1)]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Reset Index

Resets the index to a default integer index and moves the old index into a new column named `index`.

In [81]:
 df.reset_index()  # returns a new DataFrame; df is unchanged unless you reassign the result or pass inplace=True

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [82]:
# One state code per row of df, in row order (A to E).
newind = ['CA', 'NY', 'WY', 'OR', 'CO']

In [83]:
'CA NY WY OR CO'.split()

['CA', 'NY', 'WY', 'OR', 'CO']

In [84]:
newind

['CA', 'NY', 'WY', 'OR', 'CO']

In [85]:
# Adds a column to df itself; the list is assigned in row order, so its length must equal the number of rows (5).
df['States'] = newind

In [90]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


### Set Index

* `set_index` is not in place by default: it returns a new DataFrame and leaves `df` unchanged


In [87]:
df.set_index('States')  # returns a new DataFrame indexed by States; the old A-E labels are discarded, not kept as a column

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509


In [88]:
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO
