# DataFrames

In [1]:
import pandas as pd
import numpy as np

In [2]:
from numpy.random import randn
np.random.seed(55)

In [3]:
dataframe = pd.DataFrame(randn(4,5),index=['P','Q','R','S'],columns=['A','B','C','D','E'])

In [4]:
dataframe

Unnamed: 0,A,B,C,D,E
P,-1.623731,-0.101784,-1.809791,0.262654,0.259953
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


## Indexing and Selectionin dataframes

Let's learn the various methods to select data from a DataFrame

In [5]:
dataframe['A']

P   -1.623731
Q   -0.381086
R    1.656445
S    1.368799
Name: A, dtype: float64

In [None]:
# Passing a list of column names
dataframe[['A','B']]

Unnamed: 0,A,B
P,-1.623731,-0.101784
Q,-0.381086,-0.00229
R,1.656445,-1.189009
S,1.368799,0.258169


DataFrame Columns are just Series

In [None]:
type(dataframe['A'])

pandas.core.series.Series

**Creating a new column:**

In [None]:
dataframe['F'] = dataframe['A'] + dataframe['B']

In [None]:
dataframe

Unnamed: 0,A,B,C,D,E,F
P,-1.623731,-0.101784,-1.809791,0.262654,0.259953,-1.725515
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611,-0.383376
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873,0.467436
S,1.368799,0.258169,0.702352,0.888382,0.72222,1.626967


** Removing Columns**

In [None]:
dataframe.drop('F',axis=1)

Unnamed: 0,A,B,C,D,E
P,-1.623731,-0.101784,-1.809791,0.262654,0.259953
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


In [None]:
dataframe

Unnamed: 0,A,B,C,D,E,F
P,-1.623731,-0.101784,-1.809791,0.262654,0.259953,-1.725515
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611,-0.383376
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873,0.467436
S,1.368799,0.258169,0.702352,0.888382,0.72222,1.626967


In [None]:
# Not inplace unless specified!
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [None]:
dataframe.drop('F',axis=1,inplace=True)

In [None]:
dataframe

Unnamed: 0,A,B,C,D,E
P,-1.623731,-0.101784,-1.809791,0.262654,0.259953
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


Can also drop rows this way:

In [None]:
dataframe.drop('P',axis=0,inplace=True)

In [None]:
dataframe.drop('P',axis=0,inplace=True)
dataframe

Unnamed: 0,A,B,C,D,E
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


** Selecting Rows**

In [None]:
dataframe.loc['Q']

A   -0.381086
B   -0.002290
C    0.341615
D    0.897572
E   -0.361100
Name: Q, dtype: float64

Or select based off of position instead of label 

In [None]:
dataframe.iloc[0]

A   -0.381086
B   -0.002290
C    0.341615
D    0.897572
E   -0.361100
Name: Q, dtype: float64

In [None]:
# Select all the rows and columns
dataframe.iloc[:]

Unnamed: 0,A,B,C,D,E
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


In [None]:
#Skip the last column from the dataframe
dataframe.iloc[:,:-1]

Unnamed: 0,A,B,C,D
Q,-0.381086,-0.00229,0.341615,0.897572
R,1.656445,-1.189009,1.666429,-2.003439
S,1.368799,0.258169,0.702352,0.888382


In [None]:
#Select subset Q and R rows along with B and C columns
dataframe.iloc[:2,1:3]

Unnamed: 0,B,C
Q,-0.00229,0.341615
R,-1.189009,1.666429


In [None]:
# Select subset of rows and column using dataframe.loc
dataframe.loc['Q','A']

-0.3810863829200849

In [None]:
dataframe
dataframe.loc[['Q','R'],['B','C']]

Unnamed: 0,B,C
Q,-0.00229,0.341615
R,-1.189009,1.666429


### Conditional Selection

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [None]:
dataframe

Unnamed: 0,A,B,C,D,E
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


In [None]:
dataframe>0

Unnamed: 0,A,B,C,D,E
Q,False,False,True,True,False
R,True,False,True,False,False
S,True,True,True,True,True


In [None]:
#return the values where the value> 0 else returns NaN
dataframe[dataframe>0]

Unnamed: 0,A,B,C,D,E
Q,,,0.341615,0.897572,
R,1.656445,,1.666429,,
S,1.368799,0.258169,0.702352,0.888382,0.72222


In [None]:
#Returns the complete rows and columns where the value of columns A > 0
dataframe[dataframe['A']>0]

Unnamed: 0,A,B,C,D,E
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


For multiple conditions we can use | and & with parenthesis:

In [None]:
dataframe[(dataframe['A']>0) & (dataframe['B'] > 0)]

Unnamed: 0,A,B,C,D,E
S,1.368799,0.258169,0.702352,0.888382,0.72222


In [None]:
dataframe

Unnamed: 0,A,B,C,D,E
Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
S,1.368799,0.258169,0.702352,0.888382,0.72222


In [None]:
# reset_index() reset to default 0,1...n index
dataframe.reset_index()

Unnamed: 0,index,A,B,C,D,E
0,Q,-0.381086,-0.00229,0.341615,0.897572,-0.3611
1,R,1.656445,-1.189009,1.666429,-2.003439,-0.477873
2,S,1.368799,0.258169,0.702352,0.888382,0.72222
