In [35]:
import numpy as np
import pandas as pd
import plydata as ply
from numpy.random import randn

In [2]:
# Seed NumPy's global random state so the values shown below are reproducible,
# then draw a 5x4 matrix of standard-normal samples.
np.random.seed(101)
rmat = randn(5, 4)

### Make a dataframe

In [6]:
# Wrap the random matrix in a labelled frame: rows a-e, columns w-z.
df = pd.DataFrame(
    rmat,
    index='a b c d e'.split(),
    columns='w x y z'.split(),
)
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


### Grab 1 column

In [10]:
# Indexing with a single column label returns that column as a Series.
df['y']

a    0.907969
b   -0.848077
c    0.528813
d   -0.933237
e    2.605967
Name: y, dtype: float64

### Grab more than 1 column

In [8]:
# Pass a list of column labels inside the brackets to get several columns
# back as a DataFrame.
df[['w', 'y']]

Unnamed: 0,w,y
a,2.70685,0.907969
b,0.651118,-0.848077
c,-2.018168,0.528813
d,0.188695,-0.933237
e,0.190794,2.605967


### Add a new column to the dataframe

In [18]:
# Assigning to a label that does not exist yet adds it as a new column;
# here it holds the element-wise sum of columns w and y.
df['new'] = df['w'] + df['y']
df

Unnamed: 0,w,x,y,z,new
a,2.70685,0.628133,0.907969,0.503826,3.614819
b,0.651118,-0.319318,-0.848077,0.605965,-0.196959
c,-2.018168,0.740122,0.528813,-0.589001,-1.489355
d,0.188695,-0.758872,-0.933237,0.955057,-0.744542
e,0.190794,1.978757,2.605967,0.683509,2.796762


### Drop a column (columns are axis = 1, rows are axis = 0)

In [19]:
# drop() returns a new frame without the 'new' column (axis = 1 means the
# label is looked up among the columns). Rebind df to that result instead of
# using inplace=True, so the frame displayed by earlier cells is not mutated
# behind the reader's back.
df = df.drop('new', axis = 1)
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


### Selecting 1 Row

In [20]:
# Label-based lookup: the row whose index label is 'a', returned as a Series.
df.loc['a']

w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [21]:
# Position-based lookup: the row at integer position 2 (index label 'c').
df.iloc[2]

w   -2.018168
x    0.740122
y    0.528813
z   -0.589001
Name: c, dtype: float64

### Selecting subset of rows

In [22]:
# A list of index labels selects several rows and returns a DataFrame.
df.loc[['a', 'e']]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
e,0.190794,1.978757,2.605967,0.683509


### Select subset of rows and columns

In [25]:
# .loc takes the row labels first, then the column labels.
df.loc[['a','b'], ['y', 'z']]

Unnamed: 0,y,z
a,0.907969,0.503826
b,-0.848077,0.605965


# Conditional Selection

Applying a comparison operator to a DataFrame (or Series) returns a DataFrame (or Series) of boolean results.

In [26]:
# Element-wise comparison: every cell becomes True or False.
df > 0

Unnamed: 0,w,x,y,z
a,True,True,True,True
b,True,False,False,True
c,False,True,True,False
d,True,False,False,True
e,True,True,True,True


In [28]:
# Indexing with a boolean DataFrame keeps the cells where the mask is True
# and leaves the remaining cells as missing values.
df[df > 0]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [29]:
# Comparing a single column yields a boolean Series with one entry per row.
df['w'] > 0

a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool

### From df, Select rows Where w > 0

In [30]:
# A boolean Series inside the brackets filters rows: only the rows where it
# is True are kept.
df[df['w'] > 0]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


### From df, Select col y of the rows Where w > 0

In [31]:
# Row mask and column label in a single .loc call: column y for the rows
# where w is positive.
df.loc[df['w'] > 0, 'y']

a    0.907969
b   -0.848077
d   -0.933237
e    2.605967
Name: y, dtype: float64

### From df, Select row a of col y, of the rows Where w > 0

In [33]:
# Column y for the rows where w is positive, then the entry labelled 'a'.
df.loc[df['w'] > 0, 'y'].loc['a']

0.9079694464765431

In [49]:
# plydata pipeline: keep the rows where w > 0, then keep only column y.
# The scalar is read with an explicit (row label, column label) pair. The
# earlier `.loc['a'][0]` used an integer key on a label-indexed Series, which
# recent pandas deprecates as positional access.
(
    df
    >> ply.query('w > 0')
    >> ply.select('y')
).loc['a', 'y']

0.9079694464765431

### Compound Conditional

In [52]:
# Combine the two element-wise masks with `&`; each comparison is wrapped in
# its own parentheses so it is evaluated before the `&`.
df.loc[(df['w'] > 0) & (df['y'] > 1)]

Unnamed: 0,w,x,y,z
e,0.190794,1.978757,2.605967,0.683509


In [53]:
# plydata version of a compound row filter: both conditions go into a single
# query string, joined with `and`.
df >> ply.query('w > 0 and y > 1')

Unnamed: 0,w,x,y,z
e,0.190794,1.978757,2.605967,0.683509


### Dataframe Summary

In [55]:
# Print the index range, per-column non-null counts and dtypes, and the
# memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, a to e
Data columns (total 4 columns):
w    5 non-null float64
x    5 non-null float64
y    5 non-null float64
z    5 non-null float64
dtypes: float64(4)
memory usage: 360.0+ bytes


In [57]:
# dtype of each column, returned as a Series.
df.dtypes

w    float64
x    float64
y    float64
z    float64
dtype: object

In [58]:
# Descriptive statistics per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,w,x,y,z
count,5.0,5.0,5.0,5.0
mean,0.343858,0.453764,0.452287,0.431871
std,1.681131,1.061385,1.454516,0.594708
min,-2.018168,-0.758872,-0.933237,-0.589001
25%,0.188695,-0.319318,-0.848077,0.503826
50%,0.190794,0.628133,0.528813,0.605965
75%,0.651118,0.740122,0.907969,0.683509
max,2.70685,1.978757,2.605967,0.955057


In [59]:
# NOTE(review): summarize() is called without any aggregation expressions,
# and the recorded output is identical to df, so nothing is actually
# summarized here. Presumably aggregations were meant to be passed in --
# confirm against the plydata documentation.
df >> ply.summarize()

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509
