# DataFrames

DataFrames are the workhorse of pandas and are directly inspired by the R programming language. We can think of a DataFrame as a bunch of Series objects put together to share the same index. Let's use pandas to explore this topic!

* 1) Creating DataFrames
* 2) Selecting, Creating and Dropping Columns
* 3) Selecting, Creating and Dropping Rows 
* 4) Selecting subset of rows and columns
* 5) Selecting by conditions
* 6) Selecting by data type
* 7) Set New Index
* 8) Reset Index

In [1]:
import pandas as pd
import numpy as np

np.random.seed(101)

## 1) Creating DataFrames

### Using Numpy Array

In [2]:
np.random.randn(5,4)

array([[ 2.70684984,  0.62813271,  0.90796945,  0.50382575],
       [ 0.65111795, -0.31931804, -0.84807698,  0.60596535],
       [-2.01816824,  0.74012206,  0.52881349, -0.58900053],
       [ 0.18869531, -0.75887206, -0.93323722,  0.95505651],
       [ 0.19079432,  1.97875732,  2.60596728,  0.68350889]])

In [3]:
df = pd.DataFrame(np.random.randn(5,4), index=['A', 'B', 'C', 'D', 'E'], columns=['W', 'X', 'Y', 'Z'])

In [67]:
df

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, A to D
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   W        4 non-null      float64
 1   X        4 non-null      float64
 2   Y        4 non-null      float64
 3   Z        4 non-null      float64
 4   my_list  4 non-null      int64  
 5   new      4 non-null      float64
dtypes: float64(5), int64(1)
memory usage: 396.0+ bytes


### Using Dictionary

In [4]:
df2 = pd.DataFrame({'A':[1,2,3],
                  'B':[5,6,7],
                  'C':[1,2,3],
                  'D': [10, 20, 30]}, index=['X', 'Y', 'Z'])

In [5]:
df2

Unnamed: 0,A,B,C,D
X,1,5,1,10
Y,2,6,2,20
Z,3,7,3,30


## 2) Selecting, Creating and Dropping Columns 

### Selecting Columns

In [6]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [8]:
df[['X', 'Y']]

Unnamed: 0,X,Y
A,1.693723,-1.706086
B,0.390528,0.166905
C,0.07296,0.638787
D,-0.75407,-0.943406
E,1.901755,0.238127


In [66]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [67]:
df['X']

A    1.693723
B    0.390528
C    0.072960
D   -0.754070
E    1.901755
Name: X, dtype: float64

In [68]:
# Pass a list of column names
df[['Z','W', 'X']]

Unnamed: 0,Z,W,X
A,-1.159119,0.302665,1.693723
B,0.184502,-0.134841,0.390528
C,0.329646,0.807706,0.07296
D,0.484752,-0.497104,-0.75407
E,1.996652,-0.116773,1.901755


DataFrame Columns are just Series

In [69]:
type(df['W'])

pandas.core.series.Series

In [70]:
df['W'].dtype

dtype('float64')

### Creating a New Column

In [7]:
df

Unnamed: 0,W,X,Y,Z
A,0.302665,1.693723,-1.706086,-1.159119
B,-0.134841,0.390528,0.166905,0.184502
C,0.807706,0.07296,0.638787,0.329646
D,-0.497104,-0.75407,-0.943406,0.484752
E,-0.116773,1.901755,0.238127,1.996652


In [8]:
df['my_list'] = [20, 50, 80, 90, 70]

In [9]:
df

Unnamed: 0,W,X,Y,Z,my_list
A,0.302665,1.693723,-1.706086,-1.159119,20
B,-0.134841,0.390528,0.166905,0.184502,50
C,0.807706,0.07296,0.638787,0.329646,80
D,-0.497104,-0.75407,-0.943406,0.484752,90
E,-0.116773,1.901755,0.238127,1.996652,70


In [10]:
df['my_np_array'] = np.array([200, 500, 800, 900, 700])
df['empty_column'] = np.nan
df['None_column'] = None
df['no values'] = 'NO'

In [11]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,None_column,no values
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,,NO
B,-0.134841,0.390528,0.166905,0.184502,50,500,,,NO
C,0.807706,0.07296,0.638787,0.329646,80,800,,,NO
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,,NO
E,-0.116773,1.901755,0.238127,1.996652,70,700,,,NO


In [76]:
pd.Series([10,20,30,40,50], index=['B', 'A', 'E', 'D', 'C'])

B    10
A    20
E    30
D    40
C    50
dtype: int64

In [12]:
df['my_ser'] = pd.Series([10,20,30,40,50], index=['B', 'A', 'E', 'D', 'C'])

In [13]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,None_column,no values,my_ser
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,,NO,20
B,-0.134841,0.390528,0.166905,0.184502,50,500,,,NO,10
C,0.807706,0.07296,0.638787,0.329646,80,800,,,NO,50
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,,NO,40
E,-0.116773,1.901755,0.238127,1.996652,70,700,,,NO,30


In [79]:
df['W'] + df['Y']

A   -1.403420
B    0.032064
C    1.446493
D   -1.440510
E    0.121354
dtype: float64

In [14]:
df['new'] = df['W'] + df['Y']

In [15]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,None_column,no values,my_ser,new
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,,NO,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,500,,,NO,10,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,800,,,NO,50,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,,NO,40,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,700,,,NO,30,0.121354


In [16]:
df['plus_5'] = df['my_ser'] + 5
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,None_column,no values,my_ser,new,plus_5
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,,NO,20,-1.40342,25
B,-0.134841,0.390528,0.166905,0.184502,50,500,,,NO,10,0.032064,15
C,0.807706,0.07296,0.638787,0.329646,80,800,,,NO,50,1.446493,55
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,,NO,40,-1.44051,45
E,-0.116773,1.901755,0.238127,1.996652,70,700,,,NO,30,0.121354,35


In [18]:
df['my_np_array'] > 600

A    False
B    False
C     True
D     True
E     True
Name: my_np_array, dtype: bool

In [19]:
df['gt_85'] = df['my_np_array'] > 600
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,None_column,no values,my_ser,new,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,,NO,20,-1.40342,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,,,NO,10,0.032064,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,,,NO,50,1.446493,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,,NO,40,-1.44051,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,,,NO,30,0.121354,35,True


### Droping Columns

In [20]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,None_column,no values,my_ser,new,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,,NO,20,-1.40342,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,,,NO,10,0.032064,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,,,NO,50,1.446493,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,,NO,40,-1.44051,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,,,NO,30,0.121354,35,True


In [21]:
df_1 = df.drop('None_column', axis=1)

In [22]:
df_1

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,no values,my_ser,new,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,NO,20,-1.40342,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,,NO,10,0.032064,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,,NO,50,1.446493,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,NO,40,-1.44051,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,,NO,30,0.121354,35,True


In [23]:
# Not inplace unless specified!
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,None_column,no values,my_ser,new,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,,NO,20,-1.40342,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,,,NO,10,0.032064,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,,,NO,50,1.446493,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,,NO,40,-1.44051,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,,,NO,30,0.121354,35,True


In [24]:
df.drop('None_column', axis=1, inplace=True)

In [26]:
df

Unnamed: 0,W,X,Y,Z,my_list,my_np_array,empty_column,no values,my_ser,new,plus_5,gt_85
A,0.302665,1.693723,-1.706086,-1.159119,20,200,,NO,20,-1.40342,25,False
B,-0.134841,0.390528,0.166905,0.184502,50,500,,NO,10,0.032064,15,False
C,0.807706,0.07296,0.638787,0.329646,80,800,,NO,50,1.446493,55,True
D,-0.497104,-0.75407,-0.943406,0.484752,90,900,,NO,40,-1.44051,45,True
E,-0.116773,1.901755,0.238127,1.996652,70,700,,NO,30,0.121354,35,True


In [27]:
df.drop(columns = ['my_ser', 'my_np_array', 'gt_85', 'plus_5'], axis=1, inplace=True)

In [28]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354


## 3) Selecting, Creating and Dropping Rows 

### Selecting Rows

In [29]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354


In [30]:
# df['W']
columns = ['W', 'Y']
df[columns]

Unnamed: 0,W,Y
A,0.302665,-1.706086
B,-0.134841,0.166905
C,0.807706,0.638787
D,-0.497104,-0.943406
E,-0.116773,0.238127


In [31]:
# location
df.loc['A']


W               0.302665
X               1.693723
Y              -1.706086
Z              -1.159119
my_list               20
empty_column         NaN
no values             NO
new             -1.40342
Name: A, dtype: object

In [96]:
df.loc['D']

W              -0.497104
X               -0.75407
Y              -0.943406
Z               0.484752
my_list               90
empty_column         NaN
no values             NO
new             -1.44051
Name: D, dtype: object

Or select based off of position instead of label 

In [97]:
# index location
df.iloc[3]

W              -0.497104
X               -0.75407
Y              -0.943406
Z               0.484752
my_list               90
empty_column         NaN
no values             NO
new             -1.44051
Name: D, dtype: object

In [98]:
df.loc[['A', 'C']]

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493


In [99]:
df.iloc[[3, 4]]

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354


In [32]:
df.iloc[2:5]

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354


In [101]:
df.loc['A']

W               0.302665
X               1.693723
Y              -1.706086
Z              -1.159119
my_list               20
empty_column         NaN
no values             NO
new             -1.40342
Name: A, dtype: object

In [36]:
# df.loc[('C', 'Z')]
df.loc['C', 'Z']

0.32964629880452445

In [34]:
df.loc['C']['Z']

0.32964629880452445

In [104]:
df.loc['A', 'W']

0.3026654485851825

In [105]:
df['W']

A    0.302665
B   -0.134841
C    0.807706
D   -0.497104
E   -0.116773
Name: W, dtype: float64

In [106]:
df['W'].loc['A']

0.3026654485851825

In [37]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354


### Creating a New Row

In [108]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354


In [38]:
df.loc['V'] = [10, 20, 50, 80, 90, np.nan, 'NO', 0.23]

In [39]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354
V,10.0,20.0,50.0,80.0,90,,NO,0.23


In [40]:
df.loc['H'] = df.loc['A'] + df.loc['B']

In [41]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354
V,10.0,20.0,50.0,80.0,90,,NO,0.23
H,0.167825,2.084251,-1.539181,-0.974618,70,,NONO,-1.371357


### Dropping Rows

In [42]:
df.drop(index = 'H', axis=0)

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354
V,10.0,20.0,50.0,80.0,90,,NO,0.23


In [43]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354
V,10.0,20.0,50.0,80.0,90,,NO,0.23
H,0.167825,2.084251,-1.539181,-0.974618,70,,NONO,-1.371357


In [44]:
df.drop('H', axis=0, inplace=True)

In [45]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051
E,-0.116773,1.901755,0.238127,1.996652,70,,NO,0.121354
V,10.0,20.0,50.0,80.0,90,,NO,0.23


In [46]:
df.drop(index = ['E', 'V'], axis=0, inplace=True)

In [47]:
df

Unnamed: 0,W,X,Y,Z,my_list,empty_column,no values,new
A,0.302665,1.693723,-1.706086,-1.159119,20,,NO,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,,NO,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,,NO,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,,NO,-1.44051


## 4) Selecting subset of rows and columns

In [119]:
df.loc['B']['Y']

0.16690463609281317

In [48]:
df.loc['B'  , 'Y']

0.16690463609281317

In [49]:
# df.loc[['A','B']][['W', 'Y']]

df.loc[   ['A', 'B']  , ['W', 'X']   ]

Unnamed: 0,W,X
A,0.302665,1.693723
B,-0.134841,0.390528


In [52]:
df[['W', 'X']].loc[['A','B']]

Unnamed: 0,W,X
A,0.302665,1.693723
B,-0.134841,0.390528


## 5) Selecting by conditions

An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [53]:
df = df[['W', 'X', 'Y', 'Z', 'my_list', 'new']]
df

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [56]:
df > 0

Unnamed: 0,W,X,Y,Z,my_list,new
A,True,True,False,False,True,False
B,False,True,True,True,True,True
C,True,True,True,True,True,True
D,False,False,False,True,True,False


In [125]:
df[df > 0]

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,,,20,
B,,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,,,,0.484752,90,


In [126]:
df['W'] > 0

A     True
B    False
C     True
D    False
Name: W, dtype: bool

In [57]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
C,0.807706,0.07296,0.638787,0.329646,80,1.446493


In [128]:
df[df['W'] > 0]['Y']

A   -1.706086
C    0.638787
Name: Y, dtype: float64

In [58]:
df[df['W'] > 0][['Y','X']]

Unnamed: 0,Y,X
A,-1.706086,1.693723
C,0.638787,0.07296


In [130]:
df.loc['A', 'Y']

-1.7060859307350775

In [131]:
df

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [59]:
df.loc[ ['A', 'B']  ,  ['X', 'Y', 'W'] ]

Unnamed: 0,X,Y,W
A,1.693723,-1.706086,0.302665
B,0.390528,0.166905,-0.134841


In [61]:
df.loc[df['W'] > 0 , ['X', 'Y']]

# df.loc[ df['W'] > 0 , ['Y','X']]   #  .loc[['A','C']]['X']

Unnamed: 0,X,Y
A,1.693723,-1.706086
C,0.07296,0.638787


In [134]:
df

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


**For two conditions you can use | and & with parenthesis:**

In [135]:
(df['W'] > 0) & (df['Y'] < 1)

A     True
B    False
C     True
D    False
dtype: bool

In [136]:
(df['W'] > 0) | (df['Y'] < 1)

A    True
B    True
C    True
D    True
dtype: bool

In [137]:
~(df['W'] > 0) & (df['Y'] < 1)


A    False
B     True
C    False
D     True
dtype: bool

In [62]:
# df.loc[  (df['W'] > 0) & (df['Y'] < 1) & (df['Z'] < 1)]

df.loc[ (df['W'] > 0) | (df['Y'] < 1)]

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [63]:
df.loc[((df['W'] > 0) & (df['my_list'] > 50)) | (df['Y'] < 0) ]

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [139]:
df

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [140]:
df[ (df['W'] < 0) | ~(df['Y'] < 1)]

Unnamed: 0,W,X,Y,Z,my_list,new
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


**isin**

In [141]:
df

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [None]:
df.loc[(df['my_list'] == 20) | (df['my_list'] == 50)]

In [64]:
df.loc[(df['my_list'].isin([20, 50]))] 

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064


In [143]:
df.loc[~(df['my_list'].isin([20, 70]))]

Unnamed: 0,W,X,Y,Z,my_list,new
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


**between**

In [144]:
df.loc[  (df['my_list'].between(30, 80))  ]

Unnamed: 0,W,X,Y,Z,my_list,new
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493


In [145]:
df.loc[  ~(df['my_list'].between(30, 80))  ]

Unnamed: 0,W,X,Y,Z,my_list,new
A,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [146]:
df[~(df['my_list'].isin([20, 70]))]

Unnamed: 0,W,X,Y,Z,my_list,new
B,-0.134841,0.390528,0.166905,0.184502,50,0.032064
C,0.807706,0.07296,0.638787,0.329646,80,1.446493
D,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


**Quiz**

In [174]:
my_df = pd.DataFrame(np.random.randn(100, 4), columns=['X', 'Y', 'Z', 'S'])
my_df

Unnamed: 0,X,Y,Z,S
0,1.099550,-1.025499,1.132170,-0.918222
1,-1.290992,-0.008876,-0.072111,1.292545
2,-0.214258,0.324996,-1.019374,-1.243351
3,-0.366681,0.589336,2.619668,-1.064573
4,1.826460,-0.287396,1.431274,-1.486898
...,...,...,...,...
95,-0.212220,1.330096,0.683211,-0.008983
96,0.429693,0.662408,1.223457,0.842224
97,2.133137,1.159285,-1.738897,0.134020
98,1.040047,-0.200309,-1.560216,0.050336


In [179]:
my_df[(my_df['X'] > 0) & (my_df['Z'] < 0)]['S']

5    -0.276139
15    1.119649
16   -0.662194
18   -0.767988
20    2.020620
23   -1.382815
24   -0.890814
29   -1.404220
37    1.547742
42   -1.019750
45   -0.406914
46   -0.030757
53   -0.196932
59   -0.217813
61   -0.406113
62   -0.276797
65   -1.479348
68    1.967961
69   -1.674672
71   -1.280880
75   -1.447798
83    0.706543
85   -0.360401
86    0.841790
91    0.308671
97    0.134020
98    0.050336
Name: S, dtype: float64

In [178]:
my_df[(my_df['X'] > 0) & (my_df['Z'] < 0)]['S'].iloc[3:9]

18   -0.767988
20    2.020620
23   -1.382815
24   -0.890814
29   -1.404220
37    1.547742
Name: S, dtype: float64

In [175]:
my_df[(my_df['X'] > 0) & (my_df['Z'] < 0)]['S'].iloc[3:9]

18   -0.767988
20    2.020620
23   -1.382815
24   -0.890814
29   -1.404220
37    1.547742
Name: S, dtype: float64

## 6) Selecting by data type

**Convert columns to best possible dtypes use `convert_dtypes()` or `astype()`** 

In [71]:
df3 = pd.DataFrame(
    {
        "integers": pd.Series([1, 2, 3], dtype=np.dtype("int32")),
        "strings": pd.Series(["x", "y", "z"], dtype=np.dtype("O")),
        "booleans": pd.Series([True, False, np.nan], dtype=np.dtype("O")),
        "strings_2": pd.Series(["h", "i", np.nan], dtype=np.dtype("O")),
        "integers_2": pd.Series([10, 30, 20], dtype=np.dtype("float")),
        "floats": pd.Series([np.nan, 100.5, 200], dtype=np.dtype("float")),
        "dates": pd.Series(['30/7/1993 12:26:13', '30/7/1993 12:26:13', '30/7/1993 12:26:13'], dtype=np.dtype("O"))
    }
)

In [72]:
df3

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats,dates
0,1,x,True,h,10.0,,30/7/1993 12:26:13
1,2,y,False,i,30.0,100.5,30/7/1993 12:26:13
2,3,z,,,20.0,200.0,30/7/1993 12:26:13


In [73]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      int32  
 1   strings     3 non-null      object 
 2   booleans    2 non-null      object 
 3   strings_2   2 non-null      object 
 4   integers_2  3 non-null      float64
 5   floats      2 non-null      float64
 6   dates       3 non-null      object 
dtypes: float64(2), int32(1), object(4)
memory usage: 284.0+ bytes


In [74]:
df3 = df3.convert_dtypes()
df3

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats,dates
0,1,x,True,h,10,,30/7/1993 12:26:13
1,2,y,False,i,30,100.5,30/7/1993 12:26:13
2,3,z,,,20,200.0,30/7/1993 12:26:13


In [75]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      Int32  
 1   strings     3 non-null      string 
 2   booleans    2 non-null      boolean
 3   strings_2   2 non-null      string 
 4   integers_2  3 non-null      Int64  
 5   floats      2 non-null      Float64
 6   dates       3 non-null      string 
dtypes: Float64(1), Int32(1), Int64(1), boolean(1), string(3)
memory usage: 275.0 bytes


In [76]:
df3['integers_2'] = df3['integers_2'].astype('int32')

In [77]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      Int32  
 1   strings     3 non-null      string 
 2   booleans    2 non-null      boolean
 3   strings_2   2 non-null      string 
 4   integers_2  3 non-null      int32  
 5   floats      2 non-null      Float64
 6   dates       3 non-null      string 
dtypes: Float64(1), Int32(1), boolean(1), int32(1), string(3)
memory usage: 260.0 bytes


In [187]:
df3['integers'] = df3['integers'].astype('int32')

In [188]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   integers    3 non-null      int32  
 1   strings     3 non-null      string 
 2   booleans    2 non-null      boolean
 3   strings_2   2 non-null      string 
 4   integers_2  2 non-null      Int64  
 5   floats      2 non-null      Float64
 6   dates       3 non-null      string 
dtypes: Float64(1), Int64(1), boolean(1), int32(1), string(3)
memory usage: 272.0 bytes


In [189]:
df3

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats,dates
0,1,x,True,h,10.0,,30/7/1993 12:26:13
1,2,y,False,i,,100.5,30/7/1993 12:26:13
2,3,z,,,20.0,200.0,30/7/1993 12:26:13


In [159]:
df3['dates'] = pd.to_datetime(df3['dates'], format="%Y%m/%d %H:%M:%S")
df3

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats,dates
0,1,x,True,h,10.0,,1993-07-30 12:26:13
1,2,y,False,i,,100.5,1993-07-30 12:26:13
2,3,z,,,20.0,200.0,1993-07-30 12:26:13


In [160]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   integers    3 non-null      int32         
 1   strings     3 non-null      string        
 2   booleans    2 non-null      boolean       
 3   strings_2   2 non-null      string        
 4   integers_2  2 non-null      Int64         
 5   floats      2 non-null      Float64       
 6   dates       3 non-null      datetime64[ns]
dtypes: Float64(1), Int64(1), boolean(1), datetime64[ns](1), int32(1), string(2)
memory usage: 272.0 bytes


**Select with data type**

In [161]:
df3.select_dtypes(include=['number'])

Unnamed: 0,integers,integers_2,floats
0,1,10.0,
1,2,,100.5
2,3,20.0,200.0


In [162]:
df3.select_dtypes(include=['int32'])

Unnamed: 0,integers
0,1
1,2
2,3


In [163]:
df3.select_dtypes(include=['int64'])

Unnamed: 0,integers_2
0,10.0
1,
2,20.0


In [164]:
df3.select_dtypes(include=['float32'])

0
1
2


In [165]:
df3.select_dtypes(include=['float64'])

Unnamed: 0,floats
0,
1,100.5
2,200.0


In [166]:
df3.select_dtypes(include=['bool'])

Unnamed: 0,booleans
0,True
1,False
2,


In [167]:
df3.select_dtypes(include=['string'])

Unnamed: 0,strings,strings_2
0,x,h
1,y,i
2,z,


In [168]:
df3.select_dtypes(include=['datetime', 'string'])

Unnamed: 0,strings,strings_2,dates
0,x,h,1993-07-30 12:26:13
1,y,i,1993-07-30 12:26:13
2,z,,1993-07-30 12:26:13


In [169]:
df3.select_dtypes(exclude=['number'])

Unnamed: 0,strings,booleans,strings_2,dates
0,x,True,h,1993-07-30 12:26:13
1,y,False,i,1993-07-30 12:26:13
2,z,,,1993-07-30 12:26:13


In [170]:
df3.select_dtypes(exclude=['datetime'])

Unnamed: 0,integers,strings,booleans,strings_2,integers_2,floats
0,1,x,True,h,10.0,
1,2,y,False,i,,100.5
2,3,z,,,20.0,200.0


## 7) Set New Index

Let's discuss some more features of indexing, including resetting the index or setting it something else.

In [86]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [87]:
newind = ['CA', 'NY', 'WY', 'OR']

In [88]:
df['States'] = newind

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['States'] = newind


In [89]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new,States
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342,CA
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064,NY
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493,WY
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051,OR


In [90]:
df.set_index('States')

Unnamed: 0_level_0,W,X,Y,Z,my_list,new
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [91]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new,States
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342,CA
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064,NY
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493,WY
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051,OR


In [92]:
df.set_index('States', inplace=True, drop=False)

In [93]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new,States
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342,CA
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064,NY
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493,WY
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051,OR


In [199]:
df.set_index('Y', inplace=True, drop=False)

In [200]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new,States
Y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
-1.706086,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342,CA
0.166905,-0.134841,0.390528,0.166905,0.184502,50,0.032064,NY
0.638787,0.807706,0.07296,0.638787,0.329646,80,1.446493,WY
-0.943406,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051,OR


In [201]:
df.set_index('W', inplace=True)

In [202]:
df

Unnamed: 0_level_0,X,Y,Z,my_list,new,States
W,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.302665,1.693723,-1.706086,-1.159119,20,-1.40342,CA
-0.134841,0.390528,0.166905,0.184502,50,0.032064,NY
0.807706,0.07296,0.638787,0.329646,80,1.446493,WY
-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051,OR


In [98]:
df.set_index('States', inplace=True)

In [99]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


## 8) Reset Index

resetting index to numbers

In [100]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [101]:
df.reset_index()

Unnamed: 0,States,W,X,Y,Z,my_list,new
0,CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
1,NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
2,WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
3,OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [102]:
df

Unnamed: 0_level_0,W,X,Y,Z,my_list,new
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [103]:
df.reset_index(inplace=True)

In [104]:
df

Unnamed: 0,States,W,X,Y,Z,my_list,new
0,CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
1,NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
2,WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
3,OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


In [105]:
df['index'] = list(range(1, 5))
df

Unnamed: 0,States,W,X,Y,Z,my_list,new,index
0,CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342,1
1,NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064,2
2,WY,0.807706,0.07296,0.638787,0.329646,80,1.446493,3
3,OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051,4


In [106]:
df.set_index('index')

Unnamed: 0_level_0,States,W,X,Y,Z,my_list,new
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,CA,0.302665,1.693723,-1.706086,-1.159119,20,-1.40342
2,NY,-0.134841,0.390528,0.166905,0.184502,50,0.032064
3,WY,0.807706,0.07296,0.638787,0.329646,80,1.446493
4,OR,-0.497104,-0.75407,-0.943406,0.484752,90,-1.44051


# Great Job!