# Ch 2 - Data Preparation Basics

## Segment 1 - Filtering and selecting data

In [1]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

### Selecting and retrieving data

You can specify an index value in two forms:

- a label index (the name given to a row or column), or
- an integer index (the zero-based position of the row or column).

In [2]:
# Build an 8-element Series holding 0..7, labelled 'row 1' through 'row 8'.
row_labels = [f'row {n}' for n in range(1, 9)]
series_obj = Series(np.arange(8), index=row_labels)
series_obj

row 1    0
row 2    1
row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
row 8    7
dtype: int64

In [3]:
# Label-based lookup: fetch the single value stored under the label 'row 7'.
series_obj.loc['row 7']

6

In [4]:
# Integer (positional) selection: take the first and last elements.
# .iloc is used because the Series is indexed by string labels; passing
# integers to plain [] relies on a positional fallback that pandas has
# deprecated, whereas .iloc is always positional.
series_obj.iloc[[0, 7]]

row 1    0
row 8    7
dtype: int64

In [9]:
# Seed the global NumPy generator so the random values below are reproducible.
np.random.seed(25)

# Draw 36 uniform values and arrange them as a 6x6 table with
# 'row i' / 'column i' labels (i = 0..5).
n_rows, n_cols = 6, 6
random_values = np.random.rand(n_rows * n_cols).reshape((n_rows, n_cols))
row_names = [f'row {i}' for i in range(n_rows)]
column_names = [f'column {i}' for i in range(n_cols)]
DF_obj = DataFrame(random_values, index=row_names, columns=column_names)

In [10]:
# Display the whole 6x6 frame as the cell's output.
DF_obj

Unnamed: 0,column 0,column 1,column 2,column 3,column 4,column 5
row 0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
row 1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
row 2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
row 3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
row 4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
row 5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [11]:
# Select rows 2 and 5 and columns 5 and 2 by label.
# The result keeps the order in which the labels are listed,
# so 'column 5' appears before 'column 2'.
wanted_rows = ['row 2', 'row 5']
wanted_columns = ['column 5', 'column 2']
DF_obj.loc[wanted_rows, wanted_columns]

Unnamed: 0,column 5,column 2
row 2,0.699186,0.161985
row 5,0.525819,0.669612


### Data slicing

You can use slicing to select and return a slice of several values from a data set. Slicing uses index values, so you can use the same square brackets when doing data slicing. Note that when you slice by label, both the start and the end label are included in the result.

In [13]:
# Label-based slice: returns every element from 'row 3' through 'row 7',
# with the end label included.
series_obj.loc['row 3':'row 7']

row 3    2
row 4    3
row 5    4
row 6    5
row 7    6
dtype: int64

### Comparing with scalars

In [15]:
# Element-wise comparison against a scalar: yields a boolean frame with
# True wherever the value is below 0.5.
DF_obj.lt(0.5)

Unnamed: 0,column 0,column 1,column 2,column 3,column 4,column 5
row 0,False,False,True,True,True,True
row 1,False,True,False,True,True,True
row 2,True,False,True,False,True,False
row 3,True,False,True,False,True,False
row 4,False,False,True,False,True,True
row 5,True,False,False,True,True,False


In [16]:
# Count how many cells are below 0.5: True counts as non-zero,
# so counting non-zero entries of the boolean mask gives the total.
below_half = DF_obj < 0.5
np.count_nonzero(below_half)

20

### Filtering with scalars

In [17]:
# Boolean filtering: keep only the elements whose value exceeds 6.
greater_than_six = series_obj > 6
series_obj[greater_than_six]

row 8    7
dtype: int64

### Setting values with scalars

In [18]:
# Assign the scalar 8 to three rows at once (this modifies series_obj in place).
# The labels are passed to .loc as a list: a bare comma-separated key such as
# 'row 1', 'row 5', 'row 8' is a single tuple, which pandas does not expand
# into separate labels on a Series with a flat (non-MultiIndex) index.
series_obj.loc[['row 1', 'row 5', 'row 8']] = 8
series_obj

row 1    8
row 2    1
row 3    2
row 4    3
row 5    8
row 6    5
row 7    6
row 8    8
dtype: int64