# Cleaning and Filtering data in a DataFrame

In [1]:
# import pandas and give it an alias
import pandas as pd

# let's import pyplot from matplotlib
# and give it an alias of plt
from matplotlib import pyplot as plt

In [2]:
# Load the Intel stock prices from the CSV file into a DataFrame
intel_stock_df = pd.read_csv(filepath_or_buffer='data/intel-stock-data.csv')

# Leaving the variable as the last expression of the cell makes
# Jupyter render its content
intel_stock_df

# Note: outside Jupyter nothing is rendered automatically, so
# print the DataFrame explicitly to see its content
# print(intel_stock_df)

Unnamed: 0,Date,Open,High,Low,Close
0,10/10/2017,39.930000,39.950001,39.380001,39.650002
1,11/10/2017,39.480000,39.669998,39.060001,39.299999
2,12/10/2017,39.349998,39.389999,38.980000,39.189999
3,13/10/2017,39.439999,39.810001,39.279999,39.669998
4,16/10/2017,39.709999,39.790001,39.439999,39.759998
...,...,...,...,...,...
248,04/10/2018,48.840000,48.860001,47.619999,48.130001
249,05/10/2018,48.230000,48.340000,46.660000,47.029999
250,08/10/2018,46.959999,47.349998,46.520000,47.029999
251,09/10/2018,47.180000,47.619999,46.540001,46.549999


In [5]:
# Build a boolean mask from the 'Open' column.
# The comparison is applied to every value of the column, so the
# result holds one True/False per row instead of a single bool.
open_greater_than_40 = intel_stock_df['Open'].gt(40)
# Check the element type of the mask: it is boolean
open_greater_than_40.dtype

dtype('bool')

In [6]:
# Use the boolean mask as a row selector: only the rows where the
# mask is True are kept
intel_stock_df.loc[open_greater_than_40]

# Notice that the result contains only the rows whose Open price is
# greater than 40; intel_stock_df itself is left unchanged

Unnamed: 0,Date,Open,High,Low,Close
8,20/10/2017,40.330002,40.450001,40.099998,40.430000
9,23/10/2017,40.549999,41.040001,40.400002,40.830002
10,24/10/2017,41.000000,41.049999,40.720001,40.950001
11,25/10/2017,40.889999,41.060001,40.490002,40.779999
12,26/10/2017,40.910000,41.580002,40.709999,41.349998
...,...,...,...,...,...
248,04/10/2018,48.840000,48.860001,47.619999,48.130001
249,05/10/2018,48.230000,48.340000,46.660000,47.029999
250,08/10/2018,46.959999,47.349998,46.520000,47.029999
251,09/10/2018,47.180000,47.619999,46.540001,46.549999


In [7]:
# Same filter written as a single expression: the mask is built
# inline instead of being stored in a variable first.
# NOTE: this rebinds open_greater_than_40 from the boolean mask to
# the filtered rows.
open_greater_than_40 = intel_stock_df.loc[intel_stock_df['Open'].gt(40)]
open_greater_than_40

Unnamed: 0,Date,Open,High,Low,Close
8,20/10/2017,40.330002,40.450001,40.099998,40.430000
9,23/10/2017,40.549999,41.040001,40.400002,40.830002
10,24/10/2017,41.000000,41.049999,40.720001,40.950001
11,25/10/2017,40.889999,41.060001,40.490002,40.779999
12,26/10/2017,40.910000,41.580002,40.709999,41.349998
...,...,...,...,...,...
248,04/10/2018,48.840000,48.860001,47.619999,48.130001
249,05/10/2018,48.230000,48.340000,46.660000,47.029999
250,08/10/2018,46.959999,47.349998,46.520000,47.029999
251,09/10/2018,47.180000,47.619999,46.540001,46.549999


In [8]:
# Another example: keep only the rows whose High price is below 40
high_less_than_40 = intel_stock_df.loc[intel_stock_df['High'].lt(40)]
high_less_than_40

Unnamed: 0,Date,Open,High,Low,Close
0,10/10/2017,39.93,39.950001,39.380001,39.650002
1,11/10/2017,39.48,39.669998,39.060001,39.299999
2,12/10/2017,39.349998,39.389999,38.98,39.189999
3,13/10/2017,39.439999,39.810001,39.279999,39.669998
4,16/10/2017,39.709999,39.790001,39.439999,39.759998
5,17/10/2017,39.560001,39.869999,39.369999,39.790001


In [9]:
# Filter on two columns at once by combining two conditions with &
# (element-wise AND): keep the rows whose Open price is greater than 39
# and whose Close price is less than 40, matching the variable name.
# Each comparison needs its own parentheses because & binds more
# tightly than > and <.
open_gt_39_close_lt_40 = intel_stock_df[
    (intel_stock_df['Open'] > 39) & (intel_stock_df['Close'] < 40)
]
open_gt_39_close_lt_40

Unnamed: 0,Date,Open,High,Low,Close
0,10/10/2017,39.93,39.950001,39.380001,39.650002
1,11/10/2017,39.48,39.669998,39.060001,39.299999
2,12/10/2017,39.349998,39.389999,38.98,39.189999
3,13/10/2017,39.439999,39.810001,39.279999,39.669998
4,16/10/2017,39.709999,39.790001,39.439999,39.759998
5,17/10/2017,39.560001,39.869999,39.369999,39.790001
