In [1]:
import numpy as np
import pandas as pd

In [2]:
# np.nan is NumPy's floating-point "not a number" value, commonly used to mark a missing value
np.nan

nan

In [3]:
# pd.NA is pandas' own missing-value marker (displayed as <NA>)
pd.NA

<NA>

In [4]:
# pd.NaT ("not a time") marks a missing value where a timestamp of some sort is expected
pd.NaT

NaT

## Note! Typical comparisons should be avoided with Missing Values

* https://towardsdatascience.com/navigating-the-hell-of-nans-in-python-71b12558895b
* https://stackoverflow.com/questions/20320022/why-in-numpy-nan-nan-is-false-while-nan-in-nan-is-true

This is because a missing value is, by definition, unknown: since we don't know either value, we can't say whether two missing values are equal to each other, so `np.nan == np.nan` evaluates to `False`.

In [5]:
# This is False: NaN stands for a value we do not actually know.
# When both sides are missing there is no way to tell whether the two
# unknown values are equal, so the == comparison evaluates to False.
np.nan == np.nan

False

In [6]:
# Identity check: True here because both sides are the very same np.nan object.
# NOTE(review): `is` only tests object identity, so it is not a general NaN test;
# prefer pd.isna(value) or np.isnan(value) when checking arbitrary data.
np.nan is np.nan

True

In [7]:
# bind np.nan to a variable so the check in the next cell works on a name rather than a literal
myvar = np.nan

In [8]:
# Check for a missing value with pd.isna rather than `is np.nan`: the identity
# test only succeeds when myvar is the exact np.nan object, whereas pd.isna
# also recognises other NaN floats, None, pd.NA and pd.NaT.
pd.isna(myvar)

True

## Data

In [9]:
# Upload movie_scores.csv through the Colab file picker.
# google.colab only exists inside Google Colab, so guard the import: outside
# Colab no upload happens and the CSV is expected to already be in the
# working directory for the read_csv call that follows.
try:
    from google.colab import files
except ImportError:
    files = None  # not running in Colab
uploaded = files.upload() if files is not None else {}

Saving movie_scores.csv to movie_scores.csv


In [10]:
# load the movie scores CSV from the working directory into a DataFrame
df = pd.read_csv('movie_scores.csv')

In [11]:
df  # some data is missing: row 1 is entirely empty and row 2 has no movie scores

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


## Checking and Selecting for Null Values

In [12]:
# isnull() returns a same-shaped frame of booleans: True wherever a value is missing
df.isnull()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,False,False,False,False,False,False
1,True,True,True,True,True,True
2,False,False,False,False,True,True
3,False,False,False,False,False,False
4,False,False,False,False,False,False


In [13]:
# notnull() is the inverse of isnull(): True wherever a value is present
df.notnull()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,True,True,True,True,True,True
1,False,False,False,False,False,False
2,True,True,True,True,False,False
3,True,True,True,True,True,True
4,True,True,True,True,True,True


In [14]:
# notnull()/isnull() on a single column give a boolean Series that can be used
# as a mask to select only the rows where that feature is present
df["pre_movie_score"].notnull()

Unnamed: 0,pre_movie_score
0,True
1,False
2,False
3,True
4,True


In [15]:
# use the boolean mask to keep only the rows that have a pre_movie_score
df[df["pre_movie_score"].notnull()]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [16]:
# combine two masks with &: rows where pre_movie_score is missing but first_name is present
df[(df["pre_movie_score"].isnull()) & (df["first_name"].notnull())]

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
2,Hugh,Jackman,51.0,m,,


## Three options for handling missing data:
- Keep Data
- Drop Data
- Fill Data

## Drop Data

In [17]:
# show the full DataFrame again for reference before dropping anything
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [18]:
# print the documentation for dropna (axis, how, thresh, subset, ...)
help(df.dropna)

Help on method dropna in module pandas.core.frame:

dropna(*, axis: 'Axis' = 0, how: 'AnyAll | lib.NoDefault' = <no_default>, thresh: 'int | lib.NoDefault' = <no_default>, subset: 'IndexLabel | None' = None, inplace: 'bool' = False, ignore_index: 'bool' = False) -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA

In [19]:
# with the defaults, dropna() removes every row that contains at least one missing value
df.dropna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [21]:
# thresh=1 keeps every row that has at least one non-null value,
# so only the completely empty row (index 1) is dropped
df.dropna(thresh=1)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [22]:
# with thresh=5, Hugh Jackman's row is dropped as well because
# it does not have at least five non-null values
df.dropna(thresh=5)

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [24]:
df.dropna(axis=0)
# axis=0 -> drop rows that contain missing values (the default)
# axis=1 -> drop columns that contain missing values

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [27]:
# df itself is unchanged: the dropna() results above were displayed, not assigned back
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [26]:
# default dropna(): drops every row that has a missing value in any column
df.dropna()

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [25]:
# subset limits the check to the listed columns: drop a row only when last_name is missing
df.dropna(subset=["last_name"])

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


## Fill Data

In [29]:
# starting point for filling: df still contains all of its missing values
df

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,,,,,,
2,Hugh,Jackman,51.0,m,,
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [30]:
# fillna replaces every missing value with the given value (see help(df.fillna) for the options);
# note that the numeric columns end up holding the string as well
df.fillna("NEW VALUE!")

Unnamed: 0,first_name,last_name,age,sex,pre_movie_score,post_movie_score
0,Tom,Hanks,63.0,m,8.0,10.0
1,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!,NEW VALUE!
2,Hugh,Jackman,51.0,m,NEW VALUE!,NEW VALUE!
3,Oprah,Winfrey,66.0,f,6.0,8.0
4,Emma,Stone,31.0,f,7.0,9.0


In [33]:
# fillna returns a new Series and leaves df untouched, so assign the result
# back to the column to make the change permanent.
df["pre_movie_score"] = df["pre_movie_score"].fillna(0)

In [35]:
# confirm that the missing pre_movie_score values have been replaced by 0
df["pre_movie_score"]

Unnamed: 0,pre_movie_score
0,8.0
1,0.0
2,0.0
3,6.0
4,7.0


In [36]:
# post_movie_score has not been filled yet: rows 1 and 2 are still missing
df["post_movie_score"]

Unnamed: 0,post_movie_score
0,10.0
1,
2,
3,8.0
4,9.0


In [39]:
# mean() skips the missing values: (10 + 8 + 9) / 3 = 9.0
df["post_movie_score"].mean()

np.float64(9.0)

In [41]:
# fill the missing post_movie_score values with the column mean;
# the result is only displayed, not assigned, so df is left as it was
df["post_movie_score"].fillna(df["post_movie_score"].mean())

Unnamed: 0,post_movie_score
0,10.0
1,9.0
2,9.0
3,8.0
4,9.0


In [44]:
# This would fill each column's missing values with that column's own mean in one call.
# NOTE(review): left disabled; df also has text columns (first_name, sex, ...), so
# df.mean() presumably needs numeric_only=True on pandas >= 2.0; confirm before enabling.
# df.fillna(df.mean(numeric_only=True))

## Filling with Interpolation

Be careful with this technique: make sure you really understand whether interpolation is a valid choice for your data. Also note that several interpolation methods are available; the default is linear.

Full Docs on this Method:
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html

In [45]:
# Ticket price per cabin class; the 'business' price is unknown (NaN).
airline_tix = {
    'first': 100,
    'business': np.nan,
    'economy-plus': 50,
    'economy': 30,
}

In [46]:
# build a Series from the dict: the keys become the index, the values the data
ser = pd.Series(airline_tix)

In [47]:
# the unknown 'business' price shows up as a missing value
ser

Unnamed: 0,0
first,100.0
business,
economy-plus,50.0
economy,30.0


In [48]:
# default (linear) interpolation: the missing 'business' value becomes the
# midpoint of its neighbours, (100 + 50) / 2 = 75
ser.interpolate()

Unnamed: 0,0
first,100.0
business,75.0
economy-plus,50.0
economy,30.0
