## Intro to NA Values

In [4]:
import numpy as np
import pandas as pd

In [6]:
# Load the weekly sales table; the first CSV column (the names) becomes the row index.
sales = pd.read_csv("data/sales.csv", index_col=0)

In [7]:
# Show the frame as loaded: Steven's Thu entry is empty in the file and is read as NaN.
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,,33
Mike,45,9,74,87.0,12
Andi,17,33,54,8.0,29
Paul,87,67,27,45.0,7


In [8]:
# Label-based lookup of the single missing cell; it comes back as NaN.
sales.loc["Steven", "Thu"]

nan

In [10]:
# Blank out Mike's Tue value (row 1, column 1) using Python's None.
# pandas stores it as NaN, which forces the integer Tue column to float64.
# NOTE(review): newer pandas releases deprecate this implicit int -> float
# upcast on assignment - confirm against the pandas version in use.
sales.iloc[1, 1] = None

In [12]:
# Tue is now rendered with decimals (27.0, 33.0, 67.0) and Mike's entry is missing.
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27.0,15,,33
Mike,45,,74,87.0,12
Andi,17,33.0,54,8.0,29
Paul,87,67.0,27,45.0,7


In [14]:
# Blank out Andi's Wed value (row 2, column 2), this time with np.nan.
# The effect matches assigning None: the Wed column becomes float64.
# NOTE(review): newer pandas releases deprecate this implicit int -> float
# upcast on assignment - confirm against the pandas version in use.
sales.iloc[2, 2] = np.nan

In [15]:
# Three columns (Tue, Wed, Thu) now each contain one missing value.
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27.0,15.0,,33
Mike,45,,74.0,87.0,12
Andi,17,33.0,,8.0,29
Paul,87,67.0,27.0,45.0,7


In [17]:
# Summarise non-null counts and dtypes: the columns holding a NaN are float64,
# the untouched Mon and Fri columns remain int64.
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Steven to Paul
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     4 non-null      int64  
 1   Tue     3 non-null      float64
 2   Wed     3 non-null      float64
 3   Thu     3 non-null      float64
 4   Fri     4 non-null      int64  
dtypes: float64(3), int64(2)
memory usage: 352.0+ bytes


## Handling NA Values / Missing Values

In [18]:
# Load the Titanic dataset; no index_col is given, so rows get a default integer index.
titanic = pd.read_csv("data/titanic.csv")

In [22]:
# Count missing values per column (isna() gives a boolean frame; sum() counts the Trues).
titanic.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [21]:
# Complementary view: number of present (non-missing) values per column.
titanic.notna().sum()

survived    891
pclass      891
sex         891
age         714
sibsp       891
parch       891
fare        891
embarked    889
deck        203
dtype: int64

In [23]:
# Select the rows whose embarked value is missing, using a boolean mask.
titanic.loc[titanic["embarked"].isna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
61,1,1,female,38.0,0,0,80.0,,B
829,1,1,female,62.0,0,0,80.0,,B


In [24]:
# Baseline (rows, columns) before any rows or columns are dropped.
titanic.shape

(891, 9)

In [26]:
# Default dropna(): remove every row that has at least one missing value.
# Returns a new frame; titanic itself is not modified.
titanic.dropna().shape

(182, 9)

In [28]:
# how="all": remove only rows in which every value is missing.
# The shape is unchanged, so no such row exists here.
titanic.dropna(how="all").shape

(891, 9)

In [29]:
# Column-wise variant: discard every column that contains at least one missing value
# (age, embarked and deck according to the isna() counts).
titanic.dropna(axis=1, how="any").shape

(891, 6)

In [31]:
# thresh=500: keep only columns with at least 500 non-missing values.
# Only deck (203 present values) falls below that threshold.
titanic.dropna(axis=1, thresh=500).shape

(891, 8)

In [33]:
# Make the column drop permanent: keep only columns that have at least 500
# non-missing values (this removes deck, which has just 203).
# The result is rebound to the name instead of using inplace=True, so the cell
# reads as an explicit transformation and does not mutate the loaded object.
titanic = titanic.dropna(axis=1, thresh=500)

In [34]:
# Verify the drop: deck is gone, while age and embarked still contain missing values.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [35]:
# Inspect the passengers whose age is missing.
titanic.loc[titanic["age"].isna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
5,0,3,male,,0,0,8.4583,Q
17,1,2,male,,0,0,13.0000,S
19,1,3,female,,0,0,7.2250,C
26,0,3,male,,0,0,7.2250,C
28,1,3,female,,0,0,7.8792,Q
...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C
863,0,3,female,,8,2,69.5500,S
868,0,3,male,,0,0,9.5000,S
878,0,3,male,,0,0,7.8958,S


In [36]:
# Average age over the known values (mean() skips NaN by default);
# used below as the fill value for the missing ages.
mean_age = titanic["age"].mean()

In [37]:
# Show the value that will replace the missing ages.
mean_age

29.69911764705882

In [38]:
# Replace the missing ages with the mean age.
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on the Series returned by `titanic.age` is chained
# assignment: pandas 2.x warns about it, and with Copy-on-Write enabled
# (the pandas 3 default) it would leave `titanic` unchanged.
titanic["age"] = titanic["age"].fillna(mean_age)

In [39]:
# Check the column after filling: formerly missing entries now hold the mean
# (for example row 888 shows 29.699118).
titanic["age"]

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64

In [40]:

# Confirm that age has no missing values left; embarked still has two.
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       891 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [41]:
# Read a cleaned copy of the data back from disk and display it.
# NOTE(review): no visible cell writes data/clean_df.csv - presumably it was
# exported from the cleaned titanic frame elsewhere; confirm, or add the export
# step so the notebook runs from a fresh kernel.
pd.read_csv("data/clean_df.csv")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,1,3,female,26.000000,0,0,7.9250,S
3,1,1,female,35.000000,1,0,53.1000,S
4,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S
887,1,1,female,19.000000,0,0,30.0000,S
888,0,3,female,29.699118,1,2,23.4500,S
889,1,1,male,26.000000,0,0,30.0000,C
