<a href="https://colab.research.google.com/github/nsgrn/Python-Intro-Assignment/blob/main/2_Lab1_Assignm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dealing with missing data

## A.  Identifying missing values in tabular data

In [15]:
import pandas as pd
import numpy as np
from io import StringIO
import sys

csv_data = \
'''A,B,C,D,E
1.0,2.0,3.0,4.0, 3.2
5.0,6.0,,8.0,23
10.0,11.0,12.0,,t'''

# If you are using Python 2.7, you need
# to convert the string to unicode:

#if (sys.version_info < (3, 0)):
    #csv_data = unicode(csv_data)

### Step 1: Read the csv file as a pandas dataframe

In [16]:
df = pd.read_csv(StringIO(csv_data))
df

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2
1,5.0,6.0,,8.0,23
2,10.0,11.0,12.0,,t


### Step 2: Check the number of missing values for the columns

In [26]:
df.isna().sum()

Unnamed: 0,0
A,0
B,0
C,1
D,1
E,0


In [23]:
df.isna()

Unnamed: 0,A,B,C,D,E
0,False,False,False,False,False
1,False,False,True,False,False
2,False,False,False,True,False


### Step 3: access the underlying NumPy array via the `values` attribute

In [19]:
df.values

array([[1.0, 2.0, 3.0, 4.0, ' 3.2'],
       [5.0, 6.0, nan, 8.0, '23'],
       [10.0, 11.0, 12.0, nan, 't']], dtype=object)

### Step 4: Remove rows from df that contain missing values

In [34]:
df_1 = df.dropna()
df_1

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2


In [31]:
df_new = df.dropna(axis=0)
df_new

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2


### Step 5: Remove columns from df that contain missing values

In [32]:
df_2 = df.dropna(axis=1)
df_2

Unnamed: 0,A,B,E
0,1.0,2.0,3.2
1,5.0,6.0,23
2,10.0,11.0,t


### Step 6: Only drop rows where all columns are NaN

In [44]:
csv_data_1 = \
'''A,B,C,D,E
1.0,2.0,3.0,4.0, 3.2
5.0,6.0,,8.0,23
10.0,11.0,12.0,,t
,,,,'''
df1 = pd.read_csv(StringIO(csv_data_1))
df1

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2
1,5.0,6.0,,8.0,23
2,10.0,11.0,12.0,,t
3,,,,,


In [49]:
df1.dropna(how="all")

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2
1,5.0,6.0,,8.0,23
2,10.0,11.0,12.0,,t


In [50]:
df1.dropna(axis=1, how="all")

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2
1,5.0,6.0,,8.0,23
2,10.0,11.0,12.0,,t
3,,,,,


### Step 7: Drop rows that have less than 3 real values

In [56]:
df.dropna(thresh=5)

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2


In [57]:
df

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2
1,5.0,6.0,,8.0,23
2,10.0,11.0,12.0,,t


### Step 8: Only drop rows where NaN appear in specific columns (here: 'C')

In [61]:
df.dropna(subset=["C"])

Unnamed: 0,A,B,C,D,E
0,1.0,2.0,3.0,4.0,3.2
2,10.0,11.0,12.0,,t


## B. Imputing missing values

In [None]:
# again: our original array
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

### Step 1: impute missing values via the column mean
`from sklearn.impute import SimpleImputer`

`import numpy as np`

In [70]:
df2 = df.drop("E", axis=1)
df2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [72]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values= np.nan, strategy="mean")
df_imputed = imputer.fit_transform(df.drop("E", axis=1))
df_imputed

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

<br>
<br>