<a href="https://colab.research.google.com/github/martin-fabbri/colab-notebooks/blob/master/deatling_with_missing_values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dealing with missing values

In [3]:
!pip uninstall scikit-learn -y
!pip install scikit-learn

Uninstalling scikit-learn-0.21.3:
  Successfully uninstalled scikit-learn-0.21.3
Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/2e/d0/860c4f6a7027e00acff373d9f5327f4ae3ed5872234b3cbdd7bcb52e5eff/scikit_learn-0.22-cp36-cp36m-manylinux1_x86_64.whl (7.0MB)
[K     |████████████████████████████████| 7.0MB 2.8MB/s 
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.22


In [12]:
import pandas as pd
import numpy as np
import sklearn 

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Fix NumPy's global random seed for reproducibility.
np.random.seed(0)

# Show which scikit-learn release this run is using.
print(f'scikit-learn {sklearn.__version__}')

scikit-learn 0.22


In [0]:
# Read the NFL play-by-play CSV from the repository and preview three rows.
nfl_dataset_url = 'https://raw.githubusercontent.com/martin-fabbri/colab-notebooks/master/data/nfl_play_by_play_v1.csv'
nfl_df = pd.read_csv(filepath_or_buffer=nfl_dataset_url)
nfl_df.head(n=3)

In [0]:
# Look at five randomly chosen plays rather than only the first rows.
nfl_df.sample(n=5)

In [0]:
# Per-column count of missing cells; display only columns that have any.
missing_values_count = nfl_df.isna().sum()
missing_values_count.loc[missing_values_count.gt(0)]

In [0]:
# how many missing values do we have?
# np.prod replaces the deprecated np.product alias (removed in NumPy 2.0).
total_cells = np.prod(nfl_df.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
total_missing / total_cells * 100

In [0]:
# Missing-value count for the TimeSecs column only.
missing_values_count.filter(items=['TimeSecs'])

In [0]:
# Missing-value count for the PenalizedTeam column only.
missing_values_count.filter(items=['PenalizedTeam'])

In [0]:
# approach # 1 - keep only the rows in which no column is missing
nlf_without_na = nfl_df.dropna(axis=0, how='any')
# info() reports how many rows are left after the drop
nlf_without_na.info()

In [0]:
# remove every column that contains at least one missing value
columns_without_na = nfl_df.dropna(axis=1)
# preview a few rows instead of rendering the whole frame
columns_without_na.head()

In [0]:
# how much data have we lost by dropping columns?
n_cols_original = nfl_df.shape[1]
n_cols_remaining = columns_without_na.shape[1]
print('Columns in original dataset:', n_cols_original)
print('Columns with na dropped:', n_cols_remaining)

In [0]:
# Read the San Francisco building-permits CSV and show five random rows.
sf_building_permits = 'https://raw.githubusercontent.com/martin-fabbri/colab-notebooks/master/data/sf_building_permits.csv'
sf_permits_df = pd.read_csv(filepath_or_buffer=sf_building_permits)
sf_permits_df.sample(n=5)



In [0]:
# Rebinds missing_values_count to the permits data: per-column count of
# missing cells, displaying only columns that have any.
missing_values_count = sf_permits_df.isna().sum()
missing_values_count.loc[missing_values_count.gt(0)]

In [0]:
# how many missing values do we have?
# np.prod replaces the deprecated np.product alias (removed in NumPy 2.0).
total_cells = np.prod(sf_permits_df.shape)
total_missing = missing_values_count.sum()

# percent of data that is missing
total_missing / total_cells * 100

In [0]:
# Take every row and the columns from 'EPA' through 'Season'
# (label slices with .loc include both end points).
subset_nlf_data = nfl_df.loc[:, slice('EPA', 'Season')]
subset_nlf_data.head(n=3)

In [0]:
# Summarise dtypes and non-null counts of the selected columns.
subset_nlf_data.info()

In [0]:
# Missing cells per column in the subset.
subset_nlf_data.isna().sum(axis=0)

In [0]:
# Replace missing cells with 0; filling is element-wise, so previewing first
# and filling afterwards shows the same three rows.
subset_nlf_data.head(3).fillna(value=0)

In [0]:
# replace all NA's with the value that comes directly after it in the same column
# then replace all the remaining na's with 0
# (.bfill() is used because fillna(method='bfill') is deprecated in recent pandas)

subset_nlf_data.bfill(axis=0).fillna(0).isnull().sum()

In [0]:
# Fill missing cells with the column mean (SimpleImputer's default strategy)
# and show the first five rows of the resulting array.
simple_imputer = SimpleImputer(strategy='mean')
data_with_imputed_values = simple_imputer.fit(subset_nlf_data).transform(subset_nlf_data)
data_with_imputed_values[:5]

In [0]:
# Mean-impute every non-object column of the NFL data.
# A column that is entirely missing has no mean, and SimpleImputer (with its
# default settings) leaves such columns out of its output. Dropping them up
# front keeps the imputed array aligned with the column labels, so the result
# can keep its headers instead of falling back to 0..n-1.
nfl_numeric = nfl_df.select_dtypes(exclude=['object']).dropna(axis=1, how='all')

simple_imputer = SimpleImputer()
clean_df = pd.DataFrame(simple_imputer.fit_transform(nfl_numeric),
                        columns=nfl_numeric.columns,
                        index=nfl_numeric.index)
clean_df.head()

## Melbourne Housing Snapshot

In [0]:
# Read the Melbourne housing CSV from the repository and preview it.
melbourne_housing_url = 'https://raw.githubusercontent.com/martin-fabbri/colab-notebooks/master/data/melb_data.csv'
melb_df = pd.read_csv(filepath_or_buffer=melbourne_housing_url)
melb_df.head(n=5)

In [0]:
# Summarise dtypes and non-null counts of the Melbourne housing columns.
melb_df.info()

In [0]:
# Rebinds missing_values_count to the Melbourne data: per-column count of
# missing cells, displaying only columns that have any.
missing_values_count = melb_df.isna().sum()
missing_values_count.loc[missing_values_count.gt(0)]


In [0]:
# Price is the value to predict; every other column is a candidate predictor.
melb_target = melb_df.loc[:, 'Price']
melb_predictors = melb_df.drop(columns=['Price'])
# Keep only the non-object columns as model inputs.
melb_numeric_predictors = melb_predictors.select_dtypes(exclude=['object'])
melb_numeric_predictors.info()

In [0]:
# Hold out 30% of the Melbourne rows for evaluation; the fixed random_state
# keeps the split identical between runs.
melb_split = train_test_split(
    melb_numeric_predictors,
    melb_target,
    test_size=0.3,
    random_state=0,
)
X_train, X_test, y_train, y_test = melb_split
def score_dataset(X_train, X_test, y_train, y_test, random_state=0):
  """Fit a random forest on the training split and return its test-set MAE.

  Args:
    X_train: Training features.
    X_test: Held-out features.
    y_train: Training target values.
    y_test: Held-out target values.
    random_state: Seed passed to RandomForestRegressor so that repeated calls
      on the same data give the same score regardless of cell execution order.

  Returns:
    Mean absolute error of the predictions for X_test measured against y_test.
  """
  model = RandomForestRegressor(random_state=random_state)
  model.fit(X_train, y_train)
  preds = model.predict(X_test)
  return mean_absolute_error(y_test, preds)

## Get Model Score from Dropping Columns with Missing Values

In [0]:
# Find every numeric predictor that has at least one missing value and remove
# those columns from both splits before scoring.
cols_with_missing = melb_numeric_predictors.isnull().sum()
cols_to_drop = cols_with_missing[cols_with_missing > 0].index.tolist()
print('Cols to drop', cols_to_drop)
X_train_clean = X_train.drop(cols_to_drop, axis=1)
X_test_clean = X_test.drop(cols_to_drop, axis=1)
print('Mean Absolute Error from dropping columns with Missing Values:')
print(score_dataset(X_train_clean, X_test_clean, y_train, y_test))

## Get Model Score from Imputation

In [0]:
# Learn the per-column means from the training split only, then apply those
# same means to the test split. Re-fitting on X_test would fill its gaps with
# statistics taken from the evaluation data itself.
simple_imputer = SimpleImputer()

X_train_simple_imputed = simple_imputer.fit_transform(X_train)
X_test_simple_imputed = simple_imputer.transform(X_test)

# No missing-value indicator columns are added here, so the label only
# mentions imputation.
print('Mean Absolute Error from Imputation:')
print(score_dataset(X_train_simple_imputed, X_test_simple_imputed,
                    y_train, y_test))

## Baseline: Dropping Columns

In [0]:
# Read the credit dataset, using its ID column as the row index, and separate
# the target column from the predictors.
credit_url = 'https://raw.githubusercontent.com/martin-fabbri/colab-notebooks/master/data/dataset_train_woed.csv'

credit_df = pd.read_csv(credit_url, index_col='ID')
credit_target = credit_df['target']
credit_predictors = credit_df.drop(['target'], axis=1)

# Only the last expression of a cell is rendered, so the preview goes last.
credit_df.head()

In [3]:
# Single boolean: True when at least one predictor cell is NaN.
nan_columns = np.isnan(credit_predictors).values.any()
nan_columns

True

In [4]:
# Hold out 20% of the credit rows. This rebinds X_train / X_test / y_train /
# y_test, which previously held the Melbourne housing split.
credit_split = train_test_split(
    credit_predictors,
    credit_target,
    test_size=0.2,
    random_state=0,
)
X_train, X_test, y_train, y_test = credit_split
X_train.head()

Unnamed: 0_level_0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
3226,0.692866,-0.079777,0.168463,0.076822,0.052545,-0.659061,-0.406083,1.319915,-0.235542,-0.20091,-0.125056,0.235721,0.221552,0.162276,0.243437,0.111703,0.056681,-0.032395,0.562642,0.098836,0.04647,0.382161,0.361833
11816,-0.466803,-0.079777,0.091416,-0.072942,-0.161146,-0.342752,-0.402003,-0.430029,-0.407307,-0.385158,-0.327928,0.144199,0.055172,0.082028,0.153872,0.111703,0.056681,-0.065217,0.01275,0.024585,-0.086809,-0.026687,0.043184
7339,0.692866,0.115154,0.091416,0.076822,0.052545,2.065423,-0.406083,-0.295297,-0.235542,-0.20091,-0.201465,0.060164,0.221552,0.162276,0.243437,0.101398,0.091469,0.120994,0.174191,0.098836,0.119909,0.150499,0.361833
14981,0.651911,0.115154,0.091416,0.076822,0.041466,2.065423,1.473415,1.319915,1.351763,1.426636,1.284719,0.186779,0.179776,0.219737,0.178366,0.202243,0.226296,0.0431,0.01275,0.028998,0.04647,0.382161,0.23643
27168,0.692866,0.115154,0.091416,0.076822,0.041466,-0.659061,-0.406083,-0.295297,-0.235542,-0.20091,-0.201465,-0.015462,-0.027323,0.030418,-0.030937,-0.383466,-0.102017,-0.032395,-8.7e-05,0.155873,0.119909,-0.248711,0.361833


In [5]:
# Per-column count of missing cells in the full credit frame (target column
# included); display only columns that have any.
missing_values_count = credit_df.isna().sum()
missing_values_count.loc[missing_values_count.gt(0)]

LIMIT_BAL    51
MARRIAGE     25
PAY_0        24
dtype: int64

In [6]:
# Baseline feature set: keep only the training columns that contain no
# missing value at all, then list which columns survived.
X_drop_columns = X_train.dropna(axis=1)
print(X_drop_columns.columns)

Index(['SEX', 'EDUCATION', 'AGE', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
       'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5',
       'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5',
       'PAY_AMT6'],
      dtype='object')


In [7]:
# The models below cannot be fitted on a target with gaps, so check for NaN.
target_has_nulls = np.isnan(y_train).any()
print('Does target column contains nulls?', target_has_nulls)

Does target column contains nulls? False


In [8]:
# Baseline: 10-fold cross-validated score of a LogisticRegressionCV model
# trained on the NaN-free columns only; display the mean over the folds.
baseline_model = LogisticRegressionCV(cv=5)
scores = cross_val_score(baseline_model, X_drop_columns, y_train, cv=10)
scores.mean()

0.8016666666666665

## Impute missing values: Median & KNN

In [10]:
# Fill gaps with each column's median, then report the mean 10-fold
# cross-validated score on the imputed training data.
# NOTE(review): the imputer is fitted on all of X_train before
# cross-validation, so each fold's held-out rows contribute to the medians;
# wrapping imputer and model in a pipeline would avoid that.
imp = SimpleImputer(strategy='median')
X_median_imp = imp.fit_transform(X_train)
median_model = LogisticRegressionCV(cv=5)
scores = cross_val_score(median_model, X_median_imp, y_train, cv=10)
scores.mean()

0.8163333333333334

In [11]:
# Same evaluation as the median imputer, but filling gaps with KNNImputer
# (default settings). The result gets its own name so it neither overwrites
# nor is mistaken for the median-imputed data.
imp = KNNImputer().fit(X_train)
X_knn_imp = imp.transform(X_train)
scores = cross_val_score(LogisticRegressionCV(cv=5), X_knn_imp, y_train, cv=10)
np.mean(scores)

0.8164583333333333

In [14]:
# Chain imputation and the classifier so the imputer is re-fitted inside each
# cross-validation fold. Step names match what make_pipeline would generate.
knn_pipe = Pipeline([
    ('knnimputer', KNNImputer()),
    ('logisticregression', LogisticRegression()),
])
scores = cross_val_score(knn_pipe, X_train, y_train, cv=10)
scores.mean()

0.8166249999999999