<a href="https://colab.research.google.com/github/omid-sakaki-ghazvini/Data-Mining/blob/master/Missing_Values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub and keep whatever dataset_download returns
# (presumably the local directory that holds the downloaded files — confirm
# against the kagglehub documentation).
omidsakaki1370_data_preparation_example_path = kagglehub.dataset_download('omidsakaki1370/data-preparation-example')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for input_dir, _subdirs, input_files in os.walk('/kaggle/input'):
    input_paths = [os.path.join(input_dir, input_file) for input_file in input_files]
    for input_path in input_paths:
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the credit-risk table.
# Prefer the Kaggle-mounted copy; when /kaggle/input is not available (e.g. the
# notebook is opened outside Kaggle) fall back to the directory returned by the
# kagglehub download cell at the top of the notebook.
# NOTE(review): the fallback assumes the CSV sits at the top level of the
# downloaded dataset directory — confirm.
csv_name = 'credit_risk_dataset.csv'
kaggle_csv = os.path.join('/kaggle/input/data-preparation-example', csv_name)
if os.path.exists(kaggle_csv):
    csv_path = kaggle_csv
else:
    csv_path = os.path.join(omidsakaki1370_data_preparation_example_path, csv_name)

df_example = pd.read_csv(csv_path)

In [None]:
# Preview the first ten rows of the raw table.
df_example.iloc[:10]

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df_example.info()

In [None]:
# Per-column count of missing entries (isna is the canonical name for isnull).
missing_values_count = df_example.isna().sum(axis=0)

missing_values_count

In [None]:
# Boolean mask: True where 'loan_int_rate' is missing.
pd.isna(df_example['loan_int_rate'])

In [None]:
# how many total missing values do we have?
# np.prod replaces np.product, a deprecated alias that NumPy 2.0 removed.
total_cells = np.prod(df_example.shape)  # rows * columns
total_missing = missing_values_count.sum()

In [None]:
# Share of all cells that are missing, expressed as a percentage.
missing_ratio = total_missing / total_cells
percent_missing = missing_ratio * 100
print(percent_missing)

## **Drop missing values**

In [None]:
# Show the table with every row that has at least one missing value removed
# (the result is only displayed; df_example itself is left untouched).
df_example.dropna(axis=0, how='any')

In [None]:
# Keep only the columns that contain no missing value at all.
columns_with_na_dropped = df_example.dropna(axis='columns')
columns_with_na_dropped.head(5)

In [None]:
# Compare the column count before and after dropping columns with missing values.
n_columns_original = df_example.shape[1]
n_columns_kept = columns_with_na_dropped.shape[1]
print("Columns in original dataset: %d \n" % n_columns_original)
print("Columns with na's dropped: %d" % n_columns_kept)

## **Filling in missing values automatically**

### **fillna(0)**

In [None]:
# Replace every missing interest rate with the constant 0.
rate_series = df_example['loan_int_rate']
df_0 = rate_series.fillna(value=0)
missing_values_count = df_0.isna().sum()

print(missing_values_count)
df_0.head(n=10)

In [None]:
# Side-by-side histograms: original column (left) vs. zero-filled copy (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_example['loan_int_rate'].hist(bins=20, ax=ax1)
ax1.set_title("loan_int_rate")

df_0.hist(bins=20, ax=ax2)
ax2.set_title("loan_int_rate_fillna(0)");

### **fillna(mean())**

In [None]:
# Replace every missing interest rate with the mean of the observed rates.
rate_series = df_example['loan_int_rate']
rate_mean = rate_series.mean()
df_mean = rate_series.fillna(rate_mean)
missing_values_count = df_mean.isna().sum()

print(missing_values_count)
df_mean.head(n=10)

In [None]:
# Side-by-side histograms: original column (left) vs. mean-filled copy (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_example['loan_int_rate'].hist(bins=20, ax=ax1)
ax1.set_title("loan_int_rate")

df_mean.hist(bins=20, ax=ax2)
ax2.set_title("loan_int_rate_fillna(mean())");

### **fillna(method = 'ffill')**

In [None]:
# Forward fill: each missing rate takes the last observed value above it.
# Series.ffill() replaces fillna(method='ffill'); the `method` keyword of
# fillna is deprecated in pandas 2.1+.
# A missing value at the very start of the column has nothing to copy from and
# stays NaN, so the count below may be non-zero.
df_ffill = df_example['loan_int_rate'].ffill()
missing_values_count = df_ffill.isnull().sum()

print(missing_values_count)
df_ffill.head(10)

In [None]:
# Side-by-side histograms: original column (left) vs. forward-filled copy (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_example['loan_int_rate'].hist(bins=20, ax=ax1)
ax1.set_title("loan_int_rate")

df_ffill.hist(bins=20, ax=ax2)
ax2.set_title("loan_int_rate_fillna(method = 'ffill')");

### **fillna(method = 'pad')**

In [None]:
# 'pad' is pandas' alias for forward fill, so this produces the same result as
# the ffill example. Series.ffill() replaces fillna(method='pad'); the `method`
# keyword of fillna is deprecated in pandas 2.1+.
df_pad = df_example['loan_int_rate'].ffill()
missing_values_count = df_pad.isnull().sum()

print(missing_values_count)
df_pad.head(10)

In [None]:
# Side-by-side histograms: original column (left) vs. pad-filled copy (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_example['loan_int_rate'].hist(bins=20, ax=ax1)
ax1.set_title("loan_int_rate")

df_pad.hist(bins=20, ax=ax2)
ax2.set_title("loan_int_rate_fillna(method = 'pad')");

### **fillna(method = 'backfill')**

In [None]:
# Backward fill: each missing rate takes the next observed value below it.
# Series.bfill() replaces fillna(method='backfill'); the `method` keyword of
# fillna is deprecated in pandas 2.1+.
# A missing value at the very end of the column has nothing to copy from and
# stays NaN, so the count below may be non-zero.
df_backfill = df_example['loan_int_rate'].bfill()
missing_values_count = df_backfill.isnull().sum()

print(missing_values_count)
df_backfill.head(10)

In [None]:
# Side-by-side histograms: original column (left) vs. back-filled copy (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_example['loan_int_rate'].hist(bins=20, ax=ax1)
ax1.set_title("loan_int_rate")

df_backfill.hist(bins=20, ax=ax2)
ax2.set_title("loan_int_rate_fillna(method = 'backfill')");

## **Imputation of missing values**

### **Simple Imputer**

In [None]:
from sklearn.impute import SimpleImputer
import scipy.sparse as sp

# scikit-learn imputers expect a 2-D array shaped (n_samples, n_features), so
# the column is selected as a one-column frame: every loan is a sample and
# 'loan_int_rate' is the only feature. Laying the values out as a single row
# instead turns each loan into its own feature, and features with no observed
# value are discarded on transform rather than imputed. Selecting the column
# this way also avoids hard-coding the number of rows.
rate_column = df_example[['loan_int_rate']].to_numpy()

mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
rate_imputed = mean_imputer.fit_transform(rate_column)

# The following cell reads `imputer` as a (1, n_rows) array, so hand the
# imputed column back as a single row.
imputer = rate_imputed.T

In [None]:
# Flatten the single-row imputation result into a named 1-D Series.
n_values = imputer.shape[1]
flat_values = imputer.reshape(n_values)
imputer = pd.Series(flat_values, name="loan_int_rate")
print(imputer)

In [None]:
# Side-by-side histograms: original column (left) vs. SimpleImputer output (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_example['loan_int_rate'].hist(bins=20, ax=ax1)
ax1.set_title("loan_int_rate")

imputer.hist(bins=20, ax=ax2)
ax2.set_title("Simple Imputer");

### **Nearest neighbors imputation**

In [None]:
from sklearn.impute import KNNImputer

# KNNImputer expects rows as samples and columns as features, and it finds
# neighbours from the features two rows have in common. The other numeric
# columns are therefore supplied alongside 'loan_int_rate'. (A single row of
# 32581 "features" gives the imputer no neighbours to work with, and the
# all-missing features are simply discarded.)
# NOTE(review): features are used unscaled, so wide-ranged columns dominate the
# distance — consider scaling before imputing.
numeric_features = df_example.select_dtypes(include='number')

knn_imputer = KNNImputer(n_neighbors=2, weights="uniform")
numeric_imputed = knn_imputer.fit_transform(numeric_features)

# The following cell reads `imputer` as a (1, n_rows) array, so return only the
# imputed 'loan_int_rate' column, laid out as a single row.
# NOTE(review): assumes no numeric column is entirely missing; such columns are
# dropped by the imputer and would shift the column position.
rate_position = numeric_features.columns.get_loc('loan_int_rate')
imputer = numeric_imputed[:, [rate_position]].T

print(imputer.shape)

In [None]:
# Flatten the single-row imputation result into a named 1-D Series.
n_values = imputer.shape[1]
flat_values = imputer.reshape(n_values)
imputer = pd.Series(flat_values, name="loan_int_rate")
print(imputer)

In [None]:
# Side-by-side histograms: original column (left) vs. KNNImputer output (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

df_example['loan_int_rate'].hist(bins=20, ax=ax1)
ax1.set_title("loan_int_rate")

imputer.hist(bins=20, ax=ax2)
ax2.set_title("Nearest neighbors imputation");