<a href="https://colab.research.google.com/github/natalia7244/Machine-Learning-Exercises/blob/main/Missing_Values.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Missing values

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored in Drive can be opened by path from this notebook.
from google.colab import drive
drive.mount('/content/drive')

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset from the mounted Drive folder.
data = pd.read_csv('/content/drive/MyDrive/Data_sets/melb_data.csv')

# Target: the Price column.
y = data['Price']

# Predictors: every column except the target ...
melb_predictions = data.drop(columns=['Price'])
# ... restricted to the non-object (numerical) columns, to keep things simple.
X = melb_predictions.select_dtypes(exclude=['object'])

# Split into training (80%) and validation (20%) subsets; the fixed
# random_state makes the split reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)


# Define a function to measure the quality of each approach

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(X_train, X_valid, y_train, y_valid):
  """Fit a random forest on the training data and score it on the validation data.

  Args:
    X_train: training features passed to ``fit``.
    X_valid: validation features passed to ``predict``.
    y_train: training target passed to ``fit``.
    y_valid: validation target the predictions are compared against.

  Returns:
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid``.
  """
  # 10 trees; the fixed random_state makes the results reproducible.
  forest = RandomForestRegressor(n_estimators=10, random_state=0)
  forest.fit(X_train, y_train)
  return mean_absolute_error(y_valid, forest.predict(X_valid))

# Approach 1 - Drop columns with missing values

In [14]:
# Names of the training columns that contain at least one missing value.
columns_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both subsets. The list is derived from the
# training data only, and the same list is applied to the validation data.
reduced_X_train = X_train.drop(columns=columns_with_missing)
reduced_X_valid = X_valid.drop(columns=columns_with_missing)

print("MAE from Approach 1:")
print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))

MAE from Approach 1:
183550.22137772635


# Approach 2 - Imputation

In [23]:
from sklearn.impute import SimpleImputer #fill in missing data

# Approach 2: fill each missing value using SimpleImputer's default strategy
# (the column mean).
my_imputer = SimpleImputer()

# fit_transform learns the fill values from the training data and fills it;
# transform then applies those same training fill values to the validation data.
# The imputer returns a plain array, so the column names and row index are
# restored here when converting back to a DataFrame. (Previously the names were
# assigned to unused variables and never reached the frames.)
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

print("MAE from Approach 2:")
print(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

MAE from Approach 2:
178166.46269899711


# Approach 3 - An Extension to Imputation

In [26]:
# Approach 3: impute as in Approach 2, but first record which values were
# missing. Work on copies so X_train / X_valid are left untouched.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# For every column that had missing values, add a boolean indicator column
# marking the rows where the value was missing.
for col in columns_with_missing:
  X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
  X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Create the imputer once, outside the loop. It was previously re-created on
# every iteration, and was not created at all by this cell when
# columns_with_missing was empty.
my_imputer = SimpleImputer()

# Fit on the training data, then apply the same fill values to the validation
# data. The imputer returns a plain array, so the column names and row index
# are restored here when converting back to a DataFrame.
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus),
    columns=X_valid_plus.columns,
    index=X_valid_plus.index,
)

print("MAE from Approach 3 (An Extension to Imputation):")
print(score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))

MAE from Approach 3 (An Extension to Imputation):
178927.503183954


# Comparing Quality of Each Approach

In [28]:
# Score the three prepared train/validation pairs one after another so the
# results can be compared in a single output.
for label, train_features, valid_features in (
    ("MAE from Approach 1 (Drop columns with missing values):", reduced_X_train, reduced_X_valid),
    ("MAE from Approach 2 (Imputation):", imputed_X_train, imputed_X_valid),
    ("MAE from Approach 3 (An Extension to Imputation):", imputed_X_train_plus, imputed_X_valid_plus),
):
  print(label)
  print(score_dataset(train_features, valid_features, y_train, y_valid))

MAE from Approach 1 (Drop columns with missing values):
183550.22137772635
MAE from Approach 2 (Imputation):
178166.46269899711
MAE from Approach 3 (An Extension to Imputation):
178927.503183954
