<a href="https://colab.research.google.com/github/martharegina/machinelearning/blob/main/intermediate_machinelearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Load the housing CSV and preview the first rows.
# NOTE(review): the path sits under /content, which looks like the Colab runtime
# filesystem (the notebook carries a Colab badge) — confirm the file is uploaded there.
DATA_PATH = '/content/melbourne_data.csv'
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [None]:
# Print per-column dtypes and non-null counts; columns whose non-null count is
# below the row count are the ones that contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [None]:
# Target: the sale price column.
y = data['Price']

# Predictors: everything except the target, restricted to non-object dtypes
# so that only numeric columns reach the model.
features = data.drop(columns=['Price'])
X = features.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function to compare different approaches

def score_dataset(train_X, val_X, train_y, val_y, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    train_X, val_X
        Feature tables for the training and validation splits. They are passed
        straight to ``RandomForestRegressor.fit`` / ``predict``.
    train_y, val_y
        Target values matching ``train_X`` and ``val_X`` row for row.
    n_estimators : int, default 10
        Number of trees in the forest. The default reproduces the value that
        was previously hard-coded.
    random_state : int, default 0
        Seed forwarded to the forest so repeated calls give the same score.
        The default reproduces the value that was previously hard-coded.

    Returns
    -------
    float
        Mean absolute error of the model's predictions on ``val_X`` against ``val_y``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(train_X, train_y)
    predictions = model.predict(val_X)
    return mean_absolute_error(val_y, predictions)

In [None]:
# Approach 1: Drop columns with missing values

# Names of the training columns that contain at least one missing value.
cols_with_missing = train_X.columns[train_X.isnull().any()].tolist()

# Remove the same set of columns from both splits.
reduced_train_X = train_X.drop(columns=cols_with_missing)
reduced_val_X = val_X.drop(columns=cols_with_missing)

print("MAE from Approach 1 (Drop columns with missing values):")
approach_1_mae = score_dataset(reduced_train_X, reduced_val_X, train_y, val_y)
print(approach_1_mae)

MAE from Approach 1 (Drop columns with missing values):
183550.22137772635


In [None]:
# Approach 2: Imputation

from sklearn.impute import SimpleImputer

# Default SimpleImputer settings are used (per the scikit-learn docs this fills
# each column's missing entries with that column's mean).
my_imputer = SimpleImputer()

# The imputer is fitted on the training split only and then reused for
# validation, so validation rows do not influence the fill values.
# By default the imputer returns plain NumPy arrays, so both the column labels
# and the original row index are restored here. Without the index the frames
# would get a fresh RangeIndex and stop lining up with train_y / val_y by label.
imputed_train_X = pd.DataFrame(
    my_imputer.fit_transform(train_X),
    columns=train_X.columns,
    index=train_X.index,
)
imputed_val_X = pd.DataFrame(
    my_imputer.transform(val_X),
    columns=val_X.columns,
    index=val_X.index,
)

print("MAE from Approach 2 (Imputation):")
print(score_dataset(imputed_train_X, imputed_val_X, train_y, val_y))

MAE from Approach 2 (Imputation):
178166.46269899711


In [None]:
# Approach 3: An extension to imputation

# Work on copies so the original splits stay untouched for the other approaches.
train_X_plus = train_X.copy()
val_X_plus = val_X.copy()

# For every column that had missing training values, add a boolean flag column
# recording which rows were missing before imputation.
for col in cols_with_missing:
    train_X_plus[col + '_was_missing'] = train_X_plus[col].isnull()
    val_X_plus[col + '_was_missing'] = val_X_plus[col].isnull()

# Fit the imputer on the training split only, then reuse it for validation.
my_imputer = SimpleImputer()

# By default the imputer returns plain NumPy arrays, so both the column labels
# and the original row index are restored here. Without the index the frames
# would get a fresh RangeIndex and stop lining up with train_y / val_y by label.
imputed_train_X_plus = pd.DataFrame(
    my_imputer.fit_transform(train_X_plus),
    columns=train_X_plus.columns,
    index=train_X_plus.index,
)
imputed_val_X_plus = pd.DataFrame(
    my_imputer.transform(val_X_plus),
    columns=val_X_plus.columns,
    index=val_X_plus.index,
)

print("MAE from Approach 3 (An extention to imputation):")
print(score_dataset(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y))

MAE from Approach 3 (An extention to imputation):
178927.503183954


In [None]:
# Show the training split's (rows, columns), then list the columns that
# contain at least one missing value.
print(train_X.shape)

train_X.columns[train_X.isnull().any()]

(10864, 12)


Index(['Car', 'BuildingArea', 'YearBuilt'], dtype='object')