Housing Prices 

In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory that holds train.csv / test.csv.
# `__file__` is not defined inside a Jupyter kernel, so the original expression
# raised NameError on a fresh run; fall back to the current working directory.
try:
    BASE_DIR = os.path.dirname(__file__)
except NameError:
    BASE_DIR = os.getcwd()

# read the data, using the 'Id' column as the row index
X_full = pd.read_csv(os.path.join(BASE_DIR, "train.csv"), index_col='Id')
X_test_full = pd.read_csv(os.path.join(BASE_DIR, "test.csv"), index_col='Id')

# obtain the prediction target and the feature columns

y = X_full.SalePrice

features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']

# .copy() so later edits to X / X_test do not touch the frames read from disk
X = X_full[features].copy()
X_test = X_test_full[features].copy()

# break off a validation set (20%) from the training data; fixed seed for reproducibility

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

Looking at the first 5 rows of the data to use for prediction 

In [None]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Building the random forest models


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters for each candidate forest, in the order model1..model5.
# Every candidate shares random_state=0, which is applied below.
candidate_params = [
    {"n_estimators": 50},
    {"n_estimators": 100},
    {"n_estimators": 100, "criterion": 'absolute_error'},
    {"n_estimators": 200, "min_samples_split": 20},
    {"n_estimators": 100, "max_depth": 7},
]

# list with models
models = [RandomForestRegressor(random_state=0, **params) for params in candidate_params]

# Keep the individual names available alongside the list.
model1, model2, model3, model4, model5 = models

Checking the model with the lowest MAE

In [None]:
from sklearn.metrics  import mean_absolute_error

def score_model(model, X_t=X_train, X_v=X_valid, y_t=y_train, y_v=y_valid):
    """Fit ``model`` and return its mean absolute error on held-out data.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place.
        X_t: Training features (default: the ``X_train`` bound when this
            function was defined).
        X_v: Validation features (default: ``X_valid`` at definition time).
        y_t: Training target (default: ``y_train`` at definition time).
        y_v: Validation target (default: ``y_valid`` at definition time).

    Returns:
        The mean absolute error between ``y_v`` and the model's predictions
        for ``X_v``.
    """
    model.fit(X_t, y_t)
    validation_error = mean_absolute_error(y_v, model.predict(X_v))
    return validation_error

# Validation MAE of every candidate, in the same order as `models`.
score_mae = []

for i, candidate in enumerate(models):
    mae = score_model(candidate)
    score_mae.append(mae)
    print(f"Model: {i+1} MAE: {mae}")

# the best model: zero-based positions of every candidate that reaches the lowest MAE
min_mae = min(score_mae)
best_model = [position for position, value in enumerate(score_mae) if value == min_mae]

# Report with a one-based model number to match the per-model lines above.
print(f'The best model: {best_model[0]+1} , MAE = {min_mae} ')

Using the lowest MAE Model 

In [None]:
# `best_model` stores zero-based positions into `models`; the "+1" is only for
# the human-readable printout. Indexing with "+1" selected the wrong model and
# raised IndexError whenever the last model was the best one.
myModel = models[best_model[0]]

# Refit the chosen model on the full training data.
myModel.fit(X, y)

# NOTE(review): X_valid is a subset of X, which the model was just fitted on,
# so these predictions are not an out-of-sample check. Predicting on X_test
# may have been intended — confirm.
preds = myModel.predict(X_valid)

Fixing missing values 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 

#comparing different estimations 

def score_dataset(X_train, X_valid, y_train, y_valid):
    """Score one preprocessing approach with a fixed random forest.

    A ``RandomForestRegressor`` with 10 trees and ``random_state=0`` is fitted
    on the training split, so differences between calls come from the data
    passed in rather than from the model.

    Args:
        X_train: Training features.
        X_valid: Validation features.
        y_train: Training target.
        y_valid: Validation target.

    Returns:
        Mean absolute error of the forest's predictions on ``X_valid``.
    """
    forest = RandomForestRegressor(n_estimators=10, random_state=0).fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

Dropping all columns with missing values

In [None]:
# Approach 1: drop every column that has at least one missing value.

# Column names are chosen from the training split only.
has_missing = X_train.isnull().any()
cols_with_missing = X_train.columns[has_missing].tolist()

# Drop the same columns from both the training and the validation data.
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid))


Fixing missing values by using imputation 

In [None]:
from sklearn.impute import SimpleImputer

# imputation

# SimpleImputer must be instantiated: the original assigned the class itself,
# so fit_transform(X_train) was called without an imputer instance and failed.
my_imputer = SimpleImputer()

# Fit on the training split only, then apply the same fitted imputer to the
# validation split so no validation statistics leak into the fill values.
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

# The imputer returns plain arrays, so restore the column names.
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

print('MAE : ', score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid))

Fixing missing values using an extension to imputation

In [None]:
# Approach 3: impute, and also record which entries were originally missing.

# Work on copies so the original splits stay untouched.
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Add one boolean indicator column per column that had missing training values.
for col in cols_with_missing:
    indicator_name = col + '_was_missing'
    X_train_plus[indicator_name] = X_train_plus[col].isna()
    X_valid_plus[indicator_name] = X_valid_plus[col].isna()

# Fit the imputer on the training data and reuse it for the validation data.
my_imputer = SimpleImputer()
train_filled = my_imputer.fit_transform(X_train_plus)
valid_filled = my_imputer.transform(X_valid_plus)

# Wrap the imputed arrays back into frames with the original column names.
imputed_X_train_plus = pd.DataFrame(train_filled, columns=X_train_plus.columns)
imputed_X_valid_plus = pd.DataFrame(valid_filled, columns=X_valid_plus.columns)

print('MAE: ', score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid))