In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

## Missing values with na_values parameter
We can pass a `na_values` argument to `pd.read_csv` listing extra strings that should be parsed as NA/NaN. Here a single space `" "`, `?`, `-` and `_` are recognized as null values, in addition to the markers pandas already treats as missing by default.

In [3]:
# Load the training set, treating " ", "?", "-" and "_" as additional NaN markers.
# The first CSV column (Id) becomes the row index instead of staying a feature column.
df = pd.read_csv("train.csv", index_col=0, na_values=[" ", "?", "-", "_"])
df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [5]:
# Dimensions of the loaded frame as (rows, columns); the Id index is not counted as a column
df.shape

(1460, 80)

# Cleaning and preparing our data
How many missing data points do we have in our dataset?

In [6]:
# Per-column tally of NaN cells: isna() flags every missing cell, sum() counts the flags
missing_values_count = df.isna().sum()
# Show the tally (pandas abbreviates the middle of a long Series)
missing_values_count

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64

### Percentage of missing values

In [7]:
# Total number of cells in the frame (rows * columns) and how many of them are NaN
total_size = df.size
total_missing = missing_values_count.sum()

# Share of all cells that are missing, expressed as a percentage
missing_percentage = (total_missing / total_size) * 100
print(f"The percentage of missing values:{missing_percentage}%")

The percentage of missing values:6.70291095890411%


#### Handling 'Not Available' Values in Specific Columns
In our dataset, certain columns use the label 'NA' to represent cases where a feature is intentionally absent (for example, a house without an alley or a pool). However, pandas interprets 'NA' as missing data and loads it as NaN. To address this, we replace those NaN values with an explicit per-column label of the form 'No <column name>' (for example 'No Alley') in specific columns.
##### Columns to be Processed
We have identified a set of columns where 'NA' does not imply missing data but signifies that the particular feature is not applicable or available: 'Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'.
###### Approach
For each specified column, we replace NaN values with the label 'No <column name>' (e.g. 'No Fence'). This helps in distinguishing between genuinely missing data and instances where the feature is simply not present.


In [8]:
# In these columns a NaN means "the house has no such feature", not "unknown".
# Give each one an explicit per-column placeholder of the form 'No <column>'
# (e.g. 'No Alley', 'No Fence') so it is kept as a regular category.
columns_to_fillna = [
    'Alley', 'MasVnrType',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
# Map every listed column to its own fill label and apply them all in one call
placeholder_by_column = {name: f'No {name}' for name in columns_to_fillna}
df[columns_to_fillna] = df[columns_to_fillna].fillna(value=placeholder_by_column)

With these placeholders in place, 'NA' entries that were intentionally used to convey "feature not present" are now clearly distinguished from values that are genuinely missing or unrecorded.

In [9]:
# Re-display the frame to confirm the placeholder labels (e.g. 'No Alley') replaced the NaNs
df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,0,No PoolQC,No Fence,No MiscFeature,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,No Alley,Reg,Lvl,AllPub,FR2,...,0,No PoolQC,No Fence,No MiscFeature,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,No Alley,IR1,Lvl,AllPub,Inside,...,0,No PoolQC,No Fence,No MiscFeature,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,No Alley,IR1,Lvl,AllPub,Corner,...,0,No PoolQC,No Fence,No MiscFeature,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,No Alley,IR1,Lvl,AllPub,FR2,...,0,No PoolQC,No Fence,No MiscFeature,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,0,No PoolQC,No Fence,No MiscFeature,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,0,No PoolQC,MnPrv,No MiscFeature,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,0,No PoolQC,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,No Alley,Reg,Lvl,AllPub,Inside,...,0,No PoolQC,No Fence,No MiscFeature,0,4,2010,WD,Normal,142125


### Converting Categorical columns to Numeric

In [16]:
# Names of all object-dtype columns, i.e. the string-valued categorical features
# that still need a numeric encoding.
categorical_columns = df.select_dtypes(include="object").columns

### Using LabelEncoder to Convert Categorical Columns to Numeric
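In this step, we use the LabelEncoder to convert categorical columns to numeric representations. LabelEncoder is chosen for its simplicity and efficiency, which makes it suitable even for large datasets. It assigns a unique integer label to each category, so the machine learning model can be trained on categorical data. Note that these integers are arbitrary codes (assigned in sorted order of the category names) and carry no real ordering; tree-based models such as the random forest used below tolerate this, whereas linear models would read a false ranking into them.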

In [17]:
# One encoder instance is reused: fit_transform() refits it on every call, so after
# the loop it only remembers the classes of the last column processed.
label_encoder = LabelEncoder()

# Replace each categorical column by integer codes (classes are sorted, then
# numbered 0..k-1). Only observed values are encoded: any NaN still present must
# stay NaN so the imputation step can see and fill it. Handing NaN to LabelEncoder
# either raises a TypeError (str and float cannot be sorted together) or turns
# "missing" into a category of its own, depending on the scikit-learn version.
for column in categorical_columns:
    observed = df[column].notna()
    if observed.all():
        # Fully populated column: encode in one shot, result stays integer-typed.
        df[column] = label_encoder.fit_transform(df[column])
    else:
        # Float column so the untouched rows can hold NaN next to the codes.
        codes = pd.Series(np.nan, index=df.index, dtype="float64")
        codes.loc[observed] = label_encoder.fit_transform(df.loc[observed, column])
        df[column] = codes

### Using SimpleImputer to Fill in Missing Values

To handle missing data in our dataset, we employ SimpleImputer, a versatile tool for imputing missing values with either statistical measures or constant values. SimpleImputer provides a straightforward way to address missing data, ensuring a more complete and usable dataset for machine learning.

In [18]:
# Fill every remaining NaN with the most frequent value (the mode) of its column
imputer = SimpleImputer(strategy='most_frequent')

# Learn the per-column modes and apply them in a single pass; the result comes
# back as a bare array without row or column labels
imputed_data = imputer.fit_transform(df)

# Re-attach the original row index and column names
train_imputed = pd.DataFrame(imputed_data, index=df.index, columns=df.columns)

### Preparing Training Data

To prepare the data for training a machine learning model, we derive the feature matrix and the label vector from copies of the imputed dataset. This separation keeps the imputed data itself intact while all modifications for model training are performed on dedicated objects.

In [19]:
# Separate predictors from the target. Both results are new objects, so
# train_imputed itself is left untouched.
train_features = train_imputed.drop(columns='SalePrice')  # every column except the target
train_labels = train_imputed['SalePrice'].copy()          # SalePrice values to be predicted

In [20]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Configure the random forest; hyper-parameters are spelled out one per line
forest_regressor = RandomForestRegressor(
    n_estimators=500,            # number of trees in the ensemble
    criterion="squared_error",   # split quality measured by squared error
    max_depth=10,                # cap on the depth of each tree
    min_samples_split=0.1,       # float => fraction of the training samples a node needs to be split
    min_samples_leaf=1,          # a leaf may hold a single sample
    max_features=10,             # number of features considered at each split
    random_state=0,              # fixed seed for reproducible trees
)

In [22]:
# Train the forest on the training portion of the split
forest_regressor.fit(X_train, y_train)

In [23]:
# Predict sale prices for the held-out validation rows
y_val_pred = forest_regressor.predict(X_val)

In [26]:
# Score the validation predictions against the true sale prices
mae = mean_absolute_error(y_val, y_val_pred)  # mean absolute error, in SalePrice units
mse = mean_squared_error(y_val, y_val_pred)  # mean squared error, in squared SalePrice units
r2 = r2_score(y_val, y_val_pred)  # coefficient of determination (R^2)

In [25]:
# Report the validation metrics (no test-set predictions are made in this cell)
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2 Score: {r2}")

MAE: 24625.629147564257
MSE: 2070409121.8524513
R2 Score: 0.730075479721779
