In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor  
from sklearn.model_selection import train_test_split
import tensorflow_decision_forests as tfdf
from sklearn.metrics import mean_squared_error

In [63]:
df_training = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [64]:
print(df_training.shape)
df_training.head()

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [65]:
rows_with_na = df_training[df_training.isna().any(axis=1)]
# print(rows_with_na)

# Now we realize that theres 339 rows with NA data out of 1460 rows.. 
print('Percent of data with NaN values:', rows_with_na.shape[0]/ df_training.shape[0])

# Calculate the percentage of NaN values in each column
nan_percentage = df_training.isna().mean() * 100

# Filter for columns where the percentage of NaN values is greater than 0
nan_columns_with_percentage = nan_percentage[nan_percentage > 0]

# Convert to a DataFrame for better readability
nan_columns_df = nan_columns_with_percentage.reset_index()
nan_columns_df.columns = ['Column', 'NaN Percentage']

# Sort the DataFrame by NaN Percentage from highest to lowest
nan_columns_df = nan_columns_df.sort_values(by='NaN Percentage', ascending=False)

# Display the DataFrame
print(nan_columns_df)

Percent of data with NaN values: 1.0
          Column  NaN Percentage
16        PoolQC       99.520548
18   MiscFeature       96.301370
1          Alley       93.767123
17         Fence       80.753425
2     MasVnrType       59.726027
10   FireplaceQu       47.260274
0    LotFrontage       17.739726
11    GarageType        5.547945
12   GarageYrBlt        5.547945
13  GarageFinish        5.547945
14    GarageQual        5.547945
15    GarageCond        5.547945
8   BsmtFinType2        2.602740
6   BsmtExposure        2.602740
7   BsmtFinType1        2.534247
5       BsmtCond        2.534247
4       BsmtQual        2.534247
3     MasVnrArea        0.547945
9     Electrical        0.068493


In [66]:
columns_to_drop = [
    'PoolQC',
    'MiscFeature',
    'Alley',
    'Fence',
    'MasVnrType',
    'FireplaceQu',
    'LotFrontage'
]

# Drop the columns from the DataFrame
df_training_dropped = df_training.drop(columns=columns_to_drop)

# Display the updated DataFrame
print(df_training_dropped)

        Id  MSSubClass MSZoning  LotArea Street LotShape LandContour  \
0        1          60       RL     8450   Pave      Reg         Lvl   
1        2          20       RL     9600   Pave      Reg         Lvl   
2        3          60       RL    11250   Pave      IR1         Lvl   
3        4          70       RL     9550   Pave      IR1         Lvl   
4        5          60       RL    14260   Pave      IR1         Lvl   
...    ...         ...      ...      ...    ...      ...         ...   
1455  1456          60       RL     7917   Pave      Reg         Lvl   
1456  1457          20       RL    13175   Pave      Reg         Lvl   
1457  1458          70       RL     9042   Pave      Reg         Lvl   
1458  1459          20       RL     9717   Pave      Reg         Lvl   
1459  1460          20       RL     9937   Pave      Reg         Lvl   

     Utilities LotConfig LandSlope  ... EnclosedPorch 3SsnPorch ScreenPorch  \
0       AllPub    Inside       Gtl  ...             0   

In [67]:
rows_with_na = df_training_dropped[df_training_dropped.isna().any(axis=1)]
print('New percent of rows with NaN values:', rows_with_na.shape[0]/ df_training_dropped.shape[0])
print(df_training_dropped.shape)

New percent of rows with NaN values: 0.08356164383561644
(1460, 74)


In [68]:
# Drop rows with any NaN values
df_training_cleaned = df_training_dropped.dropna()

# Display the cleaned DataFrame
print(df_training_cleaned)
print(df_training_cleaned)

        Id  MSSubClass MSZoning  LotArea Street LotShape LandContour  \
0        1          60       RL     8450   Pave      Reg         Lvl   
1        2          20       RL     9600   Pave      Reg         Lvl   
2        3          60       RL    11250   Pave      IR1         Lvl   
3        4          70       RL     9550   Pave      IR1         Lvl   
4        5          60       RL    14260   Pave      IR1         Lvl   
...    ...         ...      ...      ...    ...      ...         ...   
1455  1456          60       RL     7917   Pave      Reg         Lvl   
1456  1457          20       RL    13175   Pave      Reg         Lvl   
1457  1458          70       RL     9042   Pave      Reg         Lvl   
1458  1459          20       RL     9717   Pave      Reg         Lvl   
1459  1460          20       RL     9937   Pave      Reg         Lvl   

     Utilities LotConfig LandSlope  ... EnclosedPorch 3SsnPorch ScreenPorch  \
0       AllPub    Inside       Gtl  ...             0   

In [76]:
# Let's find the correlation of all the features to the target variable
# Correlation calculations require numeric data, let's identify numeric and categorical columns
numeric_columns = df_training_cleaned.select_dtypes(include=['number']).columns.tolist()
categorical_columns = df_training_cleaned.select_dtypes(include=['object', 'category']).columns.tolist()

#this encodes the entire dataset to account for categorical variabled
df_encoded = pd.get_dummies(df_training_cleaned, columns=categorical_columns, drop_first=True)
df_encoded

#compute the correlation matrix
df_encoded_1 = df_encoded.copy()
corr_matrix = df_encoded_1.corr()

corr_matrix

# Get the correlation of each feature with SalePrice
saleprice_corr= corr_matrix['SalePrice'].sort_values(ascending=False)
saleprice_corr

# Let's now set a correlation level
level = 0.5

# Filter strong correlations and drop 'SalePrice'
strong_correlations = saleprice_corr[abs(saleprice_corr) > level].drop('SalePrice')

print(strong_correlations)


OverallQual       0.783546
GrLivArea         0.711706
GarageCars        0.640154
GarageArea        0.607535
1stFlrSF          0.604714
TotalBsmtSF       0.602042
FullBath          0.569313
TotRmsAbvGrd      0.551821
YearBuilt         0.504297
YearRemodAdd      0.501435
KitchenQual_TA   -0.524120
ExterQual_TA     -0.585107
Name: SalePrice, dtype: float64


In [77]:
# Next, let's check for multicollinearity among the list of strong_correlation features
selected_features = strong_correlations.index.tolist()

selected_features

# Create the correlation matrix for selected features
selected_corr_matrix = df_encoded_1[selected_features].corr()

In [78]:

#Identify highly correlated (multicollinear) pairs among the selected features that have high correlation
high_corr_pairs = selected_corr_matrix[(abs(selected_corr_matrix) > 0.7) & (selected_corr_matrix!= 1)]
print(high_corr_pairs)

#Identify feautres to drop due to multicollinearity
drop_features = set()

for col in high_corr_pairs.columns:
    for index in high_corr_pairs.index:
        if abs(high_corr_pairs.loc[index, col]) > 0.7 and index != col:
            drop_features.add(col) #add the feature to the collection to drop

# Drop the identified features
df_encoded_1 = df_encoded_1.drop(columns=drop_features)
print("Dropped features:", drop_features)

# Create a subset of df_encoded with strong correlations and one instance of each multicollinear pair
retained_features = list(set(selected_features) - drop_features)
df_subset_1 = df_encoded_1[retained_features]

#Display the final chosen features
print(df_subset_1)

                OverallQual  GrLivArea  GarageCars  GarageArea  1stFlrSF  \
OverallQual             NaN        NaN         NaN         NaN       NaN   
GrLivArea               NaN        NaN         NaN         NaN       NaN   
GarageCars              NaN        NaN         NaN    0.832166       NaN   
GarageArea              NaN        NaN    0.832166         NaN       NaN   
1stFlrSF                NaN        NaN         NaN         NaN       NaN   
TotalBsmtSF             NaN        NaN         NaN         NaN   0.89311   
FullBath                NaN        NaN         NaN         NaN       NaN   
TotRmsAbvGrd            NaN   0.823491         NaN         NaN       NaN   
YearBuilt               NaN        NaN         NaN         NaN       NaN   
YearRemodAdd            NaN        NaN         NaN         NaN       NaN   
KitchenQual_TA          NaN        NaN         NaN         NaN       NaN   
ExterQual_TA            NaN        NaN         NaN         NaN       NaN   

           

In [79]:
df_subset_1

Unnamed: 0,OverallQual,ExterQual_TA,YearBuilt,KitchenQual_TA,FullBath,YearRemodAdd
0,7,False,2003,False,2,2003
1,6,True,1976,True,2,1976
2,7,False,2001,False,2,2002
3,7,True,1915,False,1,1970
4,8,False,2000,False,2,2000
...,...,...,...,...,...,...
1455,6,True,1999,True,2,2000
1456,6,True,1978,True,2,1988
1457,7,False,1941,False,2,2006
1458,5,True,1950,False,1,1996


In [80]:
# Alternatively, lets do feature importance from models

df_encoded_2 = df_encoded.copy()
corr_matrix_2 = df_encoded_2.corr()

# Multicollinearity threshold
threshold = 0.7

# Create a set to hold features to drop
drop_features_2 = set()

# Loop over the correlation matrix and find pairs of highly correlated features
for i in range(len(corr_matrix_2.columns)):
    for j in range(i):
        if abs(corr_matrix_2.iloc[i, j]) > threshold:
            feature_i = corr_matrix_2.columns[i]
            feature_j = corr_matrix_2.columns[j]
            # Add one of the correlated features to the drop list
            drop_features_2.add(feature_j)  # You can choose either feature_j or feature_i


# Drop the highly correlated features from the dataset
df_reduced_2 = df_encoded_2.drop(columns=drop_features_2)
print(f"Dropped features due to multicollinearity: {drop_features_2}")

# Step 5: Fit RandomForestRegressor on the reduced dataset
# Define your target variable 'SalePrice' and independent variables
X = df_reduced_2.drop(columns='SalePrice')
y = df_encoded_2['SalePrice']

# Initialize and fit the model
model = RandomForestRegressor()
model.fit(X, y)

# Step 6: Extract feature importance
importances = model.feature_importances_
feature_importances = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
# print(feature_importances.sort_values(by='Importance', ascending=False))

# Define the threshold for feature importance
threshold = 0.05

# Filter out features below the threshold
important_features = feature_importances[feature_importances['Importance'] > threshold]

# Get the list of feature names
important_features = important_features['Feature'].tolist()

df_subset_2 = df_encoded_2[important_features]


Dropped features due to multicollinearity: {'GarageType_Attchd', 'BsmtQual_Gd', 'BsmtCond_Gd', 'BsmtCond_Po', 'Foundation_CBlock', 'SaleType_New', 'Exterior1st_MetalSd', 'RoofStyle_Gable', '2ndFlrSF', 'OverallQual', 'Exterior1st_CemntBd', 'YearBuilt', 'MSZoning_RL', 'BsmtFinSF2', 'GarageCond_Fa', 'Exterior1st_Stucco', 'TotalBsmtSF', 'Exterior1st_Plywood', 'Exterior1st_VinylSd', 'ExterQual_Gd', 'ExterCond_Gd', 'Exterior1st_HdBoard', 'KitchenQual_Gd', 'Exterior1st_Wd Sdng', 'Condition2_RRAe', 'GarageQual_Fa', 'SaleType_WD', 'GarageCars', 'GrLivArea', 'Exterior1st_CBlock', 'MSZoning_FV'}


In [81]:
df_subset_2

Unnamed: 0,1stFlrSF,FullBath,GarageArea,ExterQual_TA
0,856,2,548,False
1,1262,2,460,True
2,920,2,608,False
3,961,1,642,True
4,1145,2,836,False
...,...,...,...,...
1455,953,2,460,True
1456,2073,2,500,True
1457,1188,2,252,False
1458,1078,1,240,True


In [114]:
X = df_subset_1  # Features
y = df_training_cleaned['SalePrice']  # Target

# Split into training and validation sets 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [117]:
# Convert Series to DataFrame
y_train_df = y_train.to_frame(name='SalePrice')

# Concatenate
train_df = pd.concat([X_train, y_train_df], axis=1)

# Display the combined DataFrame
train_df

Unnamed: 0,OverallQual,ExterQual_TA,YearBuilt,KitchenQual_TA,FullBath,YearRemodAdd,SalePrice
610,9,False,2000,False,2,2000,313000
1402,7,False,2006,False,2,2006,193879
1249,5,True,1950,True,1,1950,119000
1062,5,True,1900,True,2,1950,90000
530,6,False,1988,False,2,1989,175000
...,...,...,...,...,...,...,...
1196,7,False,2006,False,2,2006,219210
1236,7,False,2003,False,2,2003,175500
1413,8,False,2005,False,2,2006,257000
933,7,False,2004,False,2,2004,190000


In [118]:
# Convert Series to DataFrame
y_val_df = y_val.to_frame(name='SalePrice')

# Concatenate
val_df = pd.concat([X_val, y_val_df], axis=1)

# Display the combined DataFrame
val_df

Unnamed: 0,OverallQual,ExterQual_TA,YearBuilt,KitchenQual_TA,FullBath,YearRemodAdd,SalePrice
832,7,False,2003,False,2,2003,237000
965,6,False,2005,False,2,2007,178900
969,6,True,1958,True,1,1958,140000
1411,6,True,1950,True,1,2005,140000
284,6,False,1992,False,2,1992,179200
...,...,...,...,...,...,...,...
119,6,False,2005,False,2,2006,163990
628,5,True,1969,False,1,1969,135000
585,8,False,2005,False,2,2006,369900
593,6,False,2003,False,1,2003,140000


In [124]:
# Convert boolean columns to integers
train_df['ExterQual_TA'] = train_df['ExterQual_TA'].astype(int)  # or .astype(str) if you prefer strings
# Convert boolean columns to integers
train_df['KitchenQual_TA'] = train_df['KitchenQual_TA'].astype(int)  # or .astype(str) if you prefer strings

In [125]:
# Convert DataFrame to TensorFlow dataset
train_data = tfdf.keras.pd_dataframe_to_tf_dataset(
    train_df,
    task=tfdf.keras.Task.REGRESSION,  # Specify that it's a regression task
    label="SalePrice"  # Specify your target column
)

# Define the model
model = tfdf.keras.RandomForestModel(
    task=tfdf.keras.Task.REGRESSION,  # Specify regression
    num_trees=100  # Number of trees in the forest
)

# Fit the model using the TensorFlow dataset
model.fit(train_data)


Use /var/folders/w1/98s57ht549d5119h4qsbd_v00000gn/T/tmppbne5pvr as temporary training directory
Reading training dataset...
Training dataset read in 0:00:00.365112. Found 1070 examples.
Training model...
Model trained in 0:00:00.111857
Compiling model...


I0000 00:00:1727055578.441279 4483502 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1727055578.442339 4483502 kernel.cc:775] Collect training examples
I0000 00:00:1727055578.442345 4483502 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: NUMERICAL
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1727055578.446228 4483502 kernel.cc:394] Number of batches: 2
I0000 00:00:1727055578.446236 4483502 kernel.cc:395] Number of examples: 1070
I0000 00:00:1727055578.446995 4483502 kernel.cc:794] Training dataset:
Number of records: 1070
Number of columns: 7

Number of columns by type:
	NUMERICAL: 7 (100%)

Columns:

NUMERICAL: 7 (100%)
	0: "ExterQual_TA" NUMERICAL mean:0.603738 min:0 max:1 sd:0.48912
	1: "FullBath" NUMERICAL mean:1.58879 min:0 max:3 sd:0.554561
	2: "Kit

Model compiled.


<tf_keras.src.callbacks.History at 0x1412461d0>

In [127]:
# Convert boolean columns to integers
val_df['ExterQual_TA'] = val_df['ExterQual_TA'].astype(int)  # or .astype(str) if you prefer strings
# Convert boolean columns to integers
val_df['KitchenQual_TA'] = val_df['KitchenQual_TA'].astype(int)  # or .astype(str) if you prefer strings

In [136]:
# Convert validation DataFrame to TensorFlow dataset
val_data = tfdf.keras.pd_dataframe_to_tf_dataset(
    val_df,  # Your validation DataFrame
    task=tfdf.keras.Task.REGRESSION,
    label="SalePrice"
)

# Make predictions
predictions = model.predict(val_data)



In [137]:
# Make predictions on the test dataset
predictions = model.predict(val_data)

# Convert predictions to a NumPy array if needed
predictions = np.array(predictions)

# Get the actual values from the test DataFrame
actual_values = val_df['SalePrice'].values

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(actual_values, predictions))

print("Root Mean Squared Error:", rmse)

Root Mean Squared Error: 36721.934855411586
