In [83]:
rm(list = ls())

In [84]:
# libraries
suppressMessages({
  library(tidyverse)
  library(broom)
  library(ranger)
}
)

In [85]:
Data <- read.csv("/Users/mpaga/Downloads/train.csv",sep=",")

In [86]:
dim(Data)

[1] 1460   81

In [87]:
names(Data)

 [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
 [5] "LotArea"       "Street"        "Alley"         "LotShape"     
 [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
[13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
[17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
[21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
[25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
[29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
[33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
[37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
[41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
[45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
[49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
[53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
[57] "Fireplaces"    "FireplaceQu"

In [88]:
#remove Id colum
Data["Id"] <- NULL
dim(Data)

[1] 1460   80

In [89]:
# numeric data
numData <- Data |> 
  select_if(is.numeric) 

In [90]:
hist(Data$SalePrice)

In [91]:
summary(Data$SalePrice)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  34900  129975  163000  180921  214000  755000 

In [92]:
skimr::skim_without_charts(Data,where(is.numeric))-> num_skimData

In [93]:
(numVal_na <- num_skimData$skim_variable[num_skimData$n_missing !=0] )

[1] "LotFrontage" "MasVnrArea"  "GarageYrBlt"

In [94]:
skimr::skim_without_charts(Data,where(is.character))-> cat_skimData

In [95]:
summary(cat_skimData)

── Data Summary ────────────────────────
                           Values
Name                       Data  
Number of rows             1460  
Number of columns          80    
_______________________          
Column type frequency:           
  character                43    
________________________         
Group variables            None  

Let's impute some of these numerical features

In [96]:
colMeans(is.na(Data[numVal_na]))

LotFrontage  MasVnrArea GarageYrBlt 
0.177397260 0.005479452 0.055479452 

In [97]:
# list of numeric features to impute
imputeVal_list <- apply(Data[numVal_na],2, FUN = "median",na.rm = T,simplify = list)

In [98]:
#impute numerical features 
Data[numVal_na] <- replace_na(Data[numVal_na] ,
                              replace = imputeVal_list
)

In [99]:
# check  na
colMeans(is.na(Data[numVal_na]))

LotFrontage  MasVnrArea GarageYrBlt 
          0           0           0 

In [100]:
# duplicates rows
Data |> 
  select_if(is.numeric) |> 
  unique() |> 
  nrow()

[1] 1460

In [101]:
# select numeric features
Data |> 
  select_if(is.numeric)  ->numData 

# check relationship with target (linear model)
# h0 : there no relationship between X and Y(SalePrice)
# h1 :otherwise

  lm(SalePrice~.,numData) |> 
    summary() |> 
    tidy() |> 
    filter(p.value <=0.05) |> 
    nrow()

[1] 19

28 numerical features have predictive effect on target feature. 

In [102]:
# correlation in numeric features
for (feature in names(numData)[-1]){
  if (cor(numData[names(numData)][1],numData[feature]) >= 0.8 ) print(feature)
}

There is no carrelated features in numeric features 

cat var feat engineering  

In [103]:
# categorical features presenting missing values 
sum(cat_skimData$n_missing  > 0)

[1] 16

In [104]:
# n_missing proportion > 0.95 in cat features 
cat_rid <- cat_skimData$skim_variable[cat_skimData$n_missing >= 0.95] 
length(cat_rid)

[1] 16

16 features present missing values or NA proportion above 0.95, we could get rid of it  

In [105]:
# remove these categorical features
Data[,cat_rid]<- NULL

In [106]:
# check categorical features with NA in Data 
sum(cat_rid %in% names(Data))
rm(cat_rid)

In [107]:
# new data dimension
dim(Data)

[1] 1460   64

In [108]:
# NA check
colMeans(is.na(Data)) |> sum()

[1] 0

There is no longer NA in this dataset

In [109]:
summary(lm(SalePrice~. ,Data)) |> 
  tidy() |> 
  filter(p.value <=0.05) |> 
  print(n=15)

[38;5;246m# A tibble: 59 × 5[39m
   term                  estimate std.error statistic  p.value
   [3m[38;5;246m<chr>[39m[23m                    [3m[38;5;246m<dbl>[39m[23m     [3m[38;5;246m<dbl>[39m[23m     [3m[38;5;246m<dbl>[39m[23m    [3m[38;5;246m<dbl>[39m[23m
[38;5;250m 1[39m MSZoningFV           [4m3[24m[4m3[24m495.    [4m1[24m[4m2[24m123.         2.76 5.81[38;5;246me[39m[31m- 3[39m
[38;5;250m 2[39m MSZoningRH           [4m2[24m[4m5[24m399.    [4m1[24m[4m2[24m189.         2.08 3.74[38;5;246me[39m[31m- 2[39m
[38;5;250m 3[39m MSZoningRL           [4m2[24m[4m7[24m888.    [4m1[24m[4m0[24m386.         2.69 7.35[38;5;246me[39m[31m- 3[39m
[38;5;250m 4[39m MSZoningRM           [4m2[24m[4m6[24m270.     [4m9[24m716.         2.70 6.95[38;5;246me[39m[31m- 3[39m
[38;5;250m 5[39m LotFrontage             96.2      44.0        2.19 2.89[38;5;246me[39m[31m- 2[39m
[38;5;250m 6[39m LotArea                  0.670     

[38;5;250m10[39m LotConfigCulDSac      [4m9[24m864.     [4m3[24m407.         2.90 3.86[38;5;246me[39m[31m- 3[39m
[38;5;250m11[39m LandSlopeMod         [4m1[24m[4m0[24m064.     [4m4[24m002.         2.51 1.20[38;5;246me[39m[31m- 2[39m
[38;5;250m12[39m LandSlopeSev        -[31m[4m2[24m[4m5[24m4[39m[31m46[39m[31m.[39m    [4m1[24m[4m1[24m049.        -[31m2[39m[31m.[39m[31m30[39m 2.14[38;5;246me[39m[31m- 2[39m
[38;5;250m13[39m NeighborhoodEdwards -[31m[4m1[24m[4m8[24m2[39m[31m56[39m[31m.[39m     [4m8[24m039.        -[31m2[39m[31m.[39m[31m27[39m 2.33[38;5;246me[39m[31m- 2[39m
[38;5;250m14[39m NeighborhoodMitchel -[31m[4m2[24m[4m1[24m9[39m[31m08[39m[31m.[39m     [4m8[24m233.        -[31m2[39m[31m.[39m[31m66[39m 7.89[38;5;246me[39m[31m- 3[39m
[38;5;250m15[39m NeighborhoodNAmes   -[31m[4m1[24m[4m6[24m9[39m[31m56[39m[31m.[39m     [4m7[24m887.        -[31m2[39m[31m.[39m[31m15[39m 3.1

In [193]:
ranger(SalePrice~.,Data,
  importance = 'impurity') |> 
  importance() |> 
  sort(decreasing = T) |> 
  tidy() |> 
  mutate(proportion= x/sum(x)) |> 
  filter(proportion >= 0.06)

In tidy.numeric(sort(importance(ranger(SalePrice ~ ., Data, importance = "impurity")),  :
  'tidy.numeric' is deprecated.
See help("Deprecated")


[38;5;246m# A tibble: 5 × 3[39m
  names             x proportion
  [3m[38;5;246m<chr>[39m[23m         [3m[38;5;246m<dbl>[39m[23m      [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m OverallQual 1.16[38;5;246me[39m12     0.128 
[38;5;250m2[39m GrLivArea   7.81[38;5;246me[39m11     0.086[4m7[24m
[38;5;250m3[39m GarageCars  6.22[38;5;246me[39m11     0.069[4m0[24m
[38;5;250m4[39m GarageArea  5.72[38;5;246me[39m11     0.063[4m4[24m
[38;5;250m5[39m TotalBsmtSF 5.71[38;5;246me[39m11     0.063[4m3[24m

In [190]:
ranger(SalePrice~.,Data,importance = 'impurity')  -> rng
rng$variable.importance |> 
sort(decreasing = T) |> 
tidy() |> 
mutate(prop=x/sum(x)) |> 
#print(n=5) 
filter(prop>=0.05)

In tidy.numeric(sort(rng$variable.importance, decreasing = T)) :
  'tidy.numeric' is deprecated.
See help("Deprecated")


[38;5;246m# A tibble: 7 × 3[39m
  names             x   prop
  [3m[38;5;246m<chr>[39m[23m         [3m[38;5;246m<dbl>[39m[23m  [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m OverallQual 1.09[38;5;246me[39m12 0.122 
[38;5;250m2[39m GrLivArea   8.30[38;5;246me[39m11 0.092[4m1[24m
[38;5;250m3[39m GarageCars  6.37[38;5;246me[39m11 0.070[4m7[24m
[38;5;250m4[39m TotalBsmtSF 5.97[38;5;246me[39m11 0.066[4m3[24m
[38;5;250m5[39m X1stFlrSF   5.17[38;5;246me[39m11 0.057[4m4[24m
[38;5;250m6[39m ExterQual   4.97[38;5;246me[39m11 0.055[4m1[24m
[38;5;250m7[39m GarageArea  4.79[38;5;246me[39m11 0.053[4m1[24m

to be continued 😉!