In [1]:
rm(list = ls())

In [2]:
# libraries
suppressMessages({
  library(tidyverse)
  library(broom)
  library(ranger)
}
)

In [3]:
Data <- read.csv("/Users/mpaga/Downloads/train.csv",sep=",")

In [4]:
dim(Data)

[1] 1460   81

In [49]:
#names(Data)

In [6]:
#remove Id colum
Data["Id"] <- NULL
dim(Data)

[1] 1460   80

In [7]:
# numeric data
numData <- Data |> 
  select_if(is.numeric) 

In [8]:
hist(Data$SalePrice)

In [9]:
summary(Data$SalePrice)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  34900  129975  163000  180921  214000  755000 

In [10]:
skimr::skim_without_charts(Data,where(is.numeric))-> num_skimData

In [48]:
summary(num_skimData)

── Data Summary ────────────────────────
                           Values
Name                       Data  
Number of rows             1460  
Number of columns          80    
_______________________          
Column type frequency:           
  numeric                  37    
________________________         
Group variables            None  

In [11]:
(numVal_na <- num_skimData$skim_variable[num_skimData$n_missing !=0] )

[1] "LotFrontage" "MasVnrArea"  "GarageYrBlt"

In [12]:
skimr::skim_without_charts(Data,where(is.character))-> cat_skimData

In [13]:
summary(cat_skimData)

── Data Summary ────────────────────────
                           Values
Name                       Data  
Number of rows             1460  
Number of columns          80    
_______________________          
Column type frequency:           
  character                43    
________________________         
Group variables            None  

Let's impute some of these numerical features

In [14]:
colMeans(is.na(Data[numVal_na]))

LotFrontage  MasVnrArea GarageYrBlt 
0.177397260 0.005479452 0.055479452 

In [15]:
# list of numeric features to impute
imputeVal_list <- apply(Data[numVal_na],2, FUN = "median",na.rm = T,simplify = list)

In [16]:
#impute numerical features 
Data[numVal_na] <- replace_na(Data[numVal_na] ,
                              replace = imputeVal_list
)

In [17]:
# check  na
colMeans(is.na(Data[numVal_na]))

LotFrontage  MasVnrArea GarageYrBlt 
          0           0           0 

In [18]:
# duplicates rows
Data |> 
  select_if(is.numeric) |> 
  unique() |> 
  nrow()

[1] 1460

In [19]:
# select numeric features
Data |> 
  select_if(is.numeric)  ->numData 

# check relationship with target (linear model)
# h0 : there no relationship between X and Y(SalePrice)
# h1 :otherwise

  lm(SalePrice~.,numData) |> 
    summary() |> 
    tidy() |> 
    filter(p.value <=0.05) |> 
    nrow()

[1] 19

28 numerical features have predictive effect on target feature. 

In [20]:
# correlation in numeric features
for (feature in names(numData)[-1]){
  if (cor(numData[names(numData)][1],numData[feature]) >= 0.8 ) print(feature)
}

There is no carrelated features in numeric features 

cat var feat engineering  

In [21]:
# categorical features presenting missing values 
sum(cat_skimData$n_missing  > 0)

[1] 16

In [22]:
# n_missing proportion > 0.95 in cat features 
cat_rid <- cat_skimData$skim_variable[cat_skimData$n_missing >= 0.95] 
length(cat_rid)

[1] 16

16 features present missing values or NA proportion above 0.95, we could get rid of it  

In [23]:
# remove these categorical features
Data[,cat_rid]<- NULL

In [24]:
# check categorical features with NA in Data 
sum(cat_rid %in% names(Data))
rm(cat_rid)

In [25]:
# new data dimension
dim(Data)

[1] 1460   64

In [26]:
# NA check
colMeans(is.na(Data)) |> sum()

[1] 0

There is no longer NA in this dataset

In [27]:
summary(lm(SalePrice~. ,Data)) |> 
  tidy() |> 
  filter(p.value <=0.05) |> 
  print(n=10)

[38;5;246m# A tibble: 59 × 5[39m
   term              estimate std.error statistic  p.value
   [3m[38;5;246m<chr>[39m[23m                [3m[38;5;246m<dbl>[39m[23m     [3m[38;5;246m<dbl>[39m[23m     [3m[38;5;246m<dbl>[39m[23m    [3m[38;5;246m<dbl>[39m[23m
[38;5;250m 1[39m MSZoningFV       [4m3[24m[4m3[24m495.    [4m1[24m[4m2[24m123.         2.76 5.81[38;5;246me[39m[31m- 3[39m
[38;5;250m 2[39m MSZoningRH       [4m2[24m[4m5[24m399.    [4m1[24m[4m2[24m189.         2.08 3.74[38;5;246me[39m[31m- 2[39m
[38;5;250m 3[39m MSZoningRL       [4m2[24m[4m7[24m888.    [4m1[24m[4m0[24m386.         2.69 7.35[38;5;246me[39m[31m- 3[39m
[38;5;250m 4[39m MSZoningRM       [4m2[24m[4m6[24m270.     [4m9[24m716.         2.70 6.95[38;5;246me[39m[31m- 3[39m
[38;5;250m 5[39m LotFrontage         96.2      44.0        2.19 2.89[38;5;246me[39m[31m- 2[39m
[38;5;250m 6[39m LotArea              0.670     0.109      6.16 9.47[38;5;246me

In [28]:
ranger(SalePrice~.,Data,
  importance = 'impurity') |> 
  importance() |> 
  sort(decreasing = T) |> 
  tidy() |> 
  mutate(proportion= x/sum(x)) |> 
  filter(proportion >= 0.05)

In tidy.numeric(sort(importance(ranger(SalePrice ~ ., Data, importance = "impurity")),  :
  'tidy.numeric' is deprecated.
See help("Deprecated")


[38;5;246m# A tibble: 8 × 3[39m
  names             x proportion
  [3m[38;5;246m<chr>[39m[23m         [3m[38;5;246m<dbl>[39m[23m      [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m OverallQual 1.19[38;5;246me[39m12     0.132 
[38;5;250m2[39m GrLivArea   8.31[38;5;246me[39m11     0.091[4m8[24m
[38;5;250m3[39m GarageCars  6.31[38;5;246me[39m11     0.069[4m7[24m
[38;5;250m4[39m TotalBsmtSF 5.44[38;5;246me[39m11     0.060[4m1[24m
[38;5;250m5[39m GarageArea  5.33[38;5;246me[39m11     0.058[4m9[24m
[38;5;250m6[39m ExterQual   5.16[38;5;246me[39m11     0.057[4m0[24m
[38;5;250m7[39m YearBuilt   5.03[38;5;246me[39m11     0.055[4m6[24m
[38;5;250m8[39m X1stFlrSF   4.66[38;5;246me[39m11     0.051[4m4[24m

In [29]:
# importance proportion 
ranger(SalePrice~.,Data,importance = 'impurity')  -> rng
rng$variable.importance |> 
sort(decreasing = T) |> 
tidy() |> 
mutate(prop=x/sum(x)) |> 
#print(n=5) 
filter(prop>=0.05) |> 
select(names) |> 
unlist() -> features

In tidy.numeric(sort(rng$variable.importance, decreasing = T)) :
  'tidy.numeric' is deprecated.
See help("Deprecated")


In [30]:
features

       names1        names2        names3        names4        names5        names6 
"OverallQual"   "GrLivArea"  "GarageCars" "TotalBsmtSF"   "X1stFlrSF"   "ExterQual" 
       names7        names8 
 "GarageArea"   "YearBuilt" 

In [31]:

Data1 <- cbind(Data[features],SalePrice=Data$SalePrice)


In [32]:
rm(Data)

In [33]:
attach(Data1)

NULL

In [35]:
# ggplot object
Data1gg <- ggplot(Data1)

In [36]:
Data1$GarageCars |>  
  table()


  0   1   2   3   4 
 81 369 824 181   5 

In [37]:
Data1gg +
  geom_bar(aes(GarageCars))

In [38]:
Data1$YearBuilt |> 
  table() 


1872 1875 1880 1882 1885 1890 1892 1893 1898 1900 1904 1905 1906 1908 1910 1911 1912 
   1    1    4    1    2    2    2    1    1   10    1    1    1    2   17    1    3 
1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 
   1    7   10    8    1    7    3   30    6    8    7    7   16    9    3    7    4 
1930 1931 1932 1934 1935 1936 1937 1938 1939 1940 1941 1942 1945 1946 1947 1948 1949 
   9    6    4    3    6    9    5    4    8   18   15    2    6    7    5   14   12 
1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 
  20    6    5   12   24   16   14   20   24   26   17   14   19   16   15   24   18 
1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 
  16   22   14   24   22   23   11   10    8   33   32   16    9   10    5    6    4 
1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 
   9    5    5    3   11    3   12    5   13   17   1

In [39]:
Data1gg +
  geom_histogram(aes(YearBuilt))

[1m[22m`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

In [40]:
table(Data1$OverallQual)


  1   2   3   4   5   6   7   8   9  10 
  2   3  20 116 397 374 319 168  43  18 

In [41]:
Data1gg +
  geom_bar(aes(factor(OverallQual)))


NULL

In [43]:
Data1$TotalBsmtSF |> 
  hist()

## finding relations

In [44]:
Data1gg +
  geom_point(aes(TotalBsmtSF,SalePrice) ) +
  geom_smooth(aes(TotalBsmtSF,SalePrice) )

[1m[22m`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

In [45]:
Data1gg +
geom_point(aes(y=SalePrice,x=YearBuilt)) +
geom_smooth(aes(y=SalePrice,x=YearBuilt),se = F)

[1m[22m`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

In [46]:
Data1gg +
  geom_boxplot(aes(factor(OverallQual),SalePrice) )

In [47]:
Data1gg +
  geom_boxplot(aes(factor(GarageCars),SalePrice) )

to be continued 😉!

==split data ===