# Neighborhood (feature engineering with external data)

In [1]:
# Source of details about neighborhood is taken from this website: www.addressreport.com website

In [2]:
library(data.table)

In [3]:
train = fread("/content/train.csv", stringsAsFactors = F)
test  = fread("/content/test.csv", stringsAsFactors = F)

datas = rbindlist(list(train, test), use.names = T, fill = T)
datas[is.na(SalePrice), set := "test"]
datas[!is.na(SalePrice), set := "train"]
datas[, set := factor(set, levels = c("train", "test"))]
datas[, logSalePrice := log(SalePrice)]
datas[, MSSubClass := as.character(MSSubClass)]

dataset = copy(datas)

In [4]:
dim(dataset)

In [5]:
input = "
Neighborhood;Neighborhood_etiquette;Location;Cost_of_living;Income;Owners;Annual_property_tax;School;Crime;Ville;
Blmngtn;Bloomington Heights;NN;2;95256;83;3742;4;-53;0;
Blueste;Bluestem;WW;;;;;;;0;
BrDale;Briardale;NN;-37;45588;39;1789;8;-27;0;
BrkSide;Brookside;NN;;;;;;;0;proche de Stone brook
ClearCr;Clear Creek;WW;;;;;8;;0;proche de Sunset Rock
CollgCr;College Creek;WW;-20;66875;54;2616;8;-45;0;
Crawfor;Crawford;WW;;;;;;;1;Comté de crawford
Edwards;Edwards;;;;;;;;0;
Gilbert;Gilbert;NN;1;66250;82;2706;4;-36;1;
Greens;Greens;NN;-10;84600;61;3667;8;-41;0;
GrnHill;Green Hills;SS;-24;61100;54;3097;8;-57;0;
IDOTRR;Iowa DOT and Rail Road;Center;;;;;;;0;
Landmrk;Landmark;WW;-19;66458;54;1667;8;-29;0;
MeadowV;Meadow Village;SS;-14;53962;50;1521;8;-46;0;
Mitchel;Mitchell;SS;;;;;;;1;
NAmes ;North Ames;NN;;;;;;;0;
NoRidge;Northridge;NN;-9;88438;61;5562;8;-53;0;
NPkVill ;Northpark Villa;NN;-9;88438;61;5562;8;-53;0;
NridgHt;Northridge Heights;NN;2;95256;83;5478;4;-53;0;
NWAmes ;Northwest Ames;NN;;;;;;;0;
OldTown;Old Town;Center;-35;37708;25;2342;8;-34;0;Original ames
SWISU;South & West of Iowa State University;Center;11;32268;34;2486;8;-44;0;CollegeHeight
Sawyer;Sawyer;WW;-11;69067;66;2471;8;-42;0;Ontario height
SawyerW;Sawyer West;WW;-11;71600;49;1705;8;-42;0;Orinal ontario
Somerst;Somerset;NN;-10;84600;61;2314;8;-41;0;
StoneBr;Stone Brook;NN;2;95256;83;4369;4;-53;0;
Timber;Timberland;SS;;;;;8;;0;Timberland height / timberlane
Veenker;Veenker;NN;;;;;;;0;proche de The green"

In [6]:
#input

In [7]:
ngbr_details = data.table::fread(input, na.strings = c("", "NA"), stringsAsFactors = F)

In [8]:
print(ngbr_details)
names(ngbr_details)

    Neighborhood                Neighborhood_etiquette Location Cost_of_living
          <char>                                <char>   <char>          <int>
 1:      Blmngtn                   Bloomington Heights       NN              2
 2:      Blueste                              Bluestem       WW             NA
 3:       BrDale                             Briardale       NN            -37
 4:      BrkSide                             Brookside       NN             NA
 5:      ClearCr                           Clear Creek       WW             NA
 6:      CollgCr                         College Creek       WW            -20
 7:      Crawfor                              Crawford       WW             NA
 8:      Edwards                               Edwards     <NA>             NA
 9:      Gilbert                               Gilbert       NN              1
10:       Greens                                Greens       NN            -10
11:      GrnHill                           Green Hil

In [9]:
mean(ngbr_details$Cost_of_living,na.rm = TRUE)
mean(ngbr_details$Income,na.rm = TRUE)
mean(ngbr_details$Owners,na.rm = TRUE)
mean(ngbr_details$Annual_property_tax,na.rm = TRUE)
mean(ngbr_details$School,na.rm = TRUE)
mean(ngbr_details$Crime,na.rm = TRUE)

In [10]:
#ngbr_details$Owners

In [11]:
ngbr_details[is.na(Location), Location := "NN"]
ngbr_details[is.na(Cost_of_living), Cost_of_living := -18]
ngbr_details[is.na(Income), Income := 46358]
ngbr_details[is.na(Owners), Owners:= 43]
ngbr_details[is.na(Annual_property_tax), Annual_property_tax := 2479]
ngbr_details[is.na(School), School := 8]
ngbr_details[is.na(Crime), Crime := -28]

In [12]:
sum(is.na(ngbr_details[, -11, with = F]))

In [13]:
ngbr_details = ngbr_details[, -c(2, 11), with = F]

In [14]:
ngbr_details

Neighborhood,Location,Cost_of_living,Income,Owners,Annual_property_tax,School,Crime,Ville
<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
Blmngtn,NN,2,95256,83,3742,4,-53,0
Blueste,WW,-18,46358,43,2479,8,-28,0
BrDale,NN,-37,45588,39,1789,8,-27,0
BrkSide,NN,-18,46358,43,2479,8,-28,0
ClearCr,WW,-18,46358,43,2479,8,-28,0
CollgCr,WW,-20,66875,54,2616,8,-45,0
Crawfor,WW,-18,46358,43,2479,8,-28,1
Edwards,NN,-18,46358,43,2479,8,-28,0
Gilbert,NN,1,66250,82,2706,4,-36,1
Greens,NN,-10,84600,61,3667,8,-41,0


In [15]:
data.table::setnames(ngbr_details,
         old = c("Location", "Cost_of_living", "Income", "Owners", "Annual_property_tax", "School", "Crime"),
         new = paste0("Ngbr_", c("Location", "Cost_of_living", "Income", "Owners", "Annual_property_tax", "School", "Crime")))

In [16]:
ngbr_details

Neighborhood,Ngbr_Location,Ngbr_Cost_of_living,Ngbr_Income,Ngbr_Owners,Ngbr_Annual_property_tax,Ngbr_School,Ngbr_Crime,Ville
<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
Blmngtn,NN,2,95256,83,3742,4,-53,0
Blueste,WW,-18,46358,43,2479,8,-28,0
BrDale,NN,-37,45588,39,1789,8,-27,0
BrkSide,NN,-18,46358,43,2479,8,-28,0
ClearCr,WW,-18,46358,43,2479,8,-28,0
CollgCr,WW,-20,66875,54,2616,8,-45,0
Crawfor,WW,-18,46358,43,2479,8,-28,1
Edwards,NN,-18,46358,43,2479,8,-28,0
Gilbert,NN,1,66250,82,2706,4,-36,1
Greens,NN,-10,84600,61,3667,8,-41,0


In [17]:
ngbr_details[, `:=` (
  Ngbr_Location_WW     = as.numeric(Ngbr_Location == "WW"),
  Ngbr_Location_SS     = as.numeric(Ngbr_Location == "SS"),
  Ngbr_Location_Center = as.numeric(Ngbr_Location == "Center"),
  Ngbr_Location_NN     = as.numeric(Ngbr_Location == "NN"),
  Ngbr_Location        = NULL
)]

In [18]:
dataset = merge(dataset, ngbr_details, all.x = T, by = "Neighborhood")

In [19]:
dim(dataset)

In [20]:
tmp = dataset[, list(
  medLogSalePrice  = median(logSalePrice, na.rm = T),
  medSalePrice     = median(SalePrice, na.rm = T),
  N                = .N,
  meanLogSalePrice = mean(SalePrice, na.rm = T),
  meanSalePrice    = mean(SalePrice, na.rm = T),
  sdLogSalePrice   = sd(SalePrice, na.rm = T),
  sdSalePrice      = sd(SalePrice, na.rm = T)
  ), by = "Neighborhood"][order(medLogSalePrice)]

In [21]:
tmp

Neighborhood,medLogSalePrice,medSalePrice,N,meanLogSalePrice,meanSalePrice,sdLogSalePrice,sdSalePrice
<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
MeadowV,11.38509,88000,37,98576.47,98576.47,23491.05,23491.05
IDOTRR,11.54248,103000,93,100123.78,100123.78,33376.71,33376.71
BrDale,11.57119,106000,30,104493.75,104493.75,14330.176,14330.176
OldTown,11.68688,119000,239,128225.3,128225.3,52650.583,52650.583
Edwards,11.70962,121750,194,128219.7,128219.7,43208.616,43208.616
BrkSide,11.73022,124300,108,124834.05,124834.05,40348.689,40348.689
Sawyer,11.81303,135000,151,136793.14,136793.14,22345.129,22345.129
Blueste,11.82654,137500,10,137500.0,137500.0,19091.883,19091.883
SWISU,11.84582,139500,48,142591.36,142591.36,32622.918,32622.918
NAmes,11.8494,140000,443,145847.08,145847.08,33075.345,33075.345


In [22]:
km = kmeans(tmp[, 2:8], centers = 10, iter.max = 200)

In [23]:
km$cluster

In [24]:
dic = data.table(
  Neighborhood   = tmp$Neighborhood,
  NeighborhoodCl = km$cluster
)

In [25]:
dic

Neighborhood,NeighborhoodCl
<chr>,<int>
MeadowV,2
IDOTRR,2
BrDale,2
OldTown,5
Edwards,5
BrkSide,5
Sawyer,6
Blueste,6
SWISU,7
NAmes,7


In [26]:
dataset[dic, on = "Neighborhood"]

Neighborhood,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,⋯,Ngbr_Owners,Ngbr_Annual_property_tax,Ngbr_School,Ngbr_Crime,Ville,Ngbr_Location_WW,Ngbr_Location_SS,Ngbr_Location_Center,Ngbr_Location_NN,NeighborhoodCl
<chr>,<int>,<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,⋯,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>
MeadowV,24,120,RM,44,4224,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,76,180,RM,21,1596,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,345,160,RM,36,2592,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,358,120,RM,44,4224,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,435,180,RM,21,1890,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,490,180,RM,21,1526,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,491,160,RM,,2665,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,615,180,RM,21,1491,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,650,180,RM,21,1936,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2
MeadowV,916,160,RM,21,2001,Pave,,Reg,Lvl,⋯,50,1521,8,-46,0,0,1,0,0,2


In [27]:
dataset = dataset[dic, on = "Neighborhood"]
dataset$Neighborhood = NULL

dataset[, `:=` (
  Neighborhood1  = as.numeric(NeighborhoodCl == 1),
  Neighborhood2  = as.numeric(NeighborhoodCl == 2),
  Neighborhood3  = as.numeric(NeighborhoodCl == 3),
  Neighborhood4  = as.numeric(NeighborhoodCl == 4),
  Neighborhood5  = as.numeric(NeighborhoodCl == 5),
  Neighborhood6  = as.numeric(NeighborhoodCl == 6),
  Neighborhood7  = as.numeric(NeighborhoodCl == 7),
  Neighborhood8  = as.numeric(NeighborhoodCl == 8),
  Neighborhood9  = as.numeric(NeighborhoodCl == 9),
  Neighborhood10 = as.numeric(NeighborhoodCl == 10),
  NeighborhoodCl = NULL
)]

In [28]:
dim(dataset)

In [29]:
trainset = dataset[dataset$set == "train"]

In [30]:
dim(trainset)

In [31]:
write.csv(trainset, file = "newtrain.csv",row.names = F)

In [32]:
dataset[,"set"]

set
<fct>
train
train
train
train
train
train
train
train
train
train
