In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
import numpy as np
import pandas as pd
import gc

In [3]:
path = "../Zillow2/Data/"
print("Read the properties and merge with coord")
prop = pd.read_csv(path + 'renamed_properties_2016.csv')
prop = prop.drop("id", axis=1)
print "Size of the properties data frame: ", prop.shape

Read the properties and merge with coord


  interactivity=interactivity, compiler=compiler, result=result)


Size of the properties data frame:  (2985217, 58)


In [4]:
# drop two constant features and fips
prop = prop.drop(["flag_tub", "flag_fireplace", "fips"], axis=1)
print "Size of the properties data frame: ", prop.shape

Size of the properties data frame:  (2985217, 55)


In [5]:
# Drop missing rows from prop
index = pd.isnull(prop["latitude"])
prop = prop[~index]
print "Size of the properties data frame: ", prop.shape

Size of the properties data frame:  (2973780, 55)


In [6]:
# clasify features to three classes
number = ["num_bathroom", "num_bedroom", "num_bathroom_calc", "num_fireplace", "num_bath", "num_garage", "num_pool", 
          "num_room", "num_75_bath", "num_unit", "num_story"]
quality = ["type_aircon", "type_architectural", "type_framing", "type_quality", "type_deck", "type_heating", 
           "pooltypeid10", "pooltypeid2", "pooltypeid7", "type_zoning_landuse", "type_story", "type_material"]
position = ["region_city", "region_county", "region_neighbor", "region_zip", "zoning_landuse_county", 
             "zoning_property", "censustractandblock", "rawcensustractandblock"]
tax3 = ["tax_delinquency"]
ll = number + quality + position + tax3

print len(ll)

32


In [7]:
prop["miss_val"] = prop.isnull().sum(axis=1)

In [8]:
print("Read train file to obtain some statistic about the features and target")
train = pd.read_csv(path + 'renamed_train_2016.csv', parse_dates=['date'])
train = train.drop("id", axis=1)

train["month"] = train['date'].dt.month

train = pd.merge(train, prop, how='left', on='id_parcel')

train = train.drop(['id_parcel', 'date'], axis=1)
print "Size of the train data frame: ", train.shape

Read train file to obtain some statistic about the features and target
Size of the train data frame:  (90275, 57)


In [9]:
prop['latitude'] = prop['latitude'] / 1e6
prop['longitude'] = prop['longitude'] / 1e6

In [10]:
number = ["num_bathroom", "num_bedroom", "num_bathroom_calc", "num_fireplace", "num_bath", "num_garage", "num_pool", 
          "num_room", "num_75_bath", "num_unit", "num_story"]
pd.isnull(prop[number]).sum(axis=0)

num_bathroom              25
num_bedroom               13
num_bathroom_calc     117475
num_fireplace        2661143
num_bath              117475
num_garage           2090513
num_pool             2456246
num_room                  38
num_75_bath          2662149
num_unit              996290
num_story            2291711
dtype: int64

In [11]:
# Combine the rare events to avoid overfit
from sklearn.preprocessing import LabelEncoder

def fix_number(target, threshold = 200, na_value = 0):
    
    X = train[target].fillna(na_value)
    max_target = 10000000
    
    density = X.value_counts()
    index = density.index
    rare = density[density < threshold].index
    X.loc[X.isin(rare)] = max_target
    
    lbl = LabelEncoder()
    lbl.fit(list(X))
#    train[target] = lbl.transform(list(X))
    
    Y = pd.DataFrame(prop[["id_parcel", target]])
    Y[target] = Y[target].fillna(na_value)
    
    known = Y[target].isin(index)
    Y_known = Y[known]
    Y_unknown = Y[~known]
    
    Y_known.loc[Y_known[target].isin(rare), target] = max_target
    Y_known.loc[:, target] = lbl.transform(Y_known[target].values)
    
    print "Number of classes: ", Y_known.loc[:, target].max() + 2
    Y_unknown.loc[:, target] = Y_known.loc[:, target].max() + 1
    
    prop.loc[known, target] = Y_known[target].values
    prop.loc[~known, target] = Y_unknown[target].values

# I assumed all NaN values are zeros
for ii in number:
    print ii
    fix_number(ii, threshold = 200)

num_bathroom


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


Number of classes:  14
num_bedroom
Number of classes:  11
num_bathroom_calc
Number of classes:  14
num_fireplace
Number of classes:  6
num_bath
Number of classes:  9
num_garage
Number of classes:  7
num_pool
Number of classes:  3
num_room
Number of classes:  11
num_75_bath
Number of classes:  4
num_unit
Number of classes:  7
num_story
Number of classes:  6


In [12]:
quality = ["type_aircon", "type_architectural", "type_framing", "type_quality", "type_deck", "type_heating", 
           "pooltypeid10", "pooltypeid2", "pooltypeid7", "type_story", "type_material"]
pd.isnull(prop[quality]).sum(axis=0)

type_aircon           2162261
type_architectural    2967719
type_framing          2961151
type_quality          1035292
type_deck             2956684
type_heating          1167379
pooltypeid10          2936841
pooltypeid2           2941705
pooltypeid7           2488321
type_story            2972156
type_material         2967033
dtype: int64

In [13]:
# I assumed all NaN values as a new group
for ii in quality:
    fix_number(ii, threshold = 200, na_value = -1)

Number of classes:  6
Number of classes:  4
Number of classes:  3
Number of classes:  7
Number of classes:  3
Number of classes:  7
Number of classes:  3
Number of classes:  3
Number of classes:  3
Number of classes:  3
Number of classes:  4


In [14]:
# position exclude latitude and longitude
position = ["region_city", "region_county", "region_neighbor", "region_zip", "zoning_landuse_county", 
            "type_zoning_landuse", "zoning_property", "censustractandblock", "rawcensustractandblock"]

pd.isnull(prop[position]).sum(axis=0)

region_city                 51408
region_county                   0
region_neighbor           1817378
region_zip                   2543
zoning_landuse_county         840
type_zoning_landuse             0
zoning_property            995151
censustractandblock         63689
rawcensustractandblock          0
dtype: int64

In [15]:
from sklearn import neighbors
from sklearn.preprocessing import LabelEncoder

def fillna_knn(target, k=100):
    gps = ['latitude', 'longitude']
    
    index = pd.isnull(prop[target])
    Y = prop.loc[index, gps]
    X = prop.loc[~index, [target] + gps]

    print "Size of the missing data: ", Y.shape
    print "Size of the data: ", X.shape
    
    lbl = LabelEncoder()
    lbl.fit(list(X[target].values))
    X[target] = lbl.transform(list(X[target].values))

    print "Number of classes: ", X[target].max() + 1
    
    clf = neighbors.KNeighborsClassifier( n_neighbors = k, weights = 'uniform', n_jobs = 8 )
    clf.fit(X.drop(target, axis=1), X[target])
    
    Y[target] = clf.predict(Y)
    
    prop.loc[index, target] = Y[target].values
    prop.loc[~index, target] = X[target].values

In [16]:
# impute missing value
position_miss = ["region_zip", "zoning_landuse_county"]
for ii in position_miss:
    print ii
    fillna_knn(ii)

region_zip
Size of the missing data:  (2543, 2)
Size of the data:  (2971237, 3)
Number of classes:  405
zoning_landuse_county
Size of the missing data:  (840, 2)
Size of the data:  (2972940, 3)
Number of classes:  240


In [17]:
# convert object features to int
object_type = ["zoning_property", "zoning_landuse_county"]
print("convert object features to int")
for c in object_type:
    prop[c]=prop[c].fillna(-2)
    lbl = LabelEncoder()
    lbl.fit(list(prop[c].values))
    prop[c] = lbl.transform(list(prop[c].values))

convert object features to int


In [18]:
# impute missing value; missing value as a new group
region = ["region_zip", "region_city", "region_county", "region_neighbor", "type_zoning_landuse", "censustractandblock", "rawcensustractandblock"]
print("convert region features to int")
for c in region:
    prop[c]=prop[c].fillna(-2)
    lbl = LabelEncoder()
    lbl.fit(list(prop[c].values))
    prop[c] = lbl.transform(list(prop[c].values))

convert region features to int


In [19]:
from sklearn import neighbors

def fillna_knn_reg(target, k=100):
    gps = ['latitude', 'longitude']
    
    index = pd.isnull(prop[target])
    Y = prop.loc[index, gps]
    X = prop.loc[~index, [target] + gps]

    print "Size of the missing data: ", Y.shape
    print "Size of the data: ", X.shape
       
    clf = neighbors.KNeighborsRegressor( n_neighbors = k, weights = 'uniform' )
    clf.fit(X.drop(target, axis=1), X[target])
    
    Y[target] = clf.predict(Y)
    
    prop.loc[index, target] = Y[target].values
    prop.loc[~index, target] = X[target].values
    
fillna_knn_reg("build_year")

Size of the missing data:  (48491, 2)
Size of the data:  (2925289, 3)


In [20]:
# convert data type to reduce the size
for c in number:
    prop[c] = prop[c].astype(np.int8)
    
for c in quality:
    prop[c] = prop[c].astype(np.int8)
    
for c in position:
    prop[c] = prop[c].astype(np.int16)
    
print('Binding to float32')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

Binding to float32


In [21]:
## Add new features
# Hadi can add more features hear to improve the code
prop["area_ratio"] = prop["area_total_calc"] / prop["area_lot"]
prop["tax_building_area"] = prop["tax_building"] / (prop["area_lot"])
prop["tax_property_area"] = prop["tax_property"] / (prop["area_lot"])

prop["tax_total_property"] = prop["tax_total"] / prop["tax_property"]
prop["tax_totalproperty"] = prop["tax_total"] * prop["tax_property"]
prop["tax_building_land"] = prop["tax_building"] / prop["tax_land"]

prop["area_total_firstfloor"] = prop["area_total_calc"] / prop["area_firstfloor_finished"]

prop["lat_lon"] = prop["latitude"] / prop["longitude"]
prop["latlon"] = prop["latitude"] * prop["longitude"]

In [22]:
# Calculate the average value of selected features in different region and then
# calculate the difference between average and exact value
ll1 = ['region_zip', 'region_neighbor', 'zoning_landuse_county', 'region_city']
ll2 = ['zip', 'neighbor', 'zon', 'city']
fea = ["area_lot", "area_total_calc", "tax_building",  "tax_property", "latitude", "longitude", "tax_total",
       "tax_property_area", "tax_building_area", "build_year", "num_bedroom", "num_bathroom"]

for c1, c2 in zip(ll1, ll2):
    for cc in fea:
        ave = prop.groupby(c1)[cc].mean().to_dict()
        name1 = c2 + "_" + cc
        prop[name1] = prop[c1].map(ave)
        name2 = name1 + "_diff"
        prop[name2] = prop[cc] - prop[name1]

In [23]:
# count the number of house in each region
ll = ['region_zip', 'region_city', 'region_neighbor', 'type_zoning_landuse']
for c in ll:
    count = prop[c].value_counts().to_dict()
    name = c + "_count"
    prop[name] = prop[c].map(count)

In [24]:
# I convert the coordinate from cartesian to polar coordinate
print "Read location file"
coord = pd.read_csv(path + "location2.csv")
print coord.shape

prop = pd.merge(prop, coord.drop(["latitude", "longitude"], axis = 1), how='left', on='id_parcel')
del coord
gc.collect()

Read location file
(2973780, 15)


307

In [25]:
prop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2973780 entries, 0 to 2973779
Columns: 177 entries, id_parcel to num_rot75_Y
dtypes: float32(31), float64(108), int16(9), int64(7), int8(22)
memory usage: 3.0 GB


In [26]:
print('Binding to float32')
for c, dtype in zip(prop.columns, prop.dtypes):
    if dtype == np.float64:
        prop[c] = prop[c].astype(np.float32)

Binding to float32


In [27]:
prop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2973780 entries, 0 to 2973779
Columns: 177 entries, id_parcel to num_rot75_Y
dtypes: float32(139), int16(9), int64(7), int8(22)
memory usage: 1.8 GB


In [28]:
# I used kmean and split the data to new clusters based on lat and lon
print "Read cluster file"
clusters = pd.read_csv(path + "n_clusters2.csv")
print clusters.shape

prop = pd.merge(prop, clusters, how='left', on='id_parcel')
del clusters
gc.collect()

Read cluster file
(2973780, 7)


35

In [29]:
clusters = ['cluster0', 'cluster1', 'cluster2', 'cluster3', 'cluster4', 'cluster5']
for c in clusters:
    count = prop[c].value_counts().to_dict()
    name = c + "_count"
    prop[name] = prop[c].map(count)

In [30]:
clusters = ['cluster1', 'cluster2', 'cluster3'] #, 'cluster1', 'cluster2', 'cluster3', 'cluster4']
clusters2 = ['cl1', 'cl2', 'cl3'] #, 'cl1', 'cl2', 'cl3', 'cl4']
    
fea = ["area_lot", "area_total_calc", "tax_building",  "tax_property", "latitude", "longitude", "tax_total",
       "tax_property_area", "tax_building_area", "build_year", "num_bedroom", "num_bathroom"]

for c1, c2 in zip(clusters, clusters2):
    for cc in fea:
        ave = prop.groupby(c1)[cc].mean().to_dict()
        name1 = c2 + "_" + cc
        prop[name1] = prop[c1].map(ave)
        name2 = name1 + "_diff"
        prop[name2] = prop[cc] - prop[name1]
        
        prop[name1] = prop[name1].astype(np.float32)
        prop[name2] = prop[name2].astype(np.float32)

In [31]:
prop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2973780 entries, 0 to 2973779
Columns: 261 entries, id_parcel to cl3_num_bathroom_diff
dtypes: float32(211), int16(9), int64(19), int8(22)
memory usage: 2.9 GB


In [32]:
# Read Train
print("Read train file and merge with properties to generate train file and taraget")
train = pd.read_csv(path + 'renamed_train_2016.csv', parse_dates=['date'])
train = train.drop("id", axis=1)

train["month"] = train['date'].dt.month

train = pd.merge(train, prop, how='left', on='id_parcel')
train = train.drop(['date'], axis=1)
print "Size of the train data frame: ", train.shape

Read train file and merge with properties to generate train file and taraget
Size of the train data frame:  (90275, 263)


In [33]:
# Exclude train from prop
id_parcel = train["id_parcel"].values
prop = prop.set_index("id_parcel")
prop = prop.drop(id_parcel)
prop = prop.reset_index()
prop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2883630 entries, 0 to 2883629
Columns: 261 entries, id_parcel to cl3_num_bathroom_diff
dtypes: float32(211), int16(9), int64(19), int8(22)
memory usage: 2.8 GB


In [34]:
print("Read the missing")
miss = pd.read_csv(path + 'missing.csv')
miss.info()

Read the missing
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11437 entries, 0 to 11436
Data columns (total 3 columns):
Unnamed: 0    11437 non-null int64
index         11437 non-null int64
id_parcel     11437 non-null int64
dtypes: int64(3)
memory usage: 268.1 KB


In [35]:
store = pd.HDFStore('Data/store2.h5')

In [36]:
store["prop"] = prop
store["train"] = train
store["miss"] = miss

In [37]:
print store

<class 'pandas.io.pytables.HDFStore'>
File path: Data/store3.h5
/miss             frame        (shape->[11437,3])    
/prop             frame        (shape->[2883630,261])
/train            frame        (shape->[90275,263])  


In [38]:
store.close()