# Western Australia Rental Prices - Extract

https://www.kaggle.com/c/deloitte-western-australia-rental-prices/

In [1]:
# imports
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from __future__ import division
from sklearn import preprocessing
%matplotlib inline

In [2]:
# settings and constants
%logstop
%logstart  -o 'Extract' rotate
plt.rcParams['figure.figsize'] = (10.0, 8.0)
pd.set_option('display.max_rows', 50)
start_time = pd.datetime.now()
print start_time

Logging hadn't been started.
Activating auto-logging. Current session state plus future input saved.
Filename       : Extract
Mode           : rotate
Output logging : True
Raw input log  : False
Timestamping   : False
State          : active
2015-11-25 12:18:03.357540


In [3]:
# based on Mac with Postgres.app
# export PATH="/Applications/Postgres.app/Contents/Versions/9.4/bin:$PATH"
# then pip install psycopg2
# then sudo brew install openssl
# and follow these instructions:
# http://stackoverflow.com/questions/11365619/psycopg2-installation-error-library-not-loaded-libssl-dylib

from sqlalchemy import create_engine
engine = create_engine('postgresql://paulperry:ciao,ciao@localhost:5432/australia', 
                       connect_args={'client_encoding': 'latin1'})

# EXTRACT

In [4]:
train = pd.read_csv('data/train.csv', low_memory=False)
print train.shape

(834570, 5)


In [5]:
train.columns = map(str.lower, train.columns)
train.set_index('ren_id', inplace=True)
train.ren_date_eff_from = pd.to_datetime(train.ren_date_eff_from)
train[:2]

Unnamed: 0_level_0,ren_date_eff_from,ren_base_rent,ve_number,ren_lease_length
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1579258,1990-02-13,280,4807702,
1203979,1990-04-13,115,332135,


In [6]:
#train = pd.read_csv('data/train.csv', low_memory=False)
test = pd.read_csv('data/test.csv', low_memory=False)
print test.shape

(150508, 4)


In [7]:
test.columns = map(str.lower, test.columns)
test.set_index('ren_id', inplace=True)
test.ren_date_eff_from = pd.to_datetime(test.ren_date_eff_from)
test[:2]

Unnamed: 0_level_0,ren_date_eff_from,ve_number,ren_lease_length
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10568,2004-02-18,2402939,
12686,2004-02-18,1352438,


## Quantity Features

In [8]:
# get all the quantities 

qp1 = "select t1.ren_id, t1.ve_number, t2.urv_id, t2.uvv_date_eff_from, t2.uvv_quantity from " # query part 1
qp2 = " t1 join (select distinct on (ve_number, urv_id) ve_number, urv_id, uvv_date_eff_from, uvv_quantity \
        from valuation_entities_details \
        where urv_ven_quality_ind like 'N' and urv_ven_quantity_ind like 'Y' and uvv_quantity is not null \
        group by ve_number, urv_id, uvv_date_eff_from, uvv_quantity \
        order by ve_number, urv_id, uvv_date_eff_from desc) t2  \
        on( t1.ve_number = t2.ve_number and t1.ren_date_eff_from > t2.uvv_date_eff_from and urv_id is not null)"
qp3 = ";"

fullq = qp1+'train'+qp2+' union all '+qp1+'test'+qp2+qp3

%time ve_quantity = pd.read_sql_query(fullq, engine)
print ve_quantity.shape
ve_quantity[:5]

CPU times: user 20.9 s, sys: 3.21 s, total: 24.1 s
Wall time: 1min 44s
(7597821, 5)


Unnamed: 0,ren_id,ve_number,urv_id,uvv_date_eff_from,uvv_quantity
0,2391247,61,19,2000-07-01,60.0
1,2391247,61,58,1986-07-01,2.0
2,2658991,61,58,1986-07-01,2.0
3,890584,61,58,1986-07-01,2.0
4,2391247,61,206,1986-07-01,3.5


In [9]:
# # this confirmed there are no duplicate ren_id's for the same date in this table
# vegroup = ve_quantity.groupby(['ren_id','ve_number','urv_id', 'uvv_date_eff_from']).count().sort_values('uvv_quantity',ascending=False)
# vegroup[:5]
# # if there are dupes, I would eliminate this way:
# print len(ve_quantity.index)
# ve_quantity.drop_duplicates(['ren_id','ve_number','urv_id'], keep='first', inplace=True)
# print len(ve_quantity.index)

In [10]:
quantity_features = pd.pivot(ve_quantity.ren_id, ve_quantity.urv_id, ve_quantity.uvv_quantity)
quantity_features.iloc[:5]

  return np.sum(name == np.asarray(self.names)) > 1


urv_id,4,5,19,20,21,23,24,25,26,28,...,405,413,431,434,440,441,443,444,477,479
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1.0,,217,,,,,,,,...,,,,1.0,,,1.0,,,
9,,,119,,,,,,,,...,,,,,,,1.0,,,
13,,,60,,,,,,,,...,,,,,,,,,,
26,1.0,,120,,,,,,,,...,,,,,,,,,,
27,,,78,,,,,,,,...,,,,,,,,,,


In [11]:
# stop if somehow we messed up the pivot
if len(quantity_features.index) > (len(train.index)+len(test.index)):
    raise Exception('bad pivot')

In [12]:
# # generate the ve_keys if we don't have the file
# q_key ="select urv_id, urv_description, urv_ven_quality_ind, urv_ven_quantity_ind, count(*) as c \
#     from valuation_entities_details \
#     group by urv_id, urv_description, urv_ven_quality_ind, urv_ven_quantity_ind \
#     order by c desc;"

# ve_key = pd.read_sql_query(q_key, engine)
# ve_key.to_csv('ve_key.csv', index=False)
# print len(ve_key.index)
# ve_key.iloc[:10]

In [13]:
# load the valuation entity keys
ve_key = pd.read_csv('ve_key.csv')
ve_key.set_index('urv_id', inplace=True)
ve_key[:5]

Unnamed: 0_level_0,urv_description,urv_ven_quality_ind,urv_ven_quantity_ind,c
urv_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
206,EFFECTIVE ROOMS,N,Y,1034318
58,BEDROOMS,N,Y,924250
410,YEAR BUILT,Y,N,923596
271,KITCHEN,N,Y,858940
89,BRICK WALLS,N,N,789499


In [14]:
def rename_columns(df):
    # map columns to ve_key names
    col_names = [ve_key.loc[i].urv_description.lower().replace(' ','_').replace('-','_') 
                 for i in df.columns]
    col_names = [s.encode('ascii','ignore') for s in col_names]
    col_names = [c.translate(None,".()&?!;$/\\,") for c in col_names]
    if len(col_names) == len(df.columns):
        df.columns = col_names
    else:
        raise Exception('Problem renaming columns!') 

    return df

In [15]:
quantity_features.columns

Int64Index([  4,   5,  19,  20,  21,  23,  24,  25,  26,  28,  29,  30,  31,
             33,  56,  57,  58,  59,  69,  70,  71,  73,  74,  75, 201, 203,
            206, 210, 214, 216, 224, 227, 229, 230, 232, 233, 234, 236, 267,
            271, 272, 280, 283, 284, 291, 292, 318, 331, 340, 344, 345, 356,
            361, 366, 367, 368, 369, 370, 371, 372, 387, 403, 404, 405, 413,
            431, 434, 440, 441, 443, 444, 477, 479],
           dtype='int64', name=u'urv_id')

In [16]:
quantity_features = rename_columns(quantity_features)
print quantity_features.columns

Index([u'activity_room', u'airconditioned', u'house_area', u'office_area_1',
       u'shop_area', u'store_room_area_1', u'warehouse_area_1',
       u'factory_area_1', u'any_other_area', u'basement_area', u'shed_area_1',
       u'total_area', u'mezzanine_area', u'workshop_area_1', u'bar',
       u'bath_room', u'bedrooms', u'single_bedrooms', u'carbay_under_cover',
       u'tandem_carbay', u'open_car_bay', u'carport_under_main_roof',
       u'carport_attached', u'carport_detached', u'dining_room',
       u'dressing_room', u'effective_rooms', u'ensuite', u'excess_land',
       u'family_room', u'frontage_of_block', u'fronts_onto_lake',
       u'games__room', u'detached_gamesroom', u'garage_under_main_roof',
       u'garage_attached', u'garage_detached', u'guest_room', u'kennels',
       u'kitchen', u'laundry', u'lounge_room', u'meals_area', u'music_room',
       u'other_rooms', u'plate_height', u'parents__retreat', u'sleep_out',
       u'sewing_room', u'shed', u'sitting_room', u'spa_room',

In [17]:
print quantity_features.shape

(979255, 73)


In [18]:
quantity_features.count().sort_values(ascending=False)

bedrooms                   931557
effective_rooms            834353
kitchen                    827811
lounge_room                763260
house_area                 690340
dining_room                525876
bath_room                  346280
toilet                     312093
family_room                297204
ensuite                    279033
carport_under_main_roof    224964
storey                     212683
walk_in_robe               200900
garage_under_main_roof     168170
meals_area                 161340
carport_detached           112629
store_room                 103041
games__room                 93795
carport_attached            86176
garage_detached             72513
study                       65796
alfresco_room               40300
garage_attached             33340
effective_bed_count         30775
theatre_room                30765
                            ...  
dressing_room                 279
music_room                    272
office_area_1                 271
shop_area     

In [19]:
def df_summary(df):
    features = []
    for c in df.columns:
        unique = len(df[c].unique())
        vals =  df[c].notnull().sum()
        example = df[c].unique()[:9]
        features.append([c, unique, vals, example])
    df_features = pd.DataFrame(features, columns=['feature','unique','count','example_values'])
    return df_features.sort_values(by='unique', ascending=False)

In [20]:
df_summary(quantity_features)

Unnamed: 0,feature,unique,count,example_values
2,house_area,11905,690340,"[217.0, 119.0, 60.0, 120.0, 78.0, 114.0, nan, ..."
30,frontage_of_block,1364,21738,"[nan, 18.0, 20.48, 20.12, 18.61, 34.41, 10.06,..."
10,shed_area_1,195,1401,"[nan, 36.0, 72.0, 14.6, 54.0, 198.0, 46.2, 90...."
8,any_other_area,154,20358,"[nan, 41.0, 64.0, 220.0, 130.0, 61.6, 68.0, 48..."
26,effective_rooms,81,834353,"[11.0, 7.0, 4.0, 6.5, 4.5, 6.0, 9.5, 2.5, 9.0]"
3,office_area_1,66,271,"[nan, 135.0, 110.0, 1.0, 120.0, 35.0, 139.0, 1..."
71,area_of_sheds,51,86,"[nan, 72.0, 38.0, 30.0, 54.0, 25.0, 32.0, 74.0..."
13,workshop_area_1,39,87,"[nan, 60.0, 17.0, 169.0, 50.0, 99.0, 108.0, 20..."
60,number_of_units,29,5243,"[nan, 2.0, 4.0, 8.0, 12.0, 3.0, 11.0, 10.0, 5.0]"
11,total_area,29,704,"[nan, 96.0, 73.0, 85.0, 88.0, 95.0, 100.0, 187..."


In [21]:
quantity_features[:5]

Unnamed: 0_level_0,activity_room,airconditioned,house_area,office_area_1,shop_area,store_room_area_1,warehouse_area_1,factory_area_1,any_other_area,basement_area,...,workshop,effective_bed_count,ven_land_area,theatre_room,hall_area,jetty,alfresco_room,kitchenette,area_of_sheds,scullery
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,1.0,,217,,,,,,,,...,,,,1.0,,,1.0,,,
9,,,119,,,,,,,,...,,,,,,,1.0,,,
13,,,60,,,,,,,,...,,,,,,,,,,
26,1.0,,120,,,,,,,,...,,,,,,,,,,
27,,,78,,,,,,,,...,,,,,,,,,,


## Quality Features

In [22]:
# get all the qualities 

qq1 = "select t1.ren_id, t1.ve_number, t2.urv_id, t2.uvv_date_eff_from, t2.uvv_quality from " # query part 1
qq2 = " t1 join (select distinct on (ve_number, urv_id) ve_number, urv_id, uvv_date_eff_from, uvv_quality \
        from valuation_entities_details \
        where urv_ven_quality_ind like 'Y' and urv_ven_quantity_ind like 'N' and uvv_quality <> '' \
        group by ve_number, urv_id, uvv_date_eff_from, uvv_quality \
        order by ve_number, urv_id, uvv_date_eff_from desc) t2  \
        on( t1.ve_number = t2.ve_number and t1.ren_date_eff_from > t2.uvv_date_eff_from )"
qq3 = ";"

fullqq = qq1+'train'+qq2+' union all '+qq1+'test'+qq2+qq3

%time ve_quality = pd.read_sql_query(fullqq, engine)

CPU times: user 3.35 s, sys: 198 ms, total: 3.55 s
Wall time: 24.3 s


In [23]:
print ve_quality.shape
ve_quality[:5]

(1489542, 5)


Unnamed: 0,ren_id,ve_number,urv_id,uvv_date_eff_from,uvv_quality
0,1184400,2125,82,1986-07-01,FAIR
1,1184400,2125,410,1986-07-01,1975
2,4653370,3861,82,1986-07-01,FAIR
3,4579201,3861,82,1986-07-01,FAIR
4,3880378,3861,82,1986-07-01,FAIR


In [24]:
# one dupe?
print len(ve_quality.index)
ve_quality.drop_duplicates(['ren_id','ve_number','urv_id'], keep='first', inplace=True)
print len(ve_quality.index)

1489542
1489541


In [25]:
quality_features = ve_quality.pivot('ren_id', 'urv_id', 'uvv_quality')
quality_features.iloc[:5]

urv_id,82,410,411
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,,2010,
9,,2010,
13,FAIR,1976,
26,,2008,
27,FAIR,1981,


In [26]:
quality_features = rename_columns(quality_features)
print quality_features.columns

Index([u'condition', u'year_built', u'year_effective'], dtype='object')


In [27]:
quality_features[:5]

Unnamed: 0_level_0,condition,year_built,year_effective
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,,2010,
9,,2010,
13,FAIR,1976,
26,,2008,
27,FAIR,1981,


## Dummy Features

In [28]:
# get all the dummy features 

qr1 = "select t1.ren_id, t1.ve_number, t2.urv_id, t2.uvv_date_eff_from, t2.uvv_quality, t2.uvv_quantity from " 
qr2 = " t1 join (select distinct on (ve_number, urv_id) \
        ve_number, urv_id, uvv_date_eff_from, uvv_quality, uvv_quantity \
        from valuation_entities_details \
        where urv_ven_quality_ind like 'N' and urv_ven_quantity_ind like 'N'  \
        group by ve_number, urv_id, uvv_date_eff_from, uvv_quality, uvv_quantity \
        order by ve_number, urv_id, uvv_date_eff_from desc) t2  \
        on( t1.ve_number = t2.ve_number and t1.ren_date_eff_from > t2.uvv_date_eff_from )"
qr3 = ";"

fullqr = qr1+'train'+qr2+' union all '+qr1+'test'+qr2+qr3

%time ve_dummy = pd.read_sql_query(fullqr, engine)

CPU times: user 6.44 s, sys: 494 ms, total: 6.93 s
Wall time: 32.4 s


In [29]:
print ve_dummy.shape
ve_dummy[:5]

(2327154, 6)


Unnamed: 0,ren_id,ve_number,urv_id,uvv_date_eff_from,uvv_quality,uvv_quantity
0,2391247,61,89,1986-07-01,,
1,2658991,61,89,1986-07-01,,
2,890584,61,89,1986-07-01,,
3,2391247,61,195,2004-07-01,,
4,2391247,61,321,1986-07-01,,


In [30]:
# drop duplicates: 3 of them.
print len(ve_dummy.index)
ve_dummy.drop_duplicates(['ren_id', 've_number','urv_id'], keep='first', inplace=True)
print len(ve_dummy.index)

2327154
2327151


In [31]:
# we could look at the values, but we move on
#ve_dummy[ve_dummy.uvv_quantity.notnull()].sort_values(by='uvv_quantity', ascending=False)[:10]

In [32]:
dummy_quantity = ve_dummy[ve_dummy.uvv_quantity.notnull()].pivot('ren_id', 'urv_id', 'uvv_quantity')
dummy_quantity.iloc[:5]

urv_id,6,7,8,9,10,11,12,13,14,17,...,322,329,332,355,388,389,391,392,393,476
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
147,,,,,,,,,,,...,,,,,,,,,,
317,,,,,,,,,,,...,,,,,,,,,,
379,,,,,,,,,,,...,,,,,,,,,,
380,,,,,,,,,,,...,,,,,,,,,,
513,,,,,,,,,,,...,,,,,,,,,,


In [33]:
print dummy_quantity.shape

(47543, 65)


In [34]:
dummy_quantity = rename_columns(dummy_quantity)

In [35]:
dummy_quantity.notnull().sum().sort_values(ascending=False)

bore                       27511
below_ground_pool          13855
other_extra                 4076
lift                        2574
sauna                       1162
loft                         375
lotdth                       339
enclosed_back_veranda        193
office                       192
cabana                       187
open_patio                   187
enclosed_front_veranda       180
rumpus_room                  169
enclosed_patio               168
airconditioned_ducted        148
cellar                       112
reticulation                  89
pantry                        87
utility_room                  79
atrium                        75
gymnasium                     58
playroom                      56
above_ground_pool             41
attic                         28
proximity_to_city             25
                           ...  
no_airconditioning             4
close_to_church                4
river_views                    4
busy_road                      3
gazebo    

In [36]:
ve_dummy[:5]

Unnamed: 0,ren_id,ve_number,urv_id,uvv_date_eff_from,uvv_quality,uvv_quantity
0,2391247,61,89,1986-07-01,,
1,2658991,61,89,1986-07-01,,
2,890584,61,89,1986-07-01,,
3,2391247,61,195,2004-07-01,,
4,2391247,61,321,1986-07-01,,


In [37]:
# We keep a dummy variable
ve_dummy.uvv_quality = '1'

dummy_quality = ve_dummy[ve_dummy.uvv_quantity.isnull()].pivot('ren_id', 'urv_id', 'uvv_quality')
dummy_quality.iloc[:5]

urv_id,1,2,3,6,13,15,16,17,18,35,...,445,446,448,449,453,454,455,456,457,476
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
26,,,,,,,,,,,...,,,,,,,,,,
27,,,,,,,,,,,...,,,,,,,,,,


In [38]:
print dummy_quality.shape

(976583, 236)


In [39]:
dummy_quality = rename_columns(dummy_quality)

In [40]:
# let's see what they look like
dummy_quality.count().sort_values(ascending=False)

brick__walls                867341
tile_roof                   700370
iron_roof                   171179
street_allowance             99128
big_rooms                    41953
fibro_walls                  35889
brick_veneer                 35685
standard_of_finish           31469
below_ground_pool            31125
small_rooms                  28833
fibro_roof                   26660
general_views                16313
weather_board_walls          15631
river_views                  13481
ocean_views                  11930
precast_concrete_walls       10410
_community_facilities        10378
airconditioned_ducted         9348
common_walls                  7532
located_at_rear               7378
inferior_location             6558
timber_frame                  6169
general_attributes            6095
rear_units                    5824
location                      5772
                             ...  
gravel_soil                      3
public_toilet                    3
no_onsite_carpark   

In [41]:
# NOTE: This bus_stop feature appears twice and may need to be dropped but is not a high count
print 'bus_stop: ', dummy_quality['bus_stop'].iloc[:,0].notnull().sum()
# remove the duplicate columns
dummy_quality.drop(['bus_stop'], axis=1, inplace=True)

bus_stop:  9


In [42]:
print dummy_quality.columns.values

['dual_access' 'street_access' 'row_access' 'airconditioned_ducted'
 'library' 'public_pool' 'public_park' 'community_centre'
 'lot_has_city_views' 'lot_has_river_views' 'lot_has_ocean_views'
 'lot_has_land__views' 'lot_has_hill__views' 'lot_looks_over_a_park'
 'atrium' 'attic' 'general_attributes' 'extra_land' 'big_rooms'
 'large_shed' 'age_of_buildings' '_community_facilities' 'river_frontage'
 'canals_frontage' 'close_to_park' 'boat_shed' 'bore' 'cold_room' 'cabana'
 'canal_frontage' 'max__value_for_carcover' 'cellar' 'character'
 'change_room' 'steel_frame_and_iron_wall' 'iron_walls' 'stone_walls'
 'concrete_block_walls' 'brick__walls' 'steel_frame' 'timber_frame'
 'brick_veneer' 'weather_board_walls' 'other' 'brick_clad'
 'brick__iron_walls' 'stucco_walls' 'brick_walls_timber_frame'
 'brick__weathboard_walls' 'fibro_walls' 'rammed_earth'
 'precast_concrete_walls' 'limestone' 'design' 'block_of_flats'
 'service_station' 'public_toilet' 'sump' 'hotel' 'transformer'
 'kindergarden' '

In [43]:
# remove the columns in quality that are already reflected in quantity
dummy_overlap = set(dummy_quality.columns) & set(dummy_quantity.columns)
print dummy_overlap
for o in dummy_overlap:
    dummy_quality.drop(o, axis=1, inplace=True)
print set(dummy_quality.columns) & set(dummy_quantity.columns)

set(['office', 'attic', 'house', 'gymnasium', 'enclosed_back_veranda', 'airconditioned_ducted', 'iron_roof', 'service_station', 'atrium', 'bore', 'enclosed_patio', 'general_detriments', 'loft', 'wall_airconditioning', 'enclosed_front_veranda', 'solarium', 'cabana', 'hall', 'busy_intersection', 'tile_roof', 'utility_room', 'reticulation', 'max__value_for_carcover', 'fibro_roof', 'lake__views', 'shops', 'ocean_views', 'river_views', 'cellar', 'above_ground_pool', 'lotdth', 'large_shed', 'big_rooms', 'rumpus_room', 'rear_units', 'other_extra', 'lift', 'pantry', 'entry', 'extra_land', 'sauna', 'gallery', 'close_to_church', 'playroom', 'gazebo', 'open_patio', 'busy_road', 'vestibule', 'no_airconditioning', 'below_ground_pool', 'foyer', 'public_park', 'enclosed_side_veranda'])
set([])


In [44]:
# let's see if the deleted columns in quality are really in quantity
dummy_quantity.columns

Index([u'airconditioned_ducted', u'shopping_centre', u'church',
       u'service_station', u'liquor_outlet', u'primary_school',
       u'secondary_school', u'bus_stop', u'train_stop', u'public_park',
       u'atrium', u'attic', u'extra_land', u'big_rooms', u'large_shed',
       u'building_depth', u'building_frontage', u'bore', u'cabana',
       u'max__value_for_carcover', u'cellar', u'close_to_church',
       u'no_airconditioning', u'general_detriments', u'rear_units', u'shops',
       u'busy_intersection', u'busy_road', u'wall_airconditioning',
       u'enclosed_back_veranda', u'enclosed_front_veranda', u'entry',
       u'enclosed_side_veranda', u'foyer', u'gallery', u'gazebo', u'gymnasium',
       u'hall', u'house', u'lift', u'loft', u'lotdth', u'other_extra',
       u'office', u'pantry', u'enclosed_patio', u'open_patio', u'playroom',
       u'below_ground_pool', u'above_ground_pool', u'proximity_to_city',
       u'_rate_per_m2_comind_pro', u'_rate_per_carbay_com_pr', u'reticulation'

In [45]:
# let's review what we have
df_summary(dummy_quantity)

Unnamed: 0,feature,unique,count,example_values
41,lotdth,108,339,"[nan, 28.98, 39.23, 35.0, 31.82, 37.22, 31.0, ..."
50,proximity_to_city,12,25,"[nan, 35.0, 20.0, 7.0, 17.0, 14.0, 5.0, 10.0, ..."
15,building_depth,5,15,"[nan, 74.3, 0.01, 0.5, 70.0]"
42,other_extra,4,4076,"[nan, 1.0, 2.0, 1.5]"
1,shopping_centre,4,8,"[nan, 3.0, 2.0, 5.0]"
0,airconditioned_ducted,3,148,"[nan, 1.0, 2.0]"
62,ocean_views,3,18,"[nan, 1.0, 2.0]"
60,vestibule,3,23,"[nan, 1.0, 2.0]"
51,_rate_per_m2_comind_pro,3,9,"[nan, 120.0, 115.0]"
45,enclosed_patio,3,168,"[nan, 1.0, 0.5]"


In [46]:
dummy_quantity[:5]

Unnamed: 0_level_0,airconditioned_ducted,shopping_centre,church,service_station,liquor_outlet,primary_school,secondary_school,bus_stop,train_stop,public_park,...,iron_roof,rumpus_room,sauna,solarium,utility_room,vestibule,river_views,ocean_views,lake__views,fibro_roof
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
147,,,,,,,,,,,...,,,,,,,,,,
317,,,,,,,,,,,...,,,,,,,,,,
379,,,,,,,,,,,...,,,,,,,,,,
380,,,,,,,,,,,...,,,,,,,,,,
513,,,,,,,,,,,...,,,,,,,,,,


In [47]:
dummy_quality[:5]

Unnamed: 0_level_0,dual_access,street_access,row_access,library,public_pool,community_centre,lot_has_city_views,lot_has_river_views,lot_has_ocean_views,lot_has_land__views,...,max_depc'n_govt_asset,unit_position,laneway,phone_tower,marina_views,brick__fibro_walls,timber_frame__fibro_w,timber_framebrickfib_wa,steel_frame_and_fibro,weatherboardfibro_wall
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,,,,,,,,,,
26,,,,,,,,,,,...,,,,,,,,,,
27,,,,,,,,,,,...,,,,,,,,,,


In [48]:
# clear up some space, we are going to need it
import gc
gc.collect()

200

## Demographics Features

In [49]:
# get all the demographic features

qd1 = "select distinct on (ren_id) t1.ren_id, t1.ve_number, t3.lan_id, t3.lnp_pin, t3.sa1_7,  \
area_albers_sqm, gccsa_code_2011, sa2_5digitcode_2011, \
sa3_code_2011, sa4_code_2011, state_code_2011,  \
poacode, ra_code11, code, movie_titles, \
groups, groups_1, groups_2, \
predominant_lifestage, financial_status, worklife, area_wealth_dynamic, \
stability_indicator, featurecodegroup, feature_code  \
from "
qd2 = " t1 \
left join land_valuation_key t2 on (t1.ve_number = t2.ve_number) \
left join demographics_key t3 on (t2.lan_id = t3.lan_id) \
left join demographics t4 on (t3.sa1_7 = t4.sa1_7) "
qd3 = ";"

fullqd = qd1+' train '+qd2+' union all '+qd1+' test '+qd2+qd3

%time demo = pd.read_sql_query(fullqd, engine)
print len(demo.ren_id.unique()), demo.shape
demo[:2]

CPU times: user 12.8 s, sys: 1.24 s, total: 14 s
Wall time: 32.7 s
985078 (985078, 25)


Unnamed: 0,ren_id,ve_number,lan_id,lnp_pin,sa1_7,area_albers_sqm,gccsa_code_2011,sa2_5digitcode_2011,sa3_code_2011,sa4_code_2011,...,groups,groups_1,groups_2,predominant_lifestage,financial_status,worklife,area_wealth_dynamic,stability_indicator,featurecodegroup,feature_code
0,6,4485377,1385714,11521697,5110327,489217,5GPER,51103,50503,505,...,G05,Younger Families,Family,Young families,Net worth very low,Mix of working people and professionals,Affluence slowly improving,Fairly settled and less likely to move,5,21
1,9,3747416,4426471,11903453,5105705,225176,5GPER,51057,50403,504,...,G05,Younger Families,Family,Young families,Net worth very low,Mix of working people and professionals,Affluence slowly improving,Fairly settled and less likely to move,5,21


In [50]:
# # the sql query has no dupes, so we skip this step
# print demo.shape
# demo.drop_duplicates(['ren_id'], keep='last', inplace=True)
# print demo.shape

In [51]:
df_summary(demo)

Unnamed: 0,feature,unique,count,example_values
0,ren_id,985078,985078,"[6, 9, 13, 26, 27, 41, 42, 59, 61]"
2,lan_id,289203,985020,"[1385714.0, 4426471.0, 4536296.0, 249501.0, 48..."
1,ve_number,288444,985076,"[4485377.0, 3747416.0, 481945.0, 719761.0, 777..."
3,lnp_pin,179499,414628,"[11521697.0, 11903453.0, nan, 697365.0, 517638..."
4,sa1_7,5042,985005,"[5110327.0, 5105705.0, 5104623.0, 5111821.0, 5..."
5,area_albers_sqm,5016,985005,"[489217.0, 225176.0, 230739.0, 157454.0, 11886..."
11,poacode,236,985005,"[6065.0, 6063.0, 6051.0, 6104.0, 6060.0, 6722...."
7,sa2_5digitcode_2011,230,985005,"[51103.0, 51057.0, 51046.0, 51118.0, 51095.0, ..."
14,movie_titles,51,985005,"[The Parent Trap, Home Alone, Mad Max, Trading..."
13,code,51,985005,"[21.0, 29.0, 11.0, 28.0, 39.0, 22.0, 48.0, 9.0..."


In [52]:
import gc 
gc.collect()

176

In [53]:
transform_cols = [u'movie_titles', u'groups', u'groups_1', u'groups_2', u'predominant_lifestage', u'financial_status',
                  u'worklife', u'area_wealth_dynamic', u'stability_indicator']

In [54]:
# encode the categorical features
from sklearn import preprocessing
% time demo_encoded = demo[transform_cols].apply(preprocessing.LabelEncoder().fit_transform) 
demo_encoded[:4]

CPU times: user 25.8 s, sys: 116 ms, total: 25.9 s
Wall time: 25.9 s


Unnamed: 0,movie_titles,groups,groups_1,groups_2,predominant_lifestage,financial_status,worklife,area_wealth_dynamic,stability_indicator
0,42,5,10,5,13,10,4,12,3
1,42,5,10,5,13,10,4,12,3
2,17,8,5,9,9,2,2,12,11
3,20,7,2,8,1,1,10,12,1


In [55]:
# # and make the categoricals strings
# # TODO: use Pandas Categorical types?
%time demo_encoded = demo_encoded.apply(lambda y: ['A'+str(x) if pd.notnull(x) else x for x in y])
demo_encoded[:4]

CPU times: user 21.5 s, sys: 391 ms, total: 21.9 s
Wall time: 21.7 s


Unnamed: 0,movie_titles,groups,groups_1,groups_2,predominant_lifestage,financial_status,worklife,area_wealth_dynamic,stability_indicator
0,A42,A5,A10,A5,A13,A10,A4,A12,A3
1,A42,A5,A10,A5,A13,A10,A4,A12,A3
2,A17,A8,A5,A9,A9,A2,A2,A12,A11
3,A20,A7,A2,A8,A1,A1,A10,A12,A1


In [56]:
demo_categorical_features = ['sa2_5digitcode_2011', 'state_code_2011', 'poacode', 'sa1_7', 'feature_code', 
                              'sa4_code_2011', 'featurecodegroup','sa3_code_2011', 'ra_code11', 'code']

In [57]:
# # categorical
# demo_cat = demo[demo_categorical_features].apply(lambda y: ['A'+str(int(x)) if pd.notnull(x) else x for x in y])
demo_cat = demo[demo_categorical_features]
demo_cat[:5]

Unnamed: 0,sa2_5digitcode_2011,state_code_2011,poacode,sa1_7,feature_code,sa4_code_2011,featurecodegroup,sa3_code_2011,ra_code11,code
0,51103,5,6065,5110327,21,505,5,50503,50,21
1,51057,5,6063,5105705,21,504,5,50403,50,21
2,51046,5,6051,5104623,29,504,8,50401,50,29
3,51118,5,6104,5111821,11,506,7,50602,50,11
4,51095,5,6060,5109506,28,505,3,50502,50,28


In [58]:
demo_orig_cols = list((set(demo.columns) - set(demo_encoded.columns)) - set(demo_cat.columns))
demo_orig_cols

[u've_number',
 u'lnp_pin',
 u'lan_id',
 u'ren_id',
 u'area_albers_sqm',
 u'gccsa_code_2011']

In [59]:
demo.columns

Index([u'ren_id', u've_number', u'lan_id', u'lnp_pin', u'sa1_7',
       u'area_albers_sqm', u'gccsa_code_2011', u'sa2_5digitcode_2011',
       u'sa3_code_2011', u'sa4_code_2011', u'state_code_2011', u'poacode',
       u'ra_code11', u'code', u'movie_titles', u'groups', u'groups_1',
       u'groups_2', u'predominant_lifestage', u'financial_status', u'worklife',
       u'area_wealth_dynamic', u'stability_indicator', u'featurecodegroup',
       u'feature_code'],
      dtype='object')

In [60]:
# we should not have lost any rows by this point
print demo.shape
print demo_encoded.shape
print demo_cat.shape

(985078, 25)
(985078, 9)
(985078, 10)


In [61]:
demo_all = pd.concat([demo[demo_orig_cols], demo_cat, demo_encoded], axis=1)
print demo_all.shape

(985078, 25)


In [62]:
demo_all.drop('ve_number', axis=1,inplace=True)
demo_all.set_index('ren_id', inplace=True)
demo_all[:3]

Unnamed: 0_level_0,lnp_pin,lan_id,area_albers_sqm,gccsa_code_2011,sa2_5digitcode_2011,state_code_2011,poacode,sa1_7,feature_code,sa4_code_2011,...,code,movie_titles,groups,groups_1,groups_2,predominant_lifestage,financial_status,worklife,area_wealth_dynamic,stability_indicator
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,11521697.0,1385714,489217,5GPER,51103,5,6065,5110327,21,505,...,21,A42,A5,A10,A5,A13,A10,A4,A12,A3
9,11903453.0,4426471,225176,5GPER,51057,5,6063,5105705,21,504,...,21,A42,A5,A10,A5,A13,A10,A4,A12,A3
13,,4536296,230739,5GPER,51046,5,6051,5104623,29,504,...,29,A17,A8,A5,A9,A9,A2,A2,A12,A11


## Land Features

In [63]:
qe1 = "select distinct on (ren_id) ren_id, lan_water, lan_drainage, sub_postcode from " 
qe2 = " t1 left join land_valuation_key t2 on (t1.ve_number = t2.ve_number) \
    left join land t3 on (t2.lan_id = t3.lan_id) "

fullqe = qe1+' train '+qe2+' union all '+qe1+' test '+qe2+';'

%time land = pd.read_sql_query(fullqe, engine)
land[:2]

CPU times: user 2.48 s, sys: 247 ms, total: 2.73 s
Wall time: 12.7 s


Unnamed: 0,ren_id,lan_water,lan_drainage,sub_postcode
0,6,N,N,6065
1,9,N,N,6063


In [64]:
print land.shape, len(train.index) + len(test.index)
if len(land.index) != len(train.index) + len(test.index):
    raise Exception('Something went wrong with the query')

(985078, 4) 985078


In [65]:
land.set_index('ren_id', inplace=True)
print land.shape
land[:2]

(985078, 3)


Unnamed: 0_level_0,lan_water,lan_drainage,sub_postcode
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,N,N,6065
9,N,N,6063


In [66]:
# # make postcode categorical 
#land['sub_postcode'] = ['P'+str(int(x)) if pd.notnull(x) else x for x in land.sub_postcode]
#land[:2]

In [67]:
# find records with all null's
print land.shape
land.isnull().sum()

(985078, 3)


lan_water       8
lan_drainage    8
sub_postcode    8
dtype: int64

In [68]:
# we only have 8 null's so we will leave them

## Classification Features

In [69]:
qc1 = "select distinct on (ren_id) ren_id, t2.vec_cls_code, t2.cls_ve_use,  t3.cls_multi_res_ind from "
qc2 = " t1 \
    left join valuation_entities_classifications t2 \
    on (t1.ve_number = t2.ve_number and t1.ren_date_eff_from > t2.vec_date_eff_from) \
    left join valuation_entities_classifications t3 \
    on (t2.ve_number = t3.ve_number and t3.vec_date_eff_from > t2.vec_date_eff_from) "
qc3 = "group by t1.ren_id, t1.ve_number, t2.vec_cls_code, t2.cls_ve_use, t3.cls_multi_res_ind, t3.vec_date_eff_from ;"

fullqc = qc1+' train '+qc2+' union all '+qc1+' test '+qc2+qc3

%time classf = pd.read_sql_query(fullqc, engine)
classf[:2]

CPU times: user 2.39 s, sys: 291 ms, total: 2.68 s
Wall time: 39.7 s


Unnamed: 0,ren_id,vec_cls_code,cls_ve_use,cls_multi_res_ind
0,6,105,V,N
1,9,75,R,


In [70]:
print classf.shape
if len(classf.index) != len(train.index) + len(test.index):
    raise Exception('Bad query for classification features')

(985078, 4)


In [71]:
classf[:10]

Unnamed: 0,ren_id,vec_cls_code,cls_ve_use,cls_multi_res_ind
0,6,105,V,N
1,9,75,R,
2,13,60,R,
3,26,87,R,
4,27,75,R,
5,41,10,R,
6,42,10,R,
7,59,10,R,
8,61,50,R,
9,65,10,R,


In [72]:
classf.cls_ve_use.value_counts()

R    892371
V     85450
C      1410
M       104
I        66
F        52
Name: cls_ve_use, dtype: int64

In [73]:
# # make categorical 
classf['vec_cls_code'] = ['C'+str(int(x)) if pd.notnull(x) else x for x in classf.vec_cls_code]
classf[:2]

Unnamed: 0,ren_id,vec_cls_code,cls_ve_use,cls_multi_res_ind
0,6,C105,V,N
1,9,C75,R,


In [74]:
classf.cls_multi_res_ind.unique()

array([u'N', None, u'Y'], dtype=object)

In [75]:
classf.set_index('ren_id', inplace=True)
classf[:2]

Unnamed: 0_level_0,vec_cls_code,cls_ve_use,cls_multi_res_ind
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
6,C105,V,N
9,C75,R,


In [76]:
print classf.shape
classf.isnull().sum()

(985078, 3)


vec_cls_code           5625
cls_ve_use             5625
cls_multi_res_ind    806627
dtype: int64

In [77]:
# # impute , not just yet
# classf.loc[classf.cls_multi_res_ind.isnull(),'cls_multi_res_ind'] = 'N'

In [78]:
import gc
gc.collect()

125

## Distances

In [79]:
qf1 = "select distinct on (ren_id) ren_id, pin, \
    distance_coast,distance_hospital,distance_school_poly,distance_reserve,distance_waterbody,\
    distance_arterialroad,distance_gpo,distance_golfcourse,distance_university,distance_freeway,\
    distance_shoppingcentre,distance_trainstation,distance_railline,distance_airport from " 
qf2 = " t1 left join land_valuation_key t2 on (t1.ve_number = t2.ve_number) \
    left join land_pins t3 on (t2.lan_id = t3.lan_id and t3.lnp_pin is not null) \
    left join distances t4 on (t3.lnp_pin = t4.pin) "

fullqf = qf1+' train '+qf2+' union all '+qf1+' test '+qf2+';'

%time distances = pd.read_sql_query(fullqf, engine)
print distances.shape
distances[:5]

CPU times: user 4.32 s, sys: 687 ms, total: 5 s
Wall time: 7min 37s
(985078, 16)


Unnamed: 0,ren_id,pin,distance_coast,distance_hospital,distance_school_poly,distance_reserve,distance_waterbody,distance_arterialroad,distance_gpo,distance_golfcourse,distance_university,distance_freeway,distance_shoppingcentre,distance_trainstation,distance_railline,distance_airport
0,6,11521697.0,10052.0,5940.0,795.0,178.0,1307.0,679.0,15263.0,1668.0,5116.0,6072.0,652.0,6186.0,6082.0,15720.0
1,9,11903453.0,16385.0,6511.0,739.0,168.0,487.0,57.0,11984.0,1807.0,6674.0,10253.0,0.0,5206.0,2382.0,7588.0
2,13,,,,,,,,,,,,,,,
3,26,,,,,,,,,,,,,,,
4,27,,,,,,,,,,,,,,,


In [80]:
if len(distances.index) != len(train.index) + len(test.index):
    raise Exception('Bad query for distances features')

In [81]:
distances.set_index('ren_id', inplace=True)
distances[:2]

Unnamed: 0_level_0,pin,distance_coast,distance_hospital,distance_school_poly,distance_reserve,distance_waterbody,distance_arterialroad,distance_gpo,distance_golfcourse,distance_university,distance_freeway,distance_shoppingcentre,distance_trainstation,distance_railline,distance_airport
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
6,11521697,10052,5940,795,178,1307,679,15263,1668,5116,6072,652,6186,6082,15720
9,11903453,16385,6511,739,168,487,57,11984,1807,6674,10253,0,5206,2382,7588


In [82]:
import gc
gc.collect()

797

# Big Merge

In [83]:
allset = train.append(test)
if len(allset) != (len(train) + len(test)):
    raise Exception('Failure in append')

allset[:2]

Unnamed: 0_level_0,ren_base_rent,ren_date_eff_from,ren_lease_length,ve_number
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1579258,280,1990-02-13,,4807702
1203979,115,1990-04-13,,332135


In [101]:
# so how many columns do we have?
allcols =  len(allset.columns) + len(demo_all.columns) + len(quantity_features.columns) \
    + len(quality_features.columns) + len(dummy_quantity.columns) + len(dummy_quality.columns) \
    + len(land.columns) + len(classf.columns) + len(distances.columns)
allcols

370

In [85]:
# merge into one big table
allup = pd.concat([allset, demo_all, quantity_features, quality_features, dummy_quantity, dummy_quality, 
           land, classf, distances], axis=1, join_axes=[allset.index])

print allset.shape, allup.shape
if (len(allset.index) != len(allup.index)) | (len(allup.columns) != allcols):
    raise Exception('Not sure this merge is good')

(985078, 4) (985078, 370)


In [86]:
allup[:5]

Unnamed: 0_level_0,ren_base_rent,ren_date_eff_from,ren_lease_length,ve_number,lnp_pin,lan_id,area_albers_sqm,gccsa_code_2011,sa2_5digitcode_2011,state_code_2011,...,distance_waterbody,distance_arterialroad,distance_gpo,distance_golfcourse,distance_university,distance_freeway,distance_shoppingcentre,distance_trainstation,distance_railline,distance_airport
ren_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1579258,280,1990-02-13,,4807702,360151.0,650100,131756,5GPER,51188,5,...,434.0,531.0,39071.0,2980.0,1868.0,9458.0,656.0,2584.0,2566.0,6856.0
1203979,115,1990-04-13,,332135,,560980,171145,5GPER,51177,5,...,,,,,,,,,,
1431889,150,1990-04-14,,1647835,,2897191,341092,5RWAU,51210,5,...,,,,,,,,,,
1025129,100,1990-09-21,,663393,,4998891,159324,5GPER,51119,5,...,,,,,,,,,,
5191246,80,1990-10-03,,2511508,682741.0,4666462,403410,5RWAU,51199,5,...,1816.0,441.0,551513.0,2739.0,72.0,547169.0,351.0,1022.0,999.0,2965.0


In [102]:
# # we need to re-impute all the new rows!

# allup.loc[:,quantity_features.columns] = allup.loc[:,quantity_features.columns].fillna(0)
# allup.loc[:,quality_features.columns] = allup.loc[:,quality_features.columns].fillna('0')
# allup.loc[:,dummy_quantity.columns] = allup.loc[:,dummy_quantity.columns].fillna(0)
# allup.loc[:,dummy_quality.columns] = allup.loc[:,dummy_quality.columns].fillna('0')
# allup.loc[allup.cls_multi_res_ind.isnull(),'cls_multi_res_ind'] = 'N'

In [90]:
allup.columns.values

array(['ren_base_rent', 'ren_date_eff_from', 'ren_lease_length',
       've_number', u'lnp_pin', u'lan_id', u'area_albers_sqm',
       u'gccsa_code_2011', u'sa2_5digitcode_2011', u'state_code_2011',
       u'poacode', u'sa1_7', u'feature_code', u'sa4_code_2011',
       u'featurecodegroup', u'sa3_code_2011', u'ra_code11', u'code',
       u'movie_titles', u'groups', u'groups_1', u'groups_2',
       u'predominant_lifestage', u'financial_status', u'worklife',
       u'area_wealth_dynamic', u'stability_indicator', 'activity_room',
       'airconditioned', 'house_area', 'office_area_1', 'shop_area',
       'store_room_area_1', 'warehouse_area_1', 'factory_area_1',
       'any_other_area', 'basement_area', 'shed_area_1', 'total_area',
       'mezzanine_area', 'workshop_area_1', 'bar', 'bath_room', 'bedrooms',
       'single_bedrooms', 'carbay_under_cover', 'tandem_carbay',
       'open_car_bay', 'carport_under_main_roof', 'carport_attached',
       'carport_detached', 'dining_room', 'dressing

In [91]:
import gc
gc.collect()

26

# OUTPUT

In [94]:
# split train and test again
train_final = allup.loc[train.index].copy()
test_final = allup.loc[test.index].copy()

In [95]:
print allup.shape
print train_final.shape
print test_final.shape

(985078, 370)
(834570, 370)
(150508, 370)


In [96]:
train_final.to_csv('train_full.csv')

In [97]:
test_final.to_csv('test_full.csv')

In [98]:
allup.to_csv('train_test_full.csv')

In [99]:
end_time = pd.datetime.now()
elapsed_time = end_time - start_time
print elapsed_time

2:35:32.085424


In [100]:
gc.collect()

0