In [1]:
import os
import re
import sys
import operator
import numpy as np
from keras.preprocessing import sequence
from keras.layers import Dropout
from keras.utils import np_utils
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import math
# Directory that holds train.json / test.json. Set the RENTAL_LISTING_DIR
# environment variable to point somewhere else without editing the notebook;
# the original author's path is kept as the fallback.
os.chdir(os.environ.get('RENTAL_LISTING_DIR', '/Volumes/pd stuff/Work/kaggle/rental_listing'))
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils.np_utils import to_categorical

Using Theano backend.


In [2]:
train_file = "train.json"
test_file = "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)

features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]


def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that never yields inf or NaN.

    A zero denominator (e.g. a listing with no bedrooms) is treated as 1, so the
    ratio falls back to the numerator itself. Plain division would produce
    inf (x/0) or NaN (0/0), which a neural network cannot train on.
    """
    return numerator / denominator.where(denominator != 0, 1)


def _add_listing_features(df, reference_time):
    """Add the engineered columns to ``df`` in place and return it.

    Parameters
    ----------
    df : DataFrame with the raw listing columns read from the JSON file.
    reference_time : Timestamp that "passed" is measured against; the same
        value is used for train and test so the feature is comparable.
    """
    # price per bedroom / per room, and the share of rooms that are bedrooms #
    df["price_t"] = _safe_ratio(df["price"], df["bedrooms"])
    df["room_dif"] = df["bedrooms"] - df["bathrooms"]
    df["room_sum"] = df["bedrooms"] + df["bathrooms"]
    df["price_t1"] = _safe_ratio(df["price"], df["room_sum"])
    df["fold_t1"] = _safe_ratio(df["bedrooms"], df["room_sum"])

    # count of photos #
    df["num_photos"] = df["photos"].apply(len)

    # count of "features" #
    df["num_features"] = df["features"].apply(len)

    # count of words present in description column #
    df["num_description_words"] = df["description"].apply(lambda x: len(x.split(" ")))

    # whole days between the listing and the reference time; .dt.days keeps the
    # sign, whereas parsing str(timedelta) with a regex turned "-1 days" into 1
    df["created"] = pd.to_datetime(df["created"])
    df["passed"] = (reference_time - df["created"]).dt.days.astype(int)

    # year, month, day and hour of the creation timestamp #
    df["created_year"] = df["created"].dt.year
    df["created_month"] = df["created"].dt.month
    df["created_day"] = df["created"].dt.day
    df["created_hour"] = df["created"].dt.hour
    return df


# "passed" is measured from the most recent *training* listing for both frames.
latest_train_listing = pd.to_datetime(train_df["created"]).max()
train_df = _add_listing_features(train_df, latest_train_listing)
test_df = _add_listing_features(test_df, latest_train_listing)

(49352, 15)
(74659, 14)


In [3]:
# 1 when the display address contains a digit, 0 otherwise.
# NOTE(review): "street_number_men" is created here but is not added to
# features_to_use in this cell - confirm whether that is intentional.
for frame in (train_df, test_df):
    frame["street_number_men"] = frame["display_address"].apply(lambda x: int(bool(re.search(r'\d',x))))
features_to_use.extend(["price_t","num_photos", "num_features", "num_description_words", 
                        "created_year", "created_month", "created_day", "created_hour",
                        "listing_id",'price_t1'])

# Reference coordinates (latitude, longitude).
# NOTE(review): the high/mid/low points presumably come from per-interest-level
# statistics computed elsewhere - the derivation is not shown in this notebook.
ny_lat = 40.785091
ny_lon = -73.964613
high_lat= 40.748007
high_lon= -73.968285
low_lat= 40.739504
low_lon= -73.951667
mid_lat= 40.745567
mid_lon= -73.965033

# Hard-coded mean description lengths used as reference values below.
mean_description_words_high=91.258140
mean_description_words_mid=97.733547
mean_description_words_low=87.525201


def _distance_to(frame, ref_lat, ref_lon):
    """Euclidean distance, in raw degrees, from each listing to (ref_lat, ref_lon).

    Returns a plain numpy array in row order, so the assignment is positional
    exactly like the list-based loops it replaces.
    """
    lat_offset = frame['latitude'].values - ref_lat
    lon_offset = frame['longitude'].values - ref_lon
    return np.sqrt(lat_offset ** 2 + lon_offset ** 2)


# (output column, reference latitude, reference longitude)
reference_points = [
    ('distance_to_ny_train', ny_lat, ny_lon),
    ('distance_to_high', high_lat, high_lon),
    ('distance_to_low', low_lat, low_lon),
    ('distance_to_mid', mid_lat, mid_lon),
]
# (output column, reference word count)
description_means = [
    ('word_distance_high', mean_description_words_high),
    ('word_distance_mid', mean_description_words_mid),
    ('word_distance_low', mean_description_words_low),
]

for frame in (train_df, test_df):
    for column, ref_lat, ref_lon in reference_points:
        frame[column] = _distance_to(frame, ref_lat, ref_lon)
    for column, mean_words in description_means:
        # signed difference between the listing's word count and the reference mean
        frame[column] = frame['num_description_words'] - mean_words

features_to_use.extend(['distance_to_ny_train','distance_to_high','distance_to_low','distance_to_mid','word_distance_high','word_distance_mid','word_distance_low'])

In [4]:
# Inspect the dtypes of the columns selected so far in features_to_use.
train_df[features_to_use].dtypes

bathrooms                float64
bedrooms                   int64
latitude                 float64
longitude                float64
price                      int64
price_t                  float64
num_photos                 int64
num_features               int64
num_description_words      int64
created_year               int64
created_month              int64
created_day                int64
created_hour               int64
listing_id                 int64
price_t1                 float64
distance_to_ny_train     float64
distance_to_high         float64
distance_to_low          float64
distance_to_mid          float64
word_distance_high       float64
word_distance_mid        float64
word_distance_low        float64
dtype: object

In [5]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for column in categorical:
    # Only string (object) columns are encoded; anything else is left alone
    # and is not added to the feature list.
    if train_df[column].dtype != 'object':
        continue
    # Fit on train + test together so every label seen in either frame has a code.
    combined_values = list(train_df[column].values)
    combined_values.extend(list(test_df[column].values))
    encoder = preprocessing.LabelEncoder()
    encoder.fit(combined_values)
    train_df[column] = encoder.transform(list(train_df[column].values))
    test_df[column] = encoder.transform(list(test_df[column].values))
    features_to_use.append(column)


def _join_feature_tags(tags):
    """Turn a list of feature phrases into one string of underscore-joined tokens."""
    return " ".join(tag.replace(" ", "_") for tag in tags)


train_df['features'] = train_df["features"].apply(_join_feature_tags)
test_df['features'] = test_df["features"].apply(_join_feature_tags)
print(train_df["features"].head())

# Despite its name, `tfidf` is a plain CountVectorizer (raw term counts),
# fitted on the training text only and then applied to the test text.
tfidf = CountVectorizer(stop_words='english', max_features=500)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

10                                                         
10000     Doorman Elevator Fitness_Center Cats_Allowed D...
100004    Laundry_In_Building Dishwasher Hardwood_Floors...
100007                               Hardwood_Floors No_Fee
100013                                              Pre-War
Name: features, dtype: object


In [6]:
# Give both frames a plain 0..n-1 index so they line up row-for-row with the
# default-indexed count-vector frames when concatenated column-wise later.
for frame in (train_df, test_df):
    frame.index = range(frame.shape[0])

In [7]:
def _to_dense_frame(matrix):
    """Return ``matrix`` as a DataFrame, densifying it first when it is sparse.

    Anything without a ``toarray`` method (e.g. a DataFrame produced by an
    earlier run of this cell) is passed to ``pd.DataFrame`` unchanged, so the
    cell can be re-executed without raising AttributeError.
    """
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    return pd.DataFrame(matrix)


tr_sparse = _to_dense_frame(tr_sparse)
te_sparse = _to_dense_frame(te_sparse)

In [8]:
# Put the engineered columns and the count-vector columns side by side.
# ignore_index=True relabels the resulting columns 0..n-1.
train_parts = [train_df[features_to_use], tr_sparse]
test_parts = [test_df[features_to_use], te_sparse]
train_X = pd.concat(train_parts, axis=1, ignore_index=True)
test_X = pd.concat(test_parts, axis=1, ignore_index=True)

In [9]:
# Sanity check: (rows, columns) of the assembled test matrix.
test_X.shape

(74659, 526)

In [10]:
# Sanity check: (rows, columns) of the assembled training matrix; the column
# count should match the test matrix.
train_X.shape

(49352, 526)

In [11]:
# Integer id for every interest level: 0 = high, 1 = medium, 2 = low.
target_num_map = {'high':0, 'medium':1, 'low':2}


def _interest_to_id(level):
    """Look up the integer id of one interest level (KeyError if unknown)."""
    return target_num_map[level]


interest_ids = train_df['interest_level'].apply(_interest_to_id)
train_y = np.array(interest_ids)
print(train_X.shape, test_X.shape)

((49352, 526), (74659, 526))


In [12]:
# One-hot encode the integer class ids for the softmax output layer.
# NOTE: this overwrites train_y, so running the cell twice would encode the
# already-encoded array again.
train_y =np_utils.to_categorical(train_y)

In [13]:
# Display the encoded targets (one row per listing, one column per class).
train_y

array([[ 0.,  1.,  0.],
       [ 0.,  0.,  1.],
       [ 1.,  0.,  0.],
       ..., 
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.],
       [ 0.,  0.,  1.]])

In [14]:
# Convert the feature frames to dense float arrays for Keras.
# The previous code went through sequence.pad_sequences, which is meant for
# variable-length integer sequences and returns int32 by default: every
# fractional feature (latitude, longitude, distances, price ratios, ...) was
# silently truncated to an integer. np.asarray keeps the values intact and
# also works if the cell is re-run on an existing array.
# NOTE(review): the columns are still on very different scales; consider
# standardising them before feeding the sigmoid layers.
train_X = np.asarray(train_X, dtype='float32')
test_X = np.asarray(test_X, dtype='float32')

In [None]:
# Alternative input: only the engineered columns in features_to_use, without
# the count-vector columns. Running this cell replaces the matrices built
# above it.
# Plain float conversion is used instead of sequence.pad_sequences, whose
# default int32 output would truncate the fractional features.
train_X = np.asarray(train_df[features_to_use], dtype='float32')
test_X = np.asarray(test_df[features_to_use], dtype='float32')

In [None]:
# Make sure both matrices are numpy arrays. np.asarray accepts a DataFrame as
# well as an existing array, so this no longer fails when the conversion has
# already happened, and it does not rely on DataFrame.as_matrix(), which
# recent pandas versions no longer provide.
train_X = np.asarray(train_X)
test_X = np.asarray(test_X)

In [None]:
# Sanity check: (rows, columns) of the test matrix at this point.
test_X.shape

In [15]:
def baseline_model(input_dim=None, hidden_units=300, dropout_rate=0.35, n_classes=3):
    """Build and compile a two-hidden-layer softmax classifier.

    Architecture: Dense(sigmoid) -> Dropout -> Dense(sigmoid) -> Dropout ->
    Dense(softmax), compiled with categorical cross-entropy and Adam.
    Called without arguments it builds exactly the network the notebook
    originally hard-coded.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features. When omitted, the width of the notebook-level
        ``train_X`` matrix is used, as before.
    hidden_units : int
        Width of each of the two hidden layers.
    dropout_rate : float
        Dropout fraction applied after each hidden layer.
    n_classes : int
        Number of output classes (size of the softmax layer).

    Returns
    -------
    The compiled ``Sequential`` model.
    """
    if input_dim is None:
        # Fall back to the global feature matrix so existing calls keep working.
        input_dim = train_X.shape[1]

    model = Sequential()
    model.add(Dense(hidden_units, input_dim=input_dim, init='normal', activation='sigmoid'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_units, init='normal', activation='sigmoid'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(n_classes, init='normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

In [None]:
# Per-class loss weights, keyed by the ids in target_num_map
# (0 = high, 1 = medium, 2 = low).
class_weight = {0: 8, 1: 3, 2: 1}
fit_options = dict(nb_epoch=1000, batch_size=100, class_weight=class_weight)

model = baseline_model()
model.fit(train_X, train_y, **fit_options)

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000

In [17]:
# Predict class probabilities for the test listings and write the submission.
# Column order follows the class ids: 0 = high, 1 = medium, 2 = low.
preds = model.predict(test_X)
out_df = pd.DataFrame(preds, columns=["high", "medium", "low"])
out_df["listing_id"] = test_df["listing_id"].values
out_df.to_csv("xgb_main_keras_last_stret_number_my_count_features.csv", index=False)

In [18]:
# Peek at the first 100 rows of predicted probabilities (columns: high, medium, low).
preds[0:100,:]

array([[ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.34433517,  0.32718781,  0.32847703],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.34433517,  0.32718781,  0.32847703],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.34433517,  0.32718781,  0.32847703],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.34433517,  0.32718781,  0.32847703],
       [ 0.34433517,  0.32718781,  0.32847703],
       [ 0.34433517,  0.32718781,  0.32847703],
       [ 0.30399293,  0.34628016,  0.34972692],
       [ 0.34433517,  0.32718781,  0.328

In [None]:
# Number of training listings per interest level (class balance of the target).
train_df['interest_level'].value_counts()

In [None]:
# Ratio of two class counts - presumably 'low' vs 'high' listings; TODO confirm
# against the value_counts output.
# NOTE(review): under Python 2 this is integer division and yields 8, which
# matches class_weight[0]; under Python 3 it yields a float.
34284/3839

In [None]:
# Ratio of two class counts - presumably 'low' vs 'medium' listings; TODO confirm
# against the value_counts output.
# NOTE(review): under Python 2 this is integer division and yields 3, which
# matches class_weight[1]; under Python 3 it yields a float.
34284/11229