In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import KFold

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils.np_utils import to_categorical

from preprocess import pre_process

Using TensorFlow backend.


In [2]:
cols_to_use = ['bathrooms', 'bedrooms', 'latitude', 'longitude', 'price', 'price_t', 'num_photos', 
               'num_features', 'avg_desc_words', 'num_desc_words', 'num_desc_words', 'listing_id', 
               'created_year', 'created_month', 'created_day', 'created_hour', 'manager_build', 
               'avg_bedrooms', 'avg_bathrooms', 'avg_price', 'manager_alph', 'manager_num', 
               'price_rooms', 'price_bed', 'price_bath', 'encode_manager', 'kmeans_neighbor', 
               'avg_num_desc', 'density', 'density_u', 'dup_address', 'magic', 'unix_time', 
               'magic_passed', 'magic_month', 'magic_week', 'magic_day', 'magic_dayofweek', 
               'magic_dayofyear', 'magic_hour', 'magic_monthBeginMidEnd', 'num_rho', 'num_phi', 
               'shout', 'manager_level_low', 'manager_level_medium', 'manager_level_high', 
               'manager_skill'
               ]

train_df, test_df = pre_process()
tfidf = CountVectorizer(stop_words='english', max_features=200)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

train_df.head(3)

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,latitude,listing_id,longitude,...,num_rot30_X,num_rot30_Y,num_rot45_X,num_rot45_Y,num_rot60_X,num_rot60_Y,manager_level_low,manager_level_medium,manager_level_high,manager_skill
0,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,,40.7145,7211212,-73.9425,...,-1.711459,-84.393333,-23.495744,-81.074742,-43.678833,-72.231041,1.0,0.0,0.0,0.0
1,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,Doorman Elevator Fitness_Center Cats_Allowed D...,40.7947,7150865,-73.9667,...,-1.654103,-84.454391,-23.456146,-81.148564,-43.659691,-72.312597,,,,
2,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,Laundry_In_Building Dishwasher Hardwood_Floors...,40.7388,6887163,-74.0018,...,-1.720064,-84.456839,-23.520493,-81.133856,-43.718039,-72.281736,,,,


In [12]:
#transfer the categorical varibles to label integer
categorical = ["display_address", "manager_id", "building_id"]
for f in categorical:
    if train_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train_df[f].values) + list(test_df[f].values))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f] = lbl.transform(list(test_df[f].values))
        features_to_use.append(f)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [14]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse]).tocsr()
train_X = train_X.toarray()
test_X = test_X.toarray()

from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
imp.fit(train_X)
train_X = imp.transform(train_X)
test_X = imp.transform(test_X)

# Scale train_X and test_X together
traintest = np.vstack((train_X, test_X))
traintest = preprocessing.StandardScaler().fit_transform(traintest)
train_X = traintest[range(train_X.shape[0])]
test_X = traintest[range(train_X.shape[0], traintest.shape[0])]

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
train_y = to_categorical(train_y)

print train_X.shape, test_X.shape

(49352, 246) (74659, 246)


In [15]:
## neural net
def nn_model():
    model = Sequential()
    model.add(Dense(25, input_dim=train_X.shape[1], kernel_initializer='he_normal', activation='sigmoid'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))
    model.add(PReLU())
    model.add(Dense(100, kernel_initializer='he_normal', activation='sigmoid'))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(PReLU())
    model.add(Dense(3, kernel_initializer='he_normal', activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    return (model)

In [16]:
## cv-folds
nfolds = 5
nbags = 5
folds = KFold(n_splits=nfolds, shuffle = True)
testset = test_X
pred_oob = np.zeros((len(train_y), 3))
pred_test = np.zeros((testset.shape[0], 3))
filepath="weights.best.hdf5"

In [17]:
begintime = time()
for i, (inTr, inTe) in enumerate(folds.split(train_X)):
    i += 1
    xtr = train_X[inTr]
    ytr = train_y[inTr]
    xte = train_X[inTe]
    yte = train_y[inTe]
    pred = np.zeros((xte.shape[0], 3))
    for j in range(nbags):
        model = nn_model()
        early_stop = EarlyStopping(monitor='val_loss', patience=75, verbose=0)
        checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True)
        model.fit(xtr, ytr, epochs = 1200, batch_size=1000, verbose = 0, validation_data=[xte, yte])    
        pred += model.predict_proba(x=xte, verbose=0)
        pred_test += model.predict_proba(x=testset, verbose=0)

        print 'Total run time: '+str(datetime.timedelta(seconds=time()-begintime))+\
        '\t Log-Loss of '+str(j+1)+' bag: '+str(log_loss(yte,pred/(j+1)))

    pred /= nbags
    pred_oob[inTe] = pred
    score = log_loss(yte,pred)
    print '================ Fold '+str(i)+' - logloss: '+str(score)+' ================'

print 'Total - logloss: '+str(log_loss(train_y, pred_oob))

Total run time: 0:04:40.608017	 Log-Loss of 1 bag: 0.569731119005
Total run time: 0:09:23.712022	 Log-Loss of 2 bag: 0.559500044783
Total run time: 0:14:08.792607	 Log-Loss of 3 bag: 0.55681900253
Total run time: 0:18:51.796316	 Log-Loss of 4 bag: 0.555792528446
Total run time: 0:23:38.454014	 Log-Loss of 5 bag: 0.553381948562
Total run time: 0:28:23.563494	 Log-Loss of 1 bag: 0.573942328887
Total run time: 0:33:12.060320	 Log-Loss of 2 bag: 0.56513390537
Total run time: 0:37:58.942315	 Log-Loss of 3 bag: 0.561763006552
Total run time: 0:42:43.127472	 Log-Loss of 4 bag: 0.561776978259
Total run time: 0:47:32.669768	 Log-Loss of 5 bag: 0.560230090957
Total run time: 0:52:22.503978	 Log-Loss of 1 bag: 0.57536271117
Total run time: 0:57:08.627423	 Log-Loss of 2 bag: 0.564678247818
Total run time: 1:02:01.159153	 Log-Loss of 3 bag: 0.561737701219
Total run time: 1:06:50.439778	 Log-Loss of 4 bag: 0.558012712153
Total run time: 1:11:40.921862	 Log-Loss of 5 bag: 0.556980866109
Total run tim

In [18]:
# test predictions
pred_test /= (nfolds*nbags)
out_df = pd.DataFrame(pred_test)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("keras_1st.csv", index=False)