In [1]:
import os
import sys
import operator
import numpy as np
import pandas as pd
from scipy import sparse
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer




In [2]:
data_path = "~/data/TwoSigma/"
train_file = data_path + "train.json"
test_file = data_path + "test.json"
train_df = pd.read_json(train_file)
test_df = pd.read_json(test_file)
print(train_df.shape)
print(test_df.shape)


#pd.read_json("~/data/TwoSigma/train.json")
#pd.read_json("/home/kmitchell/data/TwoSigma/train.json")

(49352, 15)
(74659, 14)


In [3]:
train_df.head()

Unnamed: 0,bathrooms,bedrooms,building_id,created,description,display_address,features,interest_level,latitude,listing_id,longitude,manager_id,photos,price,street_address
10,1.5,3,53a5b119ba8f7b61d4e010512e0dfc85,2016-06-24 07:54:24,A Brand New 3 Bedroom 1.5 bath ApartmentEnjoy ...,Metropolitan Avenue,[],medium,40.7145,7211212,-73.9425,5ba989232d0489da1b5f2c45f6688adc,[https://photos.renthop.com/2/7211212_1ed4542e...,3000,792 Metropolitan Avenue
10000,1.0,2,c5c8a357cba207596b04d1afd1e4f130,2016-06-12 12:19:27,,Columbus Avenue,"[Doorman, Elevator, Fitness Center, Cats Allow...",low,40.7947,7150865,-73.9667,7533621a882f71e25173b27e3139d83d,[https://photos.renthop.com/2/7150865_be3306c5...,5465,808 Columbus Avenue
100004,1.0,1,c3ba40552e2120b0acfc3cb5730bb2aa,2016-04-17 03:26:41,"Top Top West Village location, beautiful Pre-w...",W 13 Street,"[Laundry In Building, Dishwasher, Hardwood Flo...",high,40.7388,6887163,-74.0018,d9039c43983f6e564b1482b273bd7b01,[https://photos.renthop.com/2/6887163_de85c427...,2850,241 W 13 Street
100007,1.0,1,28d9ad350afeaab8027513a3e52ac8d5,2016-04-18 02:22:02,Building Amenities - Garage - Garden - fitness...,East 49th Street,"[Hardwood Floors, No Fee]",low,40.7539,6888711,-73.9677,1067e078446a7897d2da493d2f741316,[https://photos.renthop.com/2/6888711_6e660cee...,3275,333 East 49th Street
100013,1.0,4,0,2016-04-28 01:32:41,Beautifully renovated 3 bedroom flex 4 bedroom...,West 143rd Street,[Pre-War],low,40.8241,6934781,-73.9493,98e13ad4b495b9613cef886d79a6291f,[https://photos.renthop.com/2/6934781_1fa4b41a...,3350,500 West 143rd Street


In [4]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=42, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model

In [7]:
train_df.features[:5]

10                                                       []
10000     [Doorman, Elevator, Fitness Center, Cats Allow...
100004    [Laundry In Building, Dishwasher, Hardwood Flo...
100007                            [Hardwood Floors, No Fee]
100013                                            [Pre-War]
Name: features, dtype: object

In [5]:
features_to_use  = ["bathrooms", "bedrooms", "latitude", "longitude", "price"]


In [9]:
# count of photos #
train_df["num_photos"] = train_df["photos"].apply(len)
test_df["num_photos"] = test_df["photos"].apply(len)

# count of "features" #
train_df["num_features"] = train_df["features"].apply(len)
test_df["num_features"] = test_df["features"].apply(len)

# count of words present in description column #
train_df["num_description_words"] = train_df["description"].apply(lambda x: len(x.split(" ")))
test_df["num_description_words"] = test_df["description"].apply(lambda x: len(x.split(" ")))

# convert the created column to datetime object so as to extract more features 
train_df["created"] = pd.to_datetime(train_df["created"])
test_df["created"] = pd.to_datetime(test_df["created"])

# Let us extract some features like year, month, day, hour from date columns #
train_df["created_year"] = train_df["created"].dt.year
test_df["created_year"] = test_df["created"].dt.year
train_df["created_month"] = train_df["created"].dt.month
test_df["created_month"] = test_df["created"].dt.month
train_df["created_day"] = train_df["created"].dt.day
test_df["created_day"] = test_df["created"].dt.day
train_df["created_hour"] = train_df["created"].dt.hour
test_df["created_hour"] = test_df["created"].dt.hour
train_df["created_dayofweek"] = train_df["created"].dt.weekday
test_df["created_dayofweek"] = test_df["created"].dt.weekday



# adding all these new features to use list #
features_to_use.extend(["num_photos", "num_features", "num_description_words","created_year", "created_month", "created_day", "listing_id", "created_hour", "created_dayofweek"])

In [10]:
features_to_use

['bathrooms',
 'bedrooms',
 'latitude',
 'longitude',
 'price',
 'num_photos',
 'num_features',
 'num_description_words',
 'created_year',
 'created_month',
 'created_day',
 'listing_id',
 'created_hour',
 'created_dayofweek']

In [11]:
categorical = ["display_address", "manager_id", "building_id", "street_address"]
for f in categorical:
        if train_df[f].dtype=='object':
            #print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(train_df[f].values) + list(test_df[f].values))
            train_df[f] = lbl.transform(list(train_df[f].values))
            test_df[f] = lbl.transform(list(test_df[f].values))
            features_to_use.append(f)

In [15]:
train_df.manager_id[:10]
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49352 entries, 10 to 99994
Data columns (total 23 columns):
bathrooms                49352 non-null float64
bedrooms                 49352 non-null int64
building_id              49352 non-null int64
created                  49352 non-null datetime64[ns]
description              49352 non-null object
display_address          49352 non-null int64
features                 49352 non-null object
interest_level           49352 non-null object
latitude                 49352 non-null float64
listing_id               49352 non-null int64
longitude                49352 non-null float64
manager_id               49352 non-null int64
photos                   49352 non-null object
price                    49352 non-null int64
street_address           49352 non-null int64
num_photos               49352 non-null int64
num_features             49352 non-null int64
num_description_words    49352 non-null int64
created_year             49352 non-null int

In [16]:
train_df['features'] = train_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
test_df['features'] = test_df["features"].apply(lambda x: " ".join(["_".join(i.split(" ")) for i in x]))
train_df['features'] = train_df['features'].str.lower()
test_df['features'] = test_df['features'].str.lower()
print(train_df["features"].head())

10                                                         
10000     doorman elevator fitness_center cats_allowed d...
100004    laundry_in_building dishwasher hardwood_floors...
100007                               hardwood_floors no_fee
100013                                              pre-war
Name: features, dtype: object


In [17]:
tfidf = CountVectorizer(stop_words='english', max_features=100)
tr_sparse = tfidf.fit_transform(train_df["features"])
te_sparse = tfidf.transform(test_df["features"])

In [18]:
train_df['description'] = train_df['description'].str.lower()
test_df['description'] = test_df['description'].str.lower()
print(train_df["description"].head())
tfidf = CountVectorizer(stop_words='english', max_features=100)
tra_sparse = tfidf.fit_transform(train_df["features"])
tes_sparse = tfidf.transform(test_df["features"])

10        a brand new 3 bedroom 1.5 bath apartmentenjoy ...
10000                                                      
100004    top top west village location, beautiful pre-w...
100007    building amenities - garage - garden - fitness...
100013    beautifully renovated 3 bedroom flex 4 bedroom...
Name: description, dtype: object


In [None]:
#train_df.to_pickle('train_df.pkl')
#test_df.to_pickle('test_df.pkl')

### Train and test data frames created above and saved to pkl file
pkl file loaded below and used for model building and evaluation   

In [19]:
train_X = sparse.hstack([train_df[features_to_use], tr_sparse,tra_sparse]).tocsr()
test_X = sparse.hstack([test_df[features_to_use], te_sparse, tes_sparse]).tocsr()

target_num_map = {'high':0, 'medium':1, 'low':2}
train_y = np.array(train_df['interest_level'].apply(lambda x: target_num_map[x]))
print(train_X.shape, test_X.shape)

(49352, 218) (74659, 218)


In [20]:
cv_scores = []
kf = model_selection.KFold(n_splits=7, shuffle=True, random_state=2016)
for dev_index, val_index in kf.split(range(train_X.shape[0])):
        dev_X, val_X = train_X[dev_index,:], train_X[val_index,:]
        dev_y, val_y = train_y[dev_index], train_y[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        cv_scores.append(log_loss(val_y, preds))
        print(cv_scores)
        break

[0]	train-mlogloss:1.03943	test-mlogloss:1.03953
Multiple eval metrics have been passed: 'test-mlogloss' will be used for early stopping.

Will train until test-mlogloss hasn't improved in 20 rounds.
[1]	train-mlogloss:0.98739	test-mlogloss:0.98804
[2]	train-mlogloss:0.945497	test-mlogloss:0.946728
[3]	train-mlogloss:0.906603	test-mlogloss:0.908219
[4]	train-mlogloss:0.872682	test-mlogloss:0.875148
[5]	train-mlogloss:0.845483	test-mlogloss:0.848496
[6]	train-mlogloss:0.819229	test-mlogloss:0.822669
[7]	train-mlogloss:0.796226	test-mlogloss:0.800114
[8]	train-mlogloss:0.775539	test-mlogloss:0.780028
[9]	train-mlogloss:0.758479	test-mlogloss:0.76342
[10]	train-mlogloss:0.74271	test-mlogloss:0.748037
[11]	train-mlogloss:0.728749	test-mlogloss:0.734623
[12]	train-mlogloss:0.716505	test-mlogloss:0.722813
[13]	train-mlogloss:0.70577	test-mlogloss:0.712372
[14]	train-mlogloss:0.694535	test-mlogloss:0.701855
[15]	train-mlogloss:0.6843	test-mlogloss:0.692392
[16]	train-mlogloss:0.675384	test-ml

In [21]:
preds, model = runXGB(train_X, train_y, test_X, num_rounds=1000)
out_df = pd.DataFrame(preds)
out_df.columns = ["high", "medium", "low"]
out_df["listing_id"] = test_df.listing_id.values
out_df.to_csv("xgb_starter2.csv", index=False)

In [None]:
test_df