In [None]:
# XGBoost First Try  

After using other models, it seems as if they all max out and can't get optimised any more. 
Therefore, lets use XGBoost as a model to predict the data and get higher on the leaderboard 

In [1]:
#import libraries

import pandas as pd 
import numpy as np 
import os 
import matplotlib as plt
import seaborn as sns
%matplotlib inline

In [2]:
# import train and test 

train = pd.read_json(r'train.json')
test = pd.read_json(r'test.json')

In [3]:
# function to add useful features for model 

def quantify_features(data):
    data['num_photos'] = data['photos'].apply(len)
    data['num_features'] = data['features'].apply(len)
    data['description_length'] = data.description.apply(lambda x: len(x.split()))
    data['created'] = pd.to_datetime(data['created'])
    data['created_month'] = data['created'].dt.month
    data['created_day'] = data['created'].dt.day
    data['price_bed'] = data['price'] / data['bedrooms']
    print data.info()
    
quantify_features(train)
quantify_features(test)

<class 'pandas.core.frame.DataFrame'>
Float64Index: 49352 entries, 10.0 to 99994.0
Data columns (total 21 columns):
bathrooms             49352 non-null float64
bedrooms              49352 non-null int64
building_id           49352 non-null object
created               49352 non-null datetime64[ns]
description           49352 non-null object
display_address       49352 non-null object
features              49352 non-null object
interest_level        49352 non-null object
latitude              49352 non-null float64
listing_id            49352 non-null int64
longitude             49352 non-null float64
manager_id            49352 non-null object
photos                49352 non-null object
price                 49352 non-null int64
street_address        49352 non-null object
num_photos            49352 non-null int64
num_features          49352 non-null int64
description_length    49352 non-null int64
created_month         49352 non-null int64
created_day           49352 non-null int64
p

In [4]:
def cat_codes(X):
    X["street_address"] = X["street_address"].astype('category').cat.codes
    X["building_id"] = X["building_id"].astype('category').cat.codes
    X["description"] = X["description"].astype('category').cat.codes
    X["display_address"] = X["display_address"].astype('category').cat.codes
    X["manager_id"] = X["manager_id"].astype('category').cat.codes
    
cat_codes(train)
cat_codes(test)

In [5]:
features = [ 'display_address',  'created_day', 
           'created_month', 'num_photos', 'num_features', 'description_length', 'longitude', 'latitude', 'price',
           'bathrooms', 'bedrooms', 'building_id', 'manager_id', 'price_bed']

In [6]:
X = train.drop('interest_level', 1)
X = X[features]

Y = train['interest_level'].astype('category')
Y.dtype


category

In [7]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 49352 entries, 10.0 to 99994.0
Data columns (total 14 columns):
display_address       49352 non-null int16
created_day           49352 non-null int64
created_month         49352 non-null int64
num_photos            49352 non-null int64
num_features          49352 non-null int64
description_length    49352 non-null int64
longitude             49352 non-null float64
latitude              49352 non-null float64
price                 49352 non-null int64
bathrooms             49352 non-null float64
bedrooms              49352 non-null int64
building_id           49352 non-null int16
manager_id            49352 non-null int16
price_bed             49352 non-null float64
dtypes: float64(4), int16(3), int64(7)
memory usage: 4.8 MB


In [8]:
# import train test split and other scikit learn libraries
from sklearn.cross_validation import train_test_split
X_train, X_test,Y_train, Y_test = train_test_split(X, Y, test_size = 0.25)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, log_loss

In [9]:
from scipy import sparse
import xgboost as xgb
from sklearn import preprocessing, ensemble

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [15]:
index = test["listing_id"]
test = test[features]

In [13]:



def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=1000):
    param = {}
    param['objective'] = 'multi:softprob'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['num_class'] = 3
    param['eval_metric'] = "mlogloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model





In [14]:
print(X_train.shape, X_test.shape)

((37014, 14), (12338, 14))


In [27]:

for column in X:
    X[column] = X[column].apply(float)
    
for column in test:
    test[column] = test[column].apply(float)

In [26]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 49352 entries, 10.0 to 99994.0
Data columns (total 14 columns):
display_address       49352 non-null float64
created_day           49352 non-null float64
created_month         49352 non-null float64
num_photos            49352 non-null float64
num_features          49352 non-null float64
description_length    49352 non-null float64
longitude             49352 non-null float64
latitude              49352 non-null float64
price                 49352 non-null float64
bathrooms             49352 non-null float64
bedrooms              49352 non-null float64
building_id           49352 non-null float64
manager_id            49352 non-null float64
price_bed             49352 non-null float64
dtypes: float64(14)
memory usage: 5.6 MB


In [28]:

preds, model = runXGB(X, Y , test, num_rounds=400)

TypeError: a float is required