In [38]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import preprocessing, model_selection
import string
from sklearn.feature_extraction.text import  CountVectorizer
from scipy.stats import boxcox
from scipy import sparse



# Load the train/test listings and attach the `median_30` column from
# train_good.csv to both frames, matching rows on listing_id. The joins are
# left joins, so listings without a match keep NaN in `median_30`.
# NOTE(review): the test frame is joined against train_good.csv as well --
# confirm that file really contains the test listing ids.
train = pd.read_json('final_train.json')
test = pd.read_json('final_test.json')
new_data = pd.read_csv('train_good.csv')[['listing_id', 'median_30']]

train = train.merge(new_data, how='left', on='listing_id')
test = test.merge(new_data, how='left', on='listing_id')

# Keep the test ids before the column is dropped; they are written out next to
# the predictions later on.
listing_id = test['listing_id'].values


# Encode the target as integer class ids: high -> 0, medium -> 1, low -> 2.
# An unknown label raises KeyError rather than being silently mapped.
y_map = {'low': 2, 'medium': 1, 'high': 0}
train['interest_level'] = [y_map[level] for level in train['interest_level']]
y_train = train['interest_level'].values

# Remove the id and target columns so train and test can be stacked and
# feature-engineered together.
train = train.drop(['listing_id', 'interest_level'], axis=1)
test = test.drop(['listing_id'], axis=1)

# Number of training rows; the first `ntrain` rows of train_test are the
# training set, the remainder is the test set.
ntrain = len(train)

train_test = pd.concat([train, test], axis=0).reset_index(drop=True)

# 1 when building_id is the literal string '0', else 0
# (presumably '0' marks a missing building -- confirm against the data).
train_test['Zero_building_id'] = (train_test['building_id'] == '0').astype(int)

# Translation table for str.translate that deletes every ASCII punctuation
# character (each code point maps to None).
# The original cell also called string.punctuation.__add__(...) three times.
# Strings are immutable and the results were discarded, so those calls had no
# effect; '!', '(' and ')' are already part of string.punctuation anyway.
remove_punct_map = dict.fromkeys(map(ord, string.punctuation))

# Lower-cased working copy of the display address; display_address itself is
# left untouched.
train_test['address1'] = train_test['display_address'].str.lower()

# Abbreviation -> full word lookup used to normalise address tokens.
# Keys are matched against whole, space-separated tokens (case-sensitive).
address_map = {
    'w': 'west',
    'st.': 'street',
    'ave': 'avenue',
    'st': 'street',
    'e': 'east',
    'n': 'north',
    's': 'south',
}


def address_map_func(s):
    """Expand abbreviated tokens in the address string *s*.

    The string is split on single spaces. Every token that exactly matches a
    key of ``address_map`` is replaced by the mapped word; all other tokens
    are kept as they are. Tokens are re-joined with single spaces, so the
    empty tokens produced by consecutive spaces are preserved.
    """
    return ' '.join(address_map.get(token, token) for token in s.split(' '))


# Normalise each address in one pass: delete punctuation first, then expand
# abbreviations (the order matters -- 'st.' has become 'st' by the time the
# abbreviation lookup runs).
train_test['address1'] = train_test['address1'].apply(
    lambda addr: address_map_func(addr.translate(remove_punct_map)))

new_cols = ['street', 'avenue', 'east', 'west', 'north', 'south']

# One 0/1 indicator per keyword. This is a plain substring test on the whole
# address, so a keyword embedded in a longer word also counts.
for col in new_cols:
    train_test[col] = train_test['address1'].str.contains(col, regex=False).astype(int)

# 1 when none of the keyword indicators fired for the row.
train_test['other_address'] = (train_test[new_cols].sum(axis=1) == 0).astype(int)

# Number of entries in each listing's `features` list.
train_test['features_count'] = train_test['features'].apply(len)

# Join each listing's feature list into one space-separated document so it can
# be tokenised by the vectorizer.
train_test['features2'] = train_test['features'].apply(' '.join)

# Bag-of-words over the feature text: unigrams only, English stop words
# removed, vocabulary capped at the 200 most frequent terms.
c_vect = CountVectorizer(stop_words='english', max_features=200, ngram_range=(1, 1))
c_vect_sparse_1 = c_vect.fit_transform(train_test['features2'])

# Column names of the sparse matrix, as a plain list of str.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() when it exists and fall back otherwise so the
# cell runs on both old and new releases.
if hasattr(c_vect, 'get_feature_names_out'):
    c_vect_sparse1_cols = c_vect.get_feature_names_out().tolist()
else:
    c_vect_sparse1_cols = c_vect.get_feature_names()

# The raw list column and its joined text are no longer needed.
train_test.drop(['features', 'features2'], axis=1, inplace=True)

# Listings per manager over train+test combined.
managers_count = train_test['manager_id'].value_counts()

# Per-row listing count of that row's manager (NaN for a missing manager_id,
# which compares False below and therefore yields 0, as before).
manager_listing_counts = train_test['manager_id'].map(managers_count)

# top_<p>_manager is 1 when the row's manager has at least as many listings as
# the (100 - p)-th percentile of the per-manager counts.
# The cutoff is computed once per flag instead of once per row, and the
# per-row array membership test is replaced by a vectorised comparison; the
# resulting columns (names, order and values) match the original nine
# copy-pasted statements.
for top_pct in (10, 25, 5, 50, 1, 2, 15, 20, 30):
    cutoff = np.percentile(managers_count.values, 100 - top_pct)
    train_test['top_%d_manager' % top_pct] = (manager_listing_counts >= cutoff).astype(int)

# Listings per building over train+test combined.
buildings_count = train_test['building_id'].value_counts()

# Per-row listing count of that row's building (NaN for a missing building_id,
# which compares False below and therefore yields 0, as before).
building_listing_counts = train_test['building_id'].map(buildings_count)

# top_<p>_building is 1 when the row's building has at least as many listings
# as the (100 - p)-th percentile of the per-building counts.
# The cutoff is computed once per flag instead of once per row, and the
# per-row array membership test is replaced by a vectorised comparison; the
# resulting columns (names, order and values) match the original nine
# copy-pasted statements.
for top_pct in (10, 25, 5, 50, 1, 2, 15, 20, 30):
    cutoff = np.percentile(buildings_count.values, 100 - top_pct)
    train_test['top_%d_building' % top_pct] = (building_listing_counts >= cutoff).astype(int)

# The raw address text is dropped; the indicator columns derived from it stay.
train_test.drop(['display_address'], axis=1, inplace=True)

# NFOLDS is the fold count handed to xgb.cv.
# NOTE(review): SEED is defined here but the booster parameters below carry
# their own seed of 0 -- confirm which one is intended.
SEED = 777
NFOLDS = 5

# Booster parameters: 3-class soft-probability objective evaluated with
# multi-class log loss, learning rate 0.01, 80% row and column subsampling.
params = dict(
    eta=0.01,
    colsample_bytree=0.8,
    subsample=0.8,
    seed=0,
    nthread=16,
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=3,
    silent=1,
)


In [42]:


# Dense columns fed to the model, in matrix column order.
newfeats = [
    'BB_ratio', 'Price_P_Room', 'allow_pets', 'amount_of_caps', 'available',
    'bathroom_listed', 'bathrooms', 'bedrooms',
    'created_day', 'created_hour', 'created_month', 'dist_to_nearest_college',
    'manager_id', 'building_id',
    'dist_to_nearest_tube', 'distance_from_midtown', 'has_phone',
    'is_studio', 'large_space', 'latitude', 'laundry', 'longitude', 'median_30',
    'n_log_price',
    'n_no_photo', 'n_num_keyfeat_score', 'nofee', 'num_description_words',
    'num_features', 'num_photos',
    'weekday_created', 'Zero_building_id', 'street', 'avenue', 'east', 'west',
    'north', 'south',
    'features_count', 'top_10_manager', 'top_25_manager', 'top_5_manager',
    'top_50_manager',
    'top_1_manager', 'top_2_manager', 'top_15_manager', 'top_20_manager',
    'top_30_manager', 'top_10_building',
    'top_25_building', 'top_5_building', 'top_50_building', 'top_1_building',
    'top_2_building', 'top_15_building',
    'top_20_building', 'top_30_building',
]

# Explicit copy: the label-encoding loop below assigns columns on this frame,
# and assigning onto a bare column subset raises pandas' SettingWithCopyWarning.
train_test = train_test[newfeats].copy()

# Replace every object-dtype column (e.g. the id strings) by integer codes.
categorical = [x for x in train_test.columns if train_test[x].dtype == object]
for feat in categorical:
    lbl = preprocessing.LabelEncoder()
    train_test[feat] = lbl.fit_transform(list(train_test[feat].values))

# Dense features followed by the bag-of-words counts, as one CSR matrix.
train_test_cv1_sparse = sparse.hstack((train_test.astype(float), c_vect_sparse_1)).tocsr()

# Keep the name list aligned with the matrix columns.
newfeats += c_vect_sparse1_cols


In [47]:
# Split the stacked matrix back into its two parts: the first `ntrain` rows
# are the training listings, everything after them is the test set.
ntrain = len(train)
x_train = train_test_cv1_sparse[:ntrain]
x_test = train_test_cv1_sparse[ntrain:]

# Wrap both parts for xgboost; only the training matrix carries labels.
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test)

In [None]:
# (scratch cell: the dangling "xgb." attribute access was an incomplete
# statement and a syntax error, so it has been removed)

In [48]:
# NFOLDS-fold cross-validation with early stopping, used only to choose the
# number of boosting rounds. The result is the per-round metric table.
cv_history = xgb.cv(params, dtrain, num_boost_round=10000, nfold=NFOLDS,
                    early_stopping_rounds=50, verbose_eval=25)

# argmin is the zero-based row of the lowest mean CV loss, while the number of
# rounds is a count, so add 1. (Passing the bare index trained one round fewer
# than the best CV iteration.)
best_rounds = int(np.argmin(cv_history['test-mlogloss-mean'].values)) + 1

# Refit on the full training matrix with the selected number of rounds.
bst = xgb.train(params, dtrain, num_boost_round=best_rounds)

# One probability column per class id; ids 0, 1, 2 were assigned to
# high, medium, low when the target was encoded.
preds = pd.DataFrame(bst.predict(dtest))
cols = ['high', 'medium', 'low']
preds.columns = cols

preds['listing_id'] = listing_id

preds.to_csv('xgb_feats.csv', index=False)

[0]	train-mlogloss:1.0921+2.42107e-05	test-mlogloss:1.0922+6.41105e-05
[25]	train-mlogloss:0.957347+0.000758518	test-mlogloss:0.9601+0.00104542
[50]	train-mlogloss:0.863033+0.00118574	test-mlogloss:0.868283+0.00189169
[75]	train-mlogloss:0.794622+0.00136609	test-mlogloss:0.802275+0.00270662
[100]	train-mlogloss:0.744006+0.00155479	test-mlogloss:0.754023+0.0032968
[125]	train-mlogloss:0.705758+0.00160121	test-mlogloss:0.718099+0.00387321
[150]	train-mlogloss:0.676182+0.00173209	test-mlogloss:0.690822+0.00422458
[175]	train-mlogloss:0.652987+0.00175471	test-mlogloss:0.669907+0.00474019
[200]	train-mlogloss:0.634479+0.00174947	test-mlogloss:0.653586+0.00512204
[225]	train-mlogloss:0.619433+0.00181857	test-mlogloss:0.640685+0.00541226
[250]	train-mlogloss:0.607013+0.00188752	test-mlogloss:0.630386+0.00566331
[275]	train-mlogloss:0.596454+0.00187477	test-mlogloss:0.621968+0.00587278
[300]	train-mlogloss:0.587309+0.00183486	test-mlogloss:0.61492+0.00604598
[325]	train-mlogloss:0.579269+0.001

In [61]:
# Feature-subset experiment: edit the list below to try a different set of
# dense columns. Every name and the output file carry a "2" suffix; change that
# number to keep several experiments side by side.
newfeats2 = [
    'Price_P_Room', 'allow_pets', 'amount_of_caps', 'available', 'bathrooms',
    'bedrooms',
    'created_day', 'created_hour', 'created_month', 'manager_id', 'building_id',
    'latitude', 'longitude',
    'has_phone', 'is_studio', 'large_space', 'laundry', 'median_30', 'n_log_price',
    'n_no_photo', 'n_num_keyfeat_score', 'nofee', 'num_description_words',
    'num_photos',
    'weekday_created', 'Zero_building_id', 'street', 'avenue', 'east', 'west',
    'north', 'south',
    'top_10_manager', 'top_25_manager', 'top_50_manager',
    'top_1_manager', 'top_2_manager', 'top_10_building',
    'top_25_building', 'top_50_building', 'top_1_building', 'top_2_building',
]

train_test2 = train_test[newfeats2]

# Dense subset followed by the bag-of-words counts, as one CSR matrix.
train_test_cv1_sparse2 = sparse.hstack((train_test2.astype(float), c_vect_sparse_1)).tocsr()

# Keep the name list aligned with the matrix columns.
newfeats2 += c_vect_sparse1_cols

# First `ntrain` rows are the training listings, the rest is the test set.
x_train = train_test_cv1_sparse2[:ntrain, :]
x_test = train_test_cv1_sparse2[ntrain:, :]
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

# NFOLDS-fold cross-validation with early stopping, used only to choose the
# number of boosting rounds. The result is the per-round metric table.
cv_history = xgb.cv(params, dtrain, num_boost_round=10000, nfold=NFOLDS,
                    early_stopping_rounds=50, verbose_eval=25)

# argmin is the zero-based row of the lowest mean CV loss, while the number of
# rounds is a count, so add 1. (Passing the bare index trained one round fewer
# than the best CV iteration.)
best_rounds = int(np.argmin(cv_history['test-mlogloss-mean'].values)) + 1

# Refit on the full training matrix with the selected number of rounds.
bst = xgb.train(params, dtrain, num_boost_round=best_rounds)

# One probability column per class id; ids 0, 1, 2 were assigned to
# high, medium, low when the target was encoded.
preds = pd.DataFrame(bst.predict(dtest))
cols = ['high', 'medium', 'low']
preds.columns = cols

preds['listing_id'] = listing_id

preds.to_csv('xgb_feats2.csv', index=False)

[0]	train-mlogloss:1.0921+2.19326e-05	test-mlogloss:1.0922+5.42712e-05
[25]	train-mlogloss:0.957624+0.000825741	test-mlogloss:0.960321+0.00105521
[50]	train-mlogloss:0.863625+0.00117844	test-mlogloss:0.86871+0.00194244
[75]	train-mlogloss:0.795618+0.001278	test-mlogloss:0.802973+0.0028184
[100]	train-mlogloss:0.745317+0.00140433	test-mlogloss:0.754983+0.00347099
[125]	train-mlogloss:0.707189+0.00159952	test-mlogloss:0.719063+0.00388515
[150]	train-mlogloss:0.677905+0.00166995	test-mlogloss:0.691981+0.00431123
[175]	train-mlogloss:0.654958+0.00170632	test-mlogloss:0.671141+0.0046783
[200]	train-mlogloss:0.63657+0.00177342	test-mlogloss:0.654863+0.00496769
[225]	train-mlogloss:0.621757+0.00173785	test-mlogloss:0.642073+0.00530272
[250]	train-mlogloss:0.609571+0.00171858	test-mlogloss:0.631878+0.00556575
[275]	train-mlogloss:0.599295+0.00169968	test-mlogloss:0.623638+0.00581141
[300]	train-mlogloss:0.590459+0.00167972	test-mlogloss:0.616789+0.0059856
[325]	train-mlogloss:0.582799+0.001667