In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import functools as ft
import xgboost



In [2]:
train = pd.read_csv('./prepared/basic_train.csv')
print("Train:",train.shape)
test = pd.read_csv('./prepared/basic_test.csv')
print("Test:",test.shape)
train.head(3)

Train: (49352, 12)
Test: (74659, 12)


Unnamed: 0,bathrooms,bedrooms,description_len,features_cnt,hour,interest_level,latitude,listing_id,longitude,photos_cnt,price,weekday
0,1.5,3,588,0,7,medium,40.7145,7211212,-73.9425,5,3000,4
1,1.0,2,8,5,12,low,40.7947,7150865,-73.9667,11,5465,6
2,1.0,1,691,4,3,high,40.7388,6887163,-74.0018,8,2850,6


In [3]:
cols = train.columns
cols = cols.drop(['listing_id','interest_level'])
train_X = train[cols]
train_y = train['interest_level']
test_X = test[cols]
test_y = test['interest_level']
train_X.head(3)

Unnamed: 0,bathrooms,bedrooms,description_len,features_cnt,hour,latitude,longitude,photos_cnt,price,weekday
0,1.5,3,588,0,7,40.7145,-73.9425,5,3000,4
1,1.0,2,8,5,12,40.7947,-73.9667,11,5465,6
2,1.0,1,691,4,3,40.7388,-74.0018,8,2850,6


In [4]:
rft = RandomForestClassifier(n_jobs=-1)
rft.fit(train_X,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=-1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [5]:
log_loss(y_pred = rft.predict_proba(train_X),y_true=train_y)

0.16135575065701638

In [6]:
result = pd.DataFrame(rft.predict_proba(test_X),columns=rft.classes_)
result['listing_id'] = test['listing_id']

In [7]:
result[['listing_id','high','medium','low']].to_csv('./scores/scored_simple.csv',index=False)
result[['listing_id','high','medium','low']].head(3)

Unnamed: 0,listing_id,high,medium,low
0,7142618,0.2,0.2,0.6
1,7210040,0.4,0.3,0.3
2,7103890,0.0,0.1,0.9


In [8]:
rft = RandomForestClassifier(n_jobs=-1)
params = {
    'n_estimators' : [50,100,200,400],
    'max_features' : ['auto'],
    'max_depth' : [3,5,10,20,50],
    'min_samples_leaf' : [0.003],
    'min_samples_split' : [0.001],
    'criterion' : ['entropy','gini'],
    'class_weight' : ['balanced',None],
    'bootstrap' : [True],
    'oob_score' : [False],
    'random_state' : [0,123,12345]
}
def framework(clf,params,n_iter):
    # calculate # of iterations for Search
    parsize = ft.reduce(lambda a,b: a*b,[len(params[x]) for x in params]) # total # of combinations
    psize = n_iter if parsize > n_iter else parsize # limit # by n_iter
    print ('Parameters combination :',str(psize)+"/"+str(parsize))   
    
    rgs = RandomizedSearchCV(
    estimator = clf,
    param_distributions = params,
    n_iter = psize,
    scoring = 'neg_log_loss',
    n_jobs = -1,
    cv = 5,
    refit=True,
    verbose=1)

    rgs.fit(train_X,train_y)
    
    n_top = 3
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(rgs.cv_results_['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i), "(best model)" if rgs.best_index_ == candidate else "")
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  rgs.cv_results_['mean_test_score'][candidate],
                  rgs.cv_results_['std_test_score'][candidate]))
            print("Parameters: {0}".format(rgs.cv_results_['params'][candidate]))
    
    return rgs.best_estimator_

In [9]:
est = framework(rft,params,30)
# Mean validation score: -0.645 (std: 0.005)
result = pd.DataFrame(est.predict_proba(test_X),columns=est.classes_)
result['listing_id'] = test['listing_id']

result[['listing_id','high','medium','low']].to_csv('./scores/scored_hypopt.csv',index=False)

Parameters combination : 30/240
Fitting 5 folds for each of 30 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  4.5min finished


Model with rank: 1 (best model)
Mean validation score: -0.644 (std: 0.004)
Parameters: {'random_state': 123, 'oob_score': False, 'n_estimators': 400, 'min_samples_split': 0.001, 'min_samples_leaf': 0.003, 'max_features': 'auto', 'max_depth': 50, 'criterion': 'entropy', 'class_weight': None, 'bootstrap': True}
Model with rank: 2 
Mean validation score: -0.645 (std: 0.004)
Parameters: {'random_state': 123, 'oob_score': False, 'n_estimators': 100, 'min_samples_split': 0.001, 'min_samples_leaf': 0.003, 'max_features': 'auto', 'max_depth': 50, 'criterion': 'entropy', 'class_weight': None, 'bootstrap': True}
Model with rank: 3 
Mean validation score: -0.646 (std: 0.005)
Parameters: {'random_state': 0, 'oob_score': False, 'n_estimators': 400, 'min_samples_split': 0.001, 'min_samples_leaf': 0.003, 'max_features': 'auto', 'max_depth': 50, 'criterion': 'gini', 'class_weight': None, 'bootstrap': True}


In [70]:
result[['listing_id','high','medium','low']].to_csv('./scores/scored_simple_rft.csv',index=False)

In [None]:

xgb = xgboost.XGBClassifier(nthread=-1)
#print(xgb.get_params().keys())
xgb_params = {"max_depth": [3,5,7],
              "learning_rate": [0.01,0.05,0.1,0.15],
              "n_estimators": [50,75,100,150,200],
              "min_child_weight": [0.01,0.005],
              "gamma": [0.1,0.5,0.8,0.9,1.0],
              "subsample":[0.75,0.9,1.0],
              #"eval_metric":['logloss'],
              "objective":["multi:softprob"],
              #"seed":[0],
              "nthread":[-1]
              #,"eval_metric": ['roc_auc']
             }

est = framework(xgb,xgb_params,50)

result = pd.DataFrame(est.predict_proba(test_X),columns=est.classes_)
result['listing_id'] = test['listing_id']
result[['listing_id','high','medium','low']].to_csv('./scores/scored_hypopt.csv',index=False)
#Model with rank: 1 (best model)
#Mean validation score: -0.597 (std: 0.005)
#Parameters: {'subsample': 0.9, 'objective': 'multi:softprob', 'nthread': -1, 'n_estimators': 100, 'min_child_weight': 0.005, 'max_depth': 7, 'learning_rate': 0.1, 'gamma': 0.8}

Parameters combination : 50/1800
Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [20]:
result = pd.DataFrame(est.predict_proba(test_X),columns=est.classes_)
result['listing_id'] = test['listing_id']

result[['listing_id','high','medium','low']].to_csv('./scores/scored_hypopt_xgb.csv',index=False)