In [42]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from itertools import product
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss
import xgboost as xgb
import functools as ft

In [2]:
def add_features(df):
    fmt = lambda s: s.replace("\u00a0", "").strip().lower()
    df["photo_count"] = df["photos"].apply(len)
    df["street_address"] = df['street_address'].apply(fmt)
    df["display_address"] = df["display_address"].apply(fmt)
    df["desc_wordcount"] = df["description"].apply(len)
    df["pricePerBed"] = df['price'] / df['bedrooms']
    df["pricePerBath"] = df['price'] / df['bathrooms']
    df["pricePerRoom"] = df['price'] / (df['bedrooms'] + df['bathrooms'])
    df["bedPerBath"] = df['bedrooms'] / df['bathrooms']
    df["bedBathDiff"] = df['bedrooms'] - df['bathrooms']
    df["bedBathSum"] = df["bedrooms"] + df['bathrooms']
    df["bedsPerc"] = df["bedrooms"] / (df['bedrooms'] + df['bathrooms'])

    df = df.fillna(-1).replace(np.inf, -1)
    return df


def factorize(df1, df2, column):
    ps = df1[column].append(df2[column])
    factors = ps.factorize()[0]
    df1[column] = factors[:len(df1)]
    df2[column] = factors[len(df1):]
    return df1, df2


def designate_single_observations(df1, df2, column):
    ps = df1[column].append(df2[column])
    grouped = ps.groupby(ps).size().to_frame().rename(columns={0: "size"})
    df1.loc[df1.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    df2.loc[df2.join(grouped, on=column, how="left")["size"] <= 1, column] = -1
    return df1, df2


def hcc_encode(train_df, test_df, variable, target, prior_prob, k, f=1, g=1, r_k=None, update_df=None):
    """
    See "A Preprocessing Scheme for High-Cardinality Categorical Attributes in
    Classification and Prediction Problems" by Daniele Micci-Barreca
    """
    hcc_name = "_".join(["hcc", variable, target])

    grouped = train_df.groupby(variable)[target].agg({"size": "size", "mean": "mean"})
    grouped["lambda"] = 1 / (g + np.exp((k - grouped["size"]) / f))
    grouped[hcc_name] = grouped["lambda"] * grouped["mean"] + (1 - grouped["lambda"]) * prior_prob

    df = test_df[[variable]].join(grouped, on=variable, how="left")[hcc_name].fillna(prior_prob)
    if r_k: df *= np.random.uniform(1 - r_k, 1 + r_k, len(test_df))     # Add uniform noise. Not mentioned in original paper

    if update_df is None: update_df = test_df
    if hcc_name not in update_df.columns: update_df[hcc_name] = np.nan
    update_df.update(df)
    return

In [4]:
# Load data
X_train = pd.read_json("train.json").sort_values(by="listing_id")
X_test = pd.read_json("test.json").sort_values(by="listing_id")

In [5]:
# Make target integer, one hot encoded, calculate target priors
X_train = X_train.replace({"interest_level": {"low": 0, "medium": 1, "high": 2}})
X_train = X_train.join(pd.get_dummies(X_train["interest_level"], prefix="pred").astype(int))
prior_0, prior_1, prior_2 = X_train[["pred_0", "pred_1", "pred_2"]].mean()

In [6]:
# Add common features
X_train = add_features(X_train)
X_test = add_features(X_test)

In [7]:
# Special designation for building_ids, manager_ids, display_address with only 1 observation
for col in ('building_id', 'manager_id', 'display_address'):
    X_train, X_test = designate_single_observations(X_train, X_test, col)

In [8]:
# High-Cardinality Categorical encoding
skf = StratifiedKFold(5)
attributes = product(("building_id", "manager_id"), zip(("pred_1", "pred_2"), (prior_1, prior_2)))
for variable, (target, prior) in attributes:
    hcc_encode(X_train, X_test, variable, target, prior, k=5, r_k=None)
    for train, test in skf.split(np.zeros(len(X_train)), X_train['interest_level']):
        hcc_encode(X_train.iloc[train], X_train.iloc[test], variable, target, prior, k=5, r_k=0.01, update_df=X_train)

In [9]:
# Factorize building_id, display_address, manager_id, street_address
for col in ('building_id', 'display_address', 'manager_id', 'street_address'):
    X_train, X_test = factorize(X_train, X_test, col)

In [10]:
# Create binarized features
fmt = lambda feat: [s.replace("\u00a0", "").strip().lower().replace(" ", "_") for s in feat]  # format features
X_train["features"] = X_train["features"].apply(fmt)
X_test["features"] = X_test["features"].apply(fmt)
features = [f for f_list in list(X_train["features"]) + list(X_test["features"]) for f in f_list]
ps = pd.Series(features)
grouped = ps.groupby(ps).agg(len)
features = grouped[grouped >= 10].index.sort_values().values    # limit to features with >=10 observations
mlb = MultiLabelBinarizer().fit([features])
columns = ['feature_' + s for s in mlb.classes_]
flt = lambda l: [i for i in l if i in mlb.classes_]     # filter out features not present in MultiLabelBinarizer
X_train = X_train.join(pd.DataFrame(data=mlb.transform(X_train["features"].apply(flt)), columns=columns, index=X_train.index))
X_test = X_test.join(pd.DataFrame(data=mlb.transform(X_test["features"].apply(flt)), columns=columns, index=X_test.index))

In [14]:
# Save

X_train = X_train.sort_index(axis=1).sort_values(by="listing_id")
X_test = X_test.sort_index(axis=1).sort_values(by="listing_id")
columns_to_drop = ["photos", "pred_0","pred_1", "pred_2", "description", "features", "created"]
X_train.drop([c for c in X_train.columns if c in columns_to_drop], axis=1).\
    to_csv("train_python.csv", index=False, encoding='utf-8')
X_test.drop([c for c in X_test.columns if c in columns_to_drop], axis=1).\
    to_csv("test_python.csv", index=False, encoding='utf-8')
 

In [37]:
# Read new data
train = pd.read_csv("train_python.csv")
test = pd.read_csv("test_python.csv")

In [38]:
print(train.shape)
print(test.shape)

(49352, 286)
(74659, 285)


In [39]:
cols = train.columns
cols = cols.drop(['listing_id','interest_level'])
train_X = train[cols]
train_y = train['interest_level']
test_X = test[cols]

In [40]:
rft = RandomForestClassifier(n_jobs=-1)
params = {
    'n_estimators' : [50,100,200,400],
    'max_features' : ['auto'],
    'max_depth' : [3,5,10,20,50],
    'min_samples_leaf' : [0.003],
    'min_samples_split' : [0.001],
    'criterion' : ['entropy','gini'],
    'class_weight' : ['balanced',None],
    'bootstrap' : [True],
    'oob_score' : [False],
    'random_state' : [0,123,12345]
}
def framework(clf,params,n_iter):
    # calculate # of iterations for Search
    parsize = ft.reduce(lambda a,b: a*b,[len(params[x]) for x in params]) # total # of combinations
    psize = n_iter if parsize > n_iter else parsize # limit # by n_iter
    print ('Parameters combination :',str(psize)+"/"+str(parsize))   
    
    rgs = RandomizedSearchCV(
    estimator = clf,
    param_distributions = params,
    n_iter = psize,
    scoring = 'neg_log_loss',
    n_jobs = -1,
    cv = 5,
    refit=True,
    verbose=1)

    rgs.fit(train_X,train_y)
    
    n_top = 3
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(rgs.cv_results_['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i), "(best model)" if rgs.best_index_ == candidate else "")
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  rgs.cv_results_['mean_test_score'][candidate],
                  rgs.cv_results_['std_test_score'][candidate]))
            print("Parameters: {0}".format(rgs.cv_results_['params'][candidate]))
    
    return rgs.best_estimator_

In [43]:
xgb = xgb.XGBClassifier(nthread=-1)
#print(xgb.get_params().keys())
xgb_params = {"max_depth": [3,5,7],
              "learning_rate": [0.01,0.05,0.1,0.15],
              "n_estimators": [50,75,100,150,200],
              "min_child_weight": [0.01,0.005],
              "gamma": [0.1,0.5,0.8,0.9,1.0],
              "subsample":[0.75,0.9,1.0],
              #"eval_metric":['logloss'],
              "objective":["multi:softprob"],
              #"seed":[0],
              "nthread":[-1]
              #,"eval_metric": ['roc_auc']
             }

est = framework(xgb,xgb_params,50)

Parameters combination : 50/1800
Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 82.4min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 279.2min
[Parallel(n_jobs=-1)]: Done 250 out of 250 | elapsed: 353.7min finished


Model with rank: 1 (best model)
Mean validation score: -0.579 (std: 0.044)
Parameters: {'subsample': 0.75, 'objective': 'multi:softprob', 'nthread': -1, 'n_estimators': 75, 'min_child_weight': 0.01, 'max_depth': 7, 'learning_rate': 0.15, 'gamma': 0.1}
Model with rank: 2 
Mean validation score: -0.579 (std: 0.054)
Parameters: {'subsample': 0.75, 'objective': 'multi:softprob', 'nthread': -1, 'n_estimators': 200, 'min_child_weight': 0.005, 'max_depth': 5, 'learning_rate': 0.15, 'gamma': 0.9}
Model with rank: 3 
Mean validation score: -0.580 (std: 0.034)
Parameters: {'subsample': 0.75, 'objective': 'multi:softprob', 'nthread': -1, 'n_estimators': 100, 'min_child_weight': 0.01, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.8}


In [44]:
result = pd.DataFrame(est.predict_proba(test_X),columns=est.classes_)
result['listing_id'] = test['listing_id'].values

In [45]:
result

Unnamed: 0,0,1,2,listing_id
0,0.065015,0.203845,0.731141,6811958
1,0.312899,0.398462,0.288639,6811960
2,0.122461,0.357515,0.520024,6811964
3,0.957467,0.035353,0.007180,6811971
4,0.733189,0.111023,0.155787,6811974
5,0.951445,0.030730,0.017825,6811983
6,0.410698,0.431817,0.157485,6811984
7,0.933338,0.030411,0.036251,6811985
8,0.661238,0.146402,0.192360,6811988
9,0.984740,0.011803,0.003457,6811990


In [52]:
result.rename(columns={0: 'high', 1: 'medium', 2: 'low'}, inplace=True)
result.columns

Index(['high', 'medium', 'low', 'listing_id'], dtype='object')

In [53]:
result[['listing_id','high','medium','low']].to_csv('scores_250317.csv',index=False)