In [None]:
# Use stratified cross-validation to select the optimal parameters of the price-per-sqft proxy

In [None]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedKFold
import itertools as itertools
from sklearn.metrics import log_loss

# Stratified k-fold
def get_skf_indexes(df, target, kfold=4):
    """Build stratified k-fold train/test row positions for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; must contain the ``target`` column.
    target : str
        Name of the column used as the stratification label.
    kfold : int, default 4
        Number of folds to generate.

    Returns
    -------
    list
        ``[train_indexes, test_indexes]`` -- two lists with one entry per
        fold; entry ``i`` of each list is the array of integer row positions
        used for training / testing in fold ``i``.
    """
    y = df[target].values
    # Honour the requested number of folds (it used to be hard-coded to 4).
    skf = StratifiedKFold(n_splits=kfold)
    indexes = [[], []]
    # The split is driven by y alone, so a zero placeholder of the right
    # length stands in for X instead of materialising the whole frame.
    for train_index, test_index in skf.split(np.zeros(len(y)), y):
        indexes[0].append(train_index)  # Training indexes
        indexes[1].append(test_index)  # Test indexes
    return indexes

# Logistic-regression benchmark of a single feature
def get_lr_perf(df_train, df_test, feature='__to_check', target='response', n_quantile=20):
    """Score one feature with logistic regression, raw and quantile-bucketed.

    Two default ``LogisticRegression`` fits are made on the training frame:
    one on the raw feature values, one on the feature digitized into
    percentile buckets and expanded with ``LabelBinarizer``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames holding the ``feature`` and ``target`` columns.
    feature : str
        Column evaluated as the single predictor.
    target : str
        Column holding the class labels.
    n_quantile : int
        Step, in percentile points, between bucket edges; edges are taken at
        ``n_quantile, 2*n_quantile, ...`` below 100 on the training feature.

    Returns
    -------
    dict
        Log losses rounded to 6 decimals under the keys ``'train.num'``,
        ``'test.num'``, ``'train.cat'`` and ``'test.cat'``.
    """
    # Column vectors for the feature, flat label arrays for the target
    train_x = df_train[feature].values.reshape(-1, 1)
    test_x = df_test[feature].values.reshape(-1, 1)
    train_y = df_train[target].values
    test_y = df_test[target].values

    scores = {}
    model = LogisticRegression()

    # 1) Feature used as-is (numeric)
    model.fit(train_x, train_y)
    scores['train.num'] = np.round(log_loss(train_y, model.predict_proba(train_x)), 6)
    scores['test.num'] = np.round(log_loss(test_y, model.predict_proba(test_x)), 6)

    # 2) Feature bucketed on training percentiles (duplicate edges dropped)
    #    and expanded into indicator columns
    percentile_steps = np.arange(n_quantile, 100, n_quantile)
    edges = np.unique(np.percentile(train_x, percentile_steps))
    binarizer = LabelBinarizer()
    train_buckets = binarizer.fit_transform(np.digitize(train_x, edges))
    test_buckets = binarizer.transform(np.digitize(test_x, edges))
    model.fit(train_buckets, train_y)
    scores['train.cat'] = np.round(log_loss(train_y, model.predict_proba(train_buckets)), 6)
    scores['test.cat'] = np.round(log_loss(test_y, model.predict_proba(test_buckets)), 6)

    return scores

In [None]:
train_data = pd.read_json('train.json')
# Ordinal encoding of interest_level: 'low' -> 0, 'medium' -> 1, anything else -> 2
train_data['target'] = train_data['interest_level'].apply(
    lambda level: 0 if level == 'low' else 1 if level == 'medium' else 2
)
# The cross-validation cells below read a frame named `df` with a `response`
# column, neither of which is defined anywhere else in this notebook, so a
# fresh kernel fails with NameError. Expose the training data under those names.
# NOTE(review): assumes `response` is meant to be the encoded interest level -- confirm.
df = train_data.assign(response=train_data['target'])

In [None]:
# Candidate values for the proxy
#   price / (A + clip(bedrooms, C[0], C[1]) + B * clip(bathrooms, D[0], D[1]))
#
# Full grid to check
AA = (0.1, 0.5, 1, 2)                          # A: additive offset
BB = (0, 0.25, 0.5, 1, 2)                      # B: weight on bathrooms
CC = ((0, 4), (0, 3), (1, 4), (1, 3), (0, 2))  # C: (lower, upper) clip for bedrooms
DD = ((0, 3), (0, 2), (1, 3), (1, 2))          # D: (lower, upper) clip for bathrooms

# Reduced grid actually run here (overrides the full grid above)
AA = (0.5, 1, 2)
BB = (0.25, 0.5, 1)
CC = ((0, 4), (0, 3), (1, 4), (1, 3))
DD = ((0, 3), (0, 2))

In [None]:
# Stratified k-fold: one array of training row positions and one of test row
# positions per fold. Two folds are requested so the notebook runs quickly;
# the full experiment used kfold=4.
idx_train, idx_test = get_skf_indexes(df, target='response', kfold=2)

In [None]:
# Get results: one row per (fold, parameter combination)
records = []
for iper, (i_train, i_test) in enumerate(zip(idx_train, idx_test)):
    print(iper)
    # Work on copies so the candidate feature column never touches `df`
    df_train = df.iloc[i_train, :].copy()
    df_test = df.iloc[i_test, :].copy()
    # For each parameter combination
    for A, C, D, B in itertools.product(AA, CC, DD, BB):
        # Same proxy on both splits:
        #   price / (A + clipped bedrooms + B * clipped bathrooms)
        for frame in (df_train, df_test):
            denominator = A + frame.bedrooms.clip(C[0], C[1]) + B * frame.bathrooms.clip(D[0], D[1])
            frame['__to_check'] = (frame.price / denominator).values
        results = get_lr_perf(df_train, df_test, feature='__to_check', target='response', n_quantile=20)
        results.update({'fold': iper, 'params': {'A': A, 'B': B, 'C': C, 'D': D}})
        records.append(results)
# Build the frame once: DataFrame.append was removed in pandas 2.0 and
# re-copied every previous row on each call.
Y = pd.DataFrame(records)
for i in ['train.cat', 'train.num', 'test.cat', 'test.num']:
    Y[i] = Y[i].astype(float)

In [None]:
# From these results we can conclude that the best proxy for price/sqft using
# price, bedrooms and bathrooms is the parameter combination with the lowest
# test log loss on the bucketed feature (rows are sorted ascending by 'test.cat').
Y.sort_values(by='test.cat')

In [None]:
# Optimal parameters, kept as one-element grids so they have the same shape as
# the candidate grids and can be passed to itertools.product(AA, CC, DD, BB).
# Parentheses alone do not create a tuple -- `(1)` is the int 1 and `((1, 4))`
# is the pair (1, 4) -- so the trailing commas are required.
AA = (1,)
CC = ((1, 4),)
DD = ((0, 2),)
BB = (0.5,)