In [None]:
import pandas as pd
import numpy as np
# Load the raw listings; paths are relative to the notebook's working directory.
train_data = pd.read_json('../input/train.json')
test_data = pd.read_json('../input/test.json')
full_data = pd.concat([train_data, test_data])

# Ordinal target: low -> 0, medium -> 1, anything else (i.e. high) -> 2.
train_data['target'] = np.select(
    [train_data['interest_level'].eq('low'), train_data['interest_level'].eq('medium')],
    [0, 1],
    default=2,
)

# One 0/1 indicator column per interest level.
train_data['low'] = train_data['interest_level'].eq('low').astype(int)
train_data['medium'] = train_data['interest_level'].eq('medium').astype(int)
train_data['high'] = train_data['interest_level'].eq('high').astype(int)

#### Find the best criterion for duplicates using cross-validation

In [None]:
# candidates: 'price', 'latitude', 'longitude', 'bathrooms','bedrooms', 'street_address', 'building_id'

In [None]:
%matplotlib inline
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedKFold
import itertools as itertools
from sklearn.metrics import log_loss

In [None]:
def get_skf_indexes(df, target, kfold=4):
    """Build stratified k-fold train/test index arrays for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the label column.
    target : str
        Name of the label column used for stratification.
    kfold : int, default 4
        Number of folds passed to ``StratifiedKFold(n_splits=...)``.

    Returns
    -------
    list
        ``[train_folds, test_folds]``. Each is a list with one index array
        per fold, exactly as yielded by ``StratifiedKFold.split``; entry
        ``i`` of both lists belongs to the same fold.
    """
    y = df[target].values
    # Only the labels drive the stratification, so a zero placeholder stands
    # in for the features instead of materializing the whole frame with
    # ``df.values``.
    placeholder = np.zeros(len(y))
    skf = StratifiedKFold(n_splits=kfold)

    train_folds, test_folds = [], []
    for train_index, test_index in skf.split(placeholder, y):
        train_folds.append(train_index)
        test_folds.append(test_index)
    return [train_folds, test_folds]

In [None]:
def _lr_log_losses(lr, x_fit, y_fit, x_eval, y_eval):
    """Fit ``lr`` on ``(x_fit, y_fit)`` and return ``(fit_loss, eval_loss)`` rounded to 6 decimals."""
    lr.fit(x_fit, y_fit)
    # Pin the label set to the fitted classes: the probability columns follow
    # ``lr.classes_``, and scoring must not fail when a split lacks a class.
    labels = lr.classes_
    fit_loss = np.round(log_loss(y_fit, lr.predict_proba(x_fit), labels=labels), 6)
    eval_loss = np.round(log_loss(y_eval, lr.predict_proba(x_eval), labels=labels), 6)
    return fit_loss, eval_loss


def get_lr_perf(df_train, df_test, feature='__to_check', target='target', n_quantile=20):
    """Score one feature with logistic regression, both raw and bucketed.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames holding the ``feature`` and ``target`` columns.
    feature : str, default '__to_check'
        Column evaluated as the single predictor.
    target : str, default 'target'
        Label column.
    n_quantile : int, default 20
        Percentile *step* used for the bucket edges: edges are the training
        percentiles ``n_quantile, 2 * n_quantile, ...`` below 100, so the
        default of 20 gives up to 5 buckets (fewer when edges coincide).

    Returns
    -------
    dict
        Log-losses rounded to 6 decimals under the keys ``'train.num'`` and
        ``'test.num'`` (raw numeric feature) and ``'train.cat'`` and
        ``'test.cat'`` (one-hot encoded percentile buckets).
    """
    results = {}
    # Inputs
    xtrain = df_train[feature].values.reshape(-1, 1)
    ytrain = df_train[target].values
    xtest = df_test[feature].values.reshape(-1, 1)
    ytest = df_test[target].values

    lr = LogisticRegression()

    # Evaluation as a single numeric feature
    results['train.num'], results['test.num'] = _lr_log_losses(lr, xtrain, ytrain, xtest, ytest)

    # Evaluation as a categorical feature; bucket edges come from the
    # training data only, and coinciding edges are collapsed.
    bins = np.unique(np.percentile(xtrain, np.arange(n_quantile, 100, n_quantile)))
    lb = LabelBinarizer()
    # Bucket ids are handed over as 1-D label vectors.
    x1 = lb.fit_transform(np.digitize(xtrain.ravel(), bins))
    x2 = lb.transform(np.digitize(xtest.ravel(), bins))
    results['train.cat'], results['test.cat'] = _lr_log_losses(lr, x1, ytrain, x2, ytest)
    return results

In [None]:
# number of duplicates, boolean for duplicates 
def duplicate(X, columns):
    """Flag rows of ``X`` that share identical values in ``columns``.

    Two columns are written to ``X`` in place (and ``X`` is returned):

    * ``has_dup`` -- 1 if at least one other row has the same values in
      every one of ``columns``, else 0.
    * ``num_dup`` -- number of rows sharing those values (1 when unique).

    Rows with a missing value in any of ``columns`` are never matched and
    keep ``has_dup == 0`` / ``num_dup == 1``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to annotate; modified in place.
    columns : iterable of str
        Column names defining a duplicate. Any iterable is accepted,
        including the tuples produced by ``itertools.combinations``.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object with ``has_dup`` and ``num_dup`` set.
    """
    # A tuple would be read by pandas as one (multi-level) column key.
    columns = list(columns)
    num_dup = np.ones(len(X), dtype=int)

    if columns:
        keyed = X[columns]
        # Missing values never compare equal, so such rows stay unmatched.
        complete = keyed.notna().all(axis=1).values
        if complete.any():
            # One grouped pass replaces a full-frame scan per duplicate group.
            sizes = (
                keyed.loc[complete]
                .groupby(columns, sort=False)[columns[0]]
                .transform('size')
            )
            num_dup[complete] = sizes.values

    X['has_dup'] = (num_dup > 1).astype(int)
    X['num_dup'] = num_dup
    return X

In [None]:
# all combinations of features
import itertools
# Candidate columns that may define a duplicate listing.
COLUMNS = ['price', 'latitude', 'longitude', 'bathrooms','bedrooms', 'street_address', 'building_id']

# Every non-empty subset of COLUMNS, smallest subsets first; each entry is a
# tuple of column names.
column_list = []
for subset_size in range(1, len(COLUMNS) + 1):
    for combo in itertools.combinations(COLUMNS, subset_size):
        # Function-call form of print runs on Python 2 and Python 3 alike.
        print(combo)
        column_list.append(combo)
len(column_list) # 127 == 2**7 - 1

In [None]:
# Stratified kfold: the fold indices are computed once and reused for every
# candidate column set so the scores are comparable.
idx_train, idx_test = get_skf_indexes(train_data, 'target', kfold=4)

score_columns = ['train.cat', 'train.num', 'test.cat', 'test.num']

# Get results: one record per (column set, fold).
records = []

for columns in column_list:
    # ``columns`` is a tuple; pass a list so pandas reads it as several
    # column names rather than one key.
    train_data = duplicate(train_data, list(columns))
    # Only these two columns are needed for scoring; slicing them avoids
    # copying the full frame for every fold.
    scoring_frame = train_data[['num_dup', 'target']]
    for iper, (i_train, i_test) in enumerate(zip(idx_train, idx_test)):
        print(iper)
        df_train = scoring_frame.iloc[i_train, :]
        df_test = scoring_frame.iloc[i_test, :]

        results = get_lr_perf(df_train, df_test, feature='num_dup', target='target', n_quantile=20)
        results.update({'fold': iper, 'columns': columns})
        records.append(results)

# Build the frame once: appending row by row is quadratic, and
# ``DataFrame.append`` no longer exists in pandas 2.x.
Y = pd.DataFrame(records, columns=['columns', 'fold'] + score_columns)
Y[score_columns] = Y[score_columns].astype(float)