In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
import string
import warnings
from collections import Counter
from xgboost.sklearn import XGBClassifier
import math

warnings.filterwarnings('ignore')



In [2]:
# Translation table that maps every punctuation character to a space.
# `string.maketrans` only exists on Python 2; fall back to `str.maketrans`
# (never evaluated on Python 2 thanks to short-circuiting) so the table can
# also be built on Python 3.
_maketrans = getattr(string, 'maketrans', None) or str.maketrans
trans = _maketrans(string.punctuation, len(string.punctuation) * ' ')

# Whole-token replacements applied to upper-cased address words:
# compass/street-type abbreviations are expanded and spelled-out or
# suffixed ordinals are reduced to plain digits.
RD = {
    # compass directions
    'E': 'EAST',
    'W': 'WEST',
    'N': 'NORTH',
    'S': 'SOUTH',
    # street types
    'ST': 'STREET',
    'AVE': 'AVENUE',
    'AV': 'AVENUE',
    'PL': 'PLACE',
    'PKWY': 'PARKWAY',
    'RD': 'ROAD',
    'BLVD': 'BOULEVARD',
    'DR': 'DRIVE',
    'CT': 'COURT',
    'LN': 'LANE',
    'LA': 'LANE',
    # ordinals
    '1ST': '1',
    '2ND': '2',
    '3RD': '3',
    'FIRST': '1',
    'SECOND': '2',
    'THIRD': '3',
    'FOURTH': '4',
    'FIFTH': '5',
    'SIXTH': '6',
    'SEVENTH': '7',
    'EIGHTH': '8',
    'NINTH': '9',
    'TENTH': '10',
    'ELEVENTH': '11',
    '11TH': '11',
    '12TH': '12',
    '13TH': '13',
}

# Substring replacements (applied with str.replace, not per token) that strip
# the ordinal suffix after a trailing digit, e.g. '42ND' -> '42'.
RD2 = {
    '1ST': '1',
    '2ND': '2',
    '3RD': '3',
    '4TH': '4',
    '5TH': '5',
    '6TH': '6',
    '7TH': '7',
    '8TH': '8',
    '9TH': '9',
    '0TH': '0',
}

# Expanded compass words and street-type words looked up by street_name().
RD3 = ['EAST', 'WEST', 'NORTH', 'SOUTH']
sn = ['STREET', 'AVENUE', 'PLACE', 'PARKWAY', 'ROAD', 'BOULEVARD', 'DRIVE', 'COURT', 'LANE']

In [3]:
# Extracting street name, eliminating any leading numbers 
def street_name(x):
    """Drop a leading house number from a cleaned address string.

    The first token is removed when it is all digits and either
    (a) some street-type word from ``sn`` first occurs at token index 2 or
    later, or (b) some compass word from ``RD3`` first occurs at token
    index 1. In every other case ``x`` is returned unchanged.
    """
    tokens = x.split()

    # Nothing to strip unless the address starts with a purely numeric token.
    if not tokens or not tokens[0].isdigit():
        return x

    without_number = ' '.join(tokens[1:])

    if any(kind in tokens and tokens.index(kind) > 1 for kind in sn):
        return without_number

    if any(direction in tokens and tokens.index(direction) == 1 for direction in RD3):
        return without_number

    return x

In [4]:
# Coding address using a coding list for cleaned addresses
def code_addr(addr, addr_coding_list):
    """Map a raw display address to the code of its best-matching street.

    The address is cleaned the same way ``create_address_list`` cleans its
    input (punctuation to spaces, upper-cased, ``RD`` token replacements,
    ``RD2`` substring replacements). Every ``(code, subaddr)`` entry whose
    ``subaddr`` is a substring of the cleaned address is a candidate; the
    code of the longest ``subaddr`` wins, the earliest entry winning ties.

    Parameters
    ----------
    addr : text object supporting ``.encode('utf-8')``
    addr_coding_list : iterable of ``(code, subaddr)`` pairs

    Returns
    -------
    The winning code, or ``-1`` when nothing matches. ``-1`` is the code
    ``create_address_list`` assigns to its catch-all ``''`` entry.
    """
    addr = str(addr.encode('utf-8'))
    addr = addr.translate(trans)
    addr = addr.replace('\n', ' ').replace('\r', ' ')
    addr = addr.upper().strip()

    addr = ' '.join([RD.get(w, w) for w in addr.split()])

    for w1, w2 in RD2.items():
        addr = addr.replace(w1, w2)

    # The original wrapped this in a bare `except:` that printed the address
    # and then crashed with a NameError on the unbound result; let the real
    # exception surface instead.
    matches = [(code, subaddr) for (code, subaddr) in addr_coding_list if subaddr in addr]

    if not matches:
        return -1

    # max() returns the first maximal element, which is what the original
    # stable descending sort followed by [0] selected.
    return max(matches, key=lambda m: len(str(m[1])))[0]

In [5]:
# Cleaning addresses and creating the address coding list
def create_address_list(data):
    """Build the ``(code, street)`` coding list from raw display addresses.

    Each address is cleaned (punctuation to spaces, newlines removed,
    upper-cased, stripped, ``RD`` token replacements, ``RD2`` substring
    replacements), passed through ``street_name`` and de-duplicated.
    Entries that are all digits, or whose length is not strictly between
    3 and 50 characters, are discarded.

    Returns
    -------
    list of ``(int, str)`` tuples numbered from 0, followed by the
    catch-all entry ``(-1, '')``. The numbering follows set iteration
    order, so it is not guaranteed to be stable between interpreter runs.
    """
    street = [str(x.encode('utf-8')) for x in data]
    street = [x.translate(trans).replace('\n', ' ').replace('\r', ' ').upper().strip()
              for x in street]

    street = [' '.join([RD.get(w, w) for w in x.split()]) for x in street]

    for w1, w2 in RD2.items():
        street = [x.replace(w1, w2) for x in street]

    street = [street_name(x) for x in street]

    street = list(set(street))

    street = [x for x in street if not x.isdigit() and 3 < len(x) < 50]

    # list(enumerate(...)) yields the same tuples as zip(range(n), street) but
    # is a real list on every Python version, so the append below is safe.
    coding_list = list(enumerate(street))
    coding_list.append((-1, ''))

    return coding_list


In [6]:
# Columns fed to the models. prepare_data_first() appends further names
# to this list while it builds the training features.
all_features = [
    'bathrooms',
    'bedrooms',
    'desc_length',
    'features_length',
    'photos_no',
    'price',
    'price_per_bedroom',
    'bedroom_per_bathroom',
    'day',
    'hour',
    'price_diff',
    'bedrooms_diff',
    'latitude',
    'longitude',
    'price_per_bedroom_diff',
    'disp_addr',
    'area',
]

# Categorical columns replaced by smoothed target-class probabilities.
prob_trans_features = [
    'disp_addr',
    'building_id',
    'manager_id',
    'area',
    'manager_id_area',
    'area_price',
]

# Categorical columns passed through a LabelEncoder.
label_encode_features = [
    'building_id',
    'manager_id',
    'street_address',
]

# Fitted artifacts (encoders, vectorizers, lookup tables) shared between
# the training-time and test-time preparation functions.
models_dict = {}

# Number of LDA topics extracted from the description text.
n_desc_topics = 10

# Number of listing features that get their own 0/1 indicator column.
n_features = 70


In [7]:
# Tokeniser and lemmatiser for the text of the description feature
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer(object):
    """Callable tokenizer that splits on whitespace and lemmatizes each token.

    Passed as ``tokenizer=`` to the TF-IDF vectorizer built in
    ``prepare_data_first``.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        # Drop non-ASCII characters, then turn punctuation into spaces.
        ascii_doc = doc.encode('ascii', 'ignore')
        cleaned = ascii_doc.translate(trans)

        lemmas = []
        for token in cleaned.split():
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

In [8]:
# Coding high-cardinality categorical variables as target-variable probabilities using empirical Bayes
from math import exp

def transform_categerical_feature(train, f, high_prior_prob, medium_prior_prob):
    """Encode categorical column ``f`` as smoothed class probabilities.

    For every distinct value of ``train[f]`` the mean of ``is_high`` (and of
    ``is_medium``) is blended with the corresponding prior:

        weight = 1 / (1 + exp(-(count - 10)))
        prob   = weight * group_mean + (1 - weight) * prior

    so groups with few rows stay close to the prior.

    Parameters
    ----------
    train : DataFrame containing ``f``, ``is_high`` and ``is_medium``.
    f : str, name of the categorical column.
    high_prior_prob, medium_prior_prob : float, fallback probabilities.

    Returns
    -------
    DataFrame with columns ``[f, 'high_prob_' + f, 'medium_prob_' + f]``,
    one row per distinct value of ``f``.
    """
    def _smoothed(target_col, prior_prob, out_col):
        # Build mean/count explicitly instead of `.agg({'name': func})`:
        # the dict-renaming form on a SeriesGroupBy is rejected by current
        # pandas releases, while this spelling works on old and new versions.
        grouped = train.groupby(f)[target_col]
        stats = pd.DataFrame({'post_prob': grouped.mean(),
                              'cnt': grouped.count()}).reset_index()

        # Logistic weight on the group size, computed for all groups at once.
        weight = 1.0 / (1.0 + np.exp(-(stats['cnt'] - 10.0) / 1.0))
        stats[out_col] = weight * stats['post_prob'] + (1.0 - weight) * prior_prob
        return stats[[f, out_col]]

    est_high = _smoothed('is_high', high_prior_prob, 'high_prob_' + f)
    est_medium = _smoothed('is_medium', medium_prior_prob, 'medium_prob_' + f)

    return pd.merge(est_high, est_medium, on=f)


def add_prob_columns(data, trans_df, field):
    """Attach the smoothed probability columns for ``field`` to ``data``.

    ``trans_df`` (as produced by ``transform_categerical_feature``) is
    left-merged on ``field``. Rows whose ``field`` equals ``-1``, and rows
    that found no match in ``trans_df``, get independent random values
    drawn uniformly from [0.001, 0.5) for both probability columns.

    Returns the merged DataFrame; the input frame itself is not modified.
    """
    data = data.merge(trans_df, on=field, how='left')

    prob_cols = ['high_prob_' + field, 'medium_prob_' + field]

    # -1 marks an unknown category: overwrite whatever the merge produced.
    unknown = data[field] == -1
    n_unknown = int(unknown.sum())
    for col in prob_cols:
        data.loc[unknown, col] = np.random.uniform(low=0.001, high=0.5, size=n_unknown)

    # Categories absent from trans_df come out of the left merge as NaN.
    for col in prob_cols:
        missing = data[col].isnull()
        data.loc[missing, col] = np.random.uniform(low=0.001, high=0.5,
                                                   size=int(missing.sum()))

    # The original had a stray bare name (`addresses`) here, which raises
    # NameError unless that name happens to exist in the kernel.
    return data

    
def add_prob_training(train_df, high_prior_prob, medium_prior_prob, field):
    """Add out-of-fold probability columns for ``field`` to the training data.

    The rows are split into 5 stratified folds on ``interest_level``. For
    each fold the encoding is estimated on the other four folds and applied
    to the held-out rows, so no row is encoded with its own target.

    Returns the held-out pieces concatenated in fold order, so the row order
    (and the index) of the result differs from ``train_df``.
    """
    kfold = StratifiedKFold(n_splits=5)
    folds = []

    for itrain, itest in kfold.split(np.zeros(len(train_df)), train_df['interest_level']):
        trans_df = transform_categerical_feature(train_df.iloc[itrain], field,
                                                 high_prior_prob, medium_prior_prob)
        folds.append(add_prob_columns(train_df.iloc[itest], trans_df, field))

    # One concat at the end instead of DataFrame.append inside the loop:
    # append copies the accumulated frame every time and no longer exists in
    # pandas 2.
    return pd.concat(folds) if folds else pd.DataFrame()

In [9]:
# Convert a list of features into a probability
# Punctuation to blank out in listing features. '-' is kept because several
# cleaning_list patterns contain a hyphen; newline, carriage return and tab
# are blanked too.
punct = string.punctuation.replace('-','')+'\n\r\t'
# `string.maketrans` only exists on Python 2; fall back to `str.maketrans`
# (never evaluated on Python 2) so the table can also be built on Python 3.
trans_f = (getattr(string, 'maketrans', None) or str.maketrans)(punct, len(punct) * ' ')
from collections import OrderedDict
import re

# Canonical feature names with the substrings that map to them. Order
# matters: clean_feature() returns the canonical name of the FIRST pattern
# found in a feature string, scanning top to bottom, left to right.
_feature_aliases = [
    ('BALCONY', ['BALCONY']),
    ('AIR CONDITIONING', ['CENTRAL AC', 'CENTRAL A C', 'AIR-CONDITIONING', 'AIR CONDITIONING']),
    ('DOORMAN', ['CONCIERGE', 'DOORMAN']),
    ('HARDWOOD', ['HARDWOOD']),
    ('LAUNDRY', ['LAUNDRY']),
    ('PREWAR', ['PRE-WAR', 'PREWAR', 'PRE WAR']),
    ('POSTWAR', ['POST-WAR', 'POSTWAR', 'POST WAR']),
    ('FITNESS', ['FITNESS', 'GYM', 'HEALTH CLUB']),
    ('OUTDOOR SPACE', ['OUTDOOR SPACE', 'OUTDOOR-SPACE']),
    ('ROOF DECK', ['ROOFDECK', 'ROOF-DECK', 'ROOF DECK', 'ROOFTOP']),
    ('PATIO', ['PATIO']),
    ('POOL', ['POOL']),
    ('MARBLE BATH', ['MARBLE BATH']),
    ('RENOVATED', ['RENOVATED']),
    ('PARKING', ['PARKING']),
    ('WALK IN CLOSET', ['WALK IN CLOSET']),
    ('WASHER DRYER', ['WASHER DRYER', 'WASHER IN UNIT', 'WASHER AND DRYER']),
    ('GRANITE COUNTER', ['GRANITE COUNTER', 'GRANITE KITCHEN']),
    ('SCREENING ROOM', ['SCREENING ROOM']),
    ('LARGE LIVING ROOM', ['MASSIVE LIVING-ROOM', 'LARGE LIVING-ROOM', 'MASSIVE LIVING ROOM',
                           'LARGE LIVING ROOM', 'HUGE LIVING-ROOM', 'HUGE LIVING ROOM',
                           'LIVING DINING ROOM']),
    ('BIKE STORAGE', ['BIKE STORAGE', 'BICYCLE ROOM', 'BIKE ROOM', 'BIKE STROLLER']),
    ('DUPLEX', ['DUPLEX']),
    ('MICROWAVE', ['MICROWAVE']),
    ('WHEELCHAIR', ['WHEELCHAIR']),
    ('PLAYROOM', ['PLAYROOM']),
    ('TRANSPORT', ['SUBWAY', 'TRANSPORT']),
    ('BACKYARD', ['BACKYARD']),
    ('UTILITIES INCLUDED', ['UTILITIES INCLUDED', 'HEAT INCLUDED', 'GAS INCLUDED', 'WATER INCLUDED']),
    ('TENANT LOUNGE', ['RESIDENT LOUNGE', 'SOCIAL LOUNGE']),
]

# Flat list of (pattern, canonical name) pairs, in matching priority order.
cleaning_list = list(
    (_pattern, _canonical)
    for _canonical, _patterns in _feature_aliases
    for _pattern in _patterns
)


def clean_features_list(features):
    """Normalise every raw feature string of one listing.

    Each string is UTF-8 encoded, upper-cased, has the ``trans_f``
    punctuation blanked, is stripped, has runs of whitespace collapsed to a
    single space and is finally mapped through ``clean_feature``.
    """
    cleaned = []
    for raw in features:
        text = str(raw.encode('utf_8')).upper().translate(trans_f).strip()
        collapsed = re.sub(r'\s+', ' ', text)
        cleaned.append(clean_feature(collapsed))
    return cleaned

def clean_feature(f):
    """Return the canonical name of the first ``cleaning_list`` pattern
    contained in ``f``, or ``f`` itself when no pattern matches."""
    canonical = next((name for pattern, name in cleaning_list if pattern in f), None)
    if canonical is None:
        return f
    return canonical

def features_probability(data):
    """Estimate per-feature class likelihoods and pick the indicator features.

    Only the 500 most common cleaned features are considered. For each one
    the function estimates, with additive smoothing (k = 0.05), how likely a
    listing of a given class is to carry it.

    Parameters
    ----------
    data : DataFrame with a ``features`` column (list of strings per row)
        and an ``interest_level`` column ('high' / 'medium' / 'low').

    Returns
    -------
    features_prob : list of ``(feature, p_high, p_not_high, p_medium,
        p_not_medium)`` tuples.
    features_list : list of ``(code, feature)`` pairs for at most
        ``n_features`` features: the top half ranked by ``p_high``, filled
        up with features ranked by ``p_medium`` that are not already chosen.
    """
    # Clean every listing's features once and reuse the result for both the
    # frequency count and the per-class count.
    cleaned_lists = [clean_features_list(fl) for fl in list(data.features)]

    frequency = Counter(item for cleaned in cleaned_lists for item in cleaned)
    features_count = {f: [0.0, 0.0, 0.0] for f, _ in frequency.most_common(n=500)}

    idx = {'high': 0, 'medium': 1, 'low': 2}

    for cleaned, il in zip(cleaned_lists, list(data.interest_level)):
        for f in cleaned:
            if f in features_count:
                features_count[f][idx[il]] += 1

    h_count = int((data.interest_level == 'high').sum())
    m_count = int((data.interest_level == 'medium').sum())
    l_count = int((data.interest_level == 'low').sum())

    k = 0.05
    features_prob = []
    for f, c in features_count.items():
        features_prob.append((f,
                             (k + c[0]) / (2 * k + h_count), # high probability
                             (k + c[1] + c[2]) / (2 * k + m_count + l_count), # not high probability
                             (k + c[1]) / (2 * k + m_count), # medium probability
                             (k + c[0] + c[2]) / (2 * k + h_count + l_count), # not medium probability
                             ))

    by_high = [i[0] for i in sorted(features_prob, key=lambda x: x[1], reverse=True)]
    by_medium = [i[0] for i in sorted(features_prob, key=lambda x: x[3], reverse=True)]

    selected = by_high[:int(n_features * 0.5)]
    chosen = set(selected)

    # Fill up from the medium ranking. Stopping when the ranking runs out
    # avoids the IndexError the original raised when the data holds fewer
    # than n_features distinct features.
    for name in by_medium:
        if len(selected) >= n_features:
            break
        if name not in chosen:
            selected.append(name)
            chosen.add(name)

    # A real list (not a one-shot iterator): callers scan it once per row.
    return features_prob, list(enumerate(selected))


def calc_features_probability(features_prob, features_list, features,  h_cnt=1, m_cnt=1, all_cnt=3):
    """Encode one listing's feature list.

    Produces a naive-Bayes style posterior for "high" and for "medium"
    interest from the per-feature likelihoods in ``features_prob``, plus
    ``n_features`` 0/1 indicators for the features in ``features_list``.

    Parameters
    ----------
    features_prob : iterable of ``(feature, p_high, p_not_high, p_medium,
        p_not_medium)`` tuples.
    features_list : iterable of ``(code, feature)`` pairs; ``code`` is the
        position of the indicator for that feature.
    features : raw list of feature strings for one listing.
    h_cnt, m_cnt, all_cnt : class counts used as priors; the defaults give
        a prior of 1/3 to each of "high" and "medium".

    Returns
    -------
    pandas.Series indexed ``['h', 'm', 'feat0', ..., 'feat<n_features-1>']``
    in exactly that order. An empty ``features`` list yields random ``h`` /
    ``m`` values in [0.005, 0.8) and all-zero indicators.
    """
    h_prob = h_cnt / float(all_cnt)
    not_h_prob = (all_cnt - h_cnt) / float(all_cnt)
    m_prob = m_cnt / float(all_cnt)
    not_m_prob = (all_cnt - m_cnt) / float(all_cnt)

    # Build the result from an explicit ordered index. The original filled an
    # OrderedDict via update({...}); the order then came from the plain dict
    # argument, which is arbitrary on Python 2, so downstream positional
    # column pairing could attach values to the wrong names.
    labels = ['h', 'm'] + ['feat' + str(i) for i in range(n_features)]

    if len(features) == 0:
        h_val = np.random.uniform(high=0.8, low=0.005)
        m_val = np.random.uniform(high=0.8, low=0.005)
        return pd.Series([h_val, m_val] + [0] * n_features, index=labels)

    fl = clean_features_list(features)
    present = set(fl)

    f_h_prob = 0
    f_not_h_prob = 0
    f_m_prob = 0
    f_not_m_prob = 0

    # Sum log-likelihoods over ALL known features: present ones contribute
    # log(p), absent ones log(1 - p).
    for f, h, nh, m, nm in features_prob:
        if f in present:
            f_h_prob += math.log(h)
            f_not_h_prob += math.log(nh)

            f_m_prob += math.log(m)
            f_not_m_prob += math.log(nm)
        else:
            f_h_prob += math.log(1.0 - h)
            f_not_h_prob += math.log(1.0 - nh)

            f_m_prob += math.log(1.0 - m)
            f_not_m_prob += math.log(1.0 - nm)

    # feature -> first code listed for it, built once instead of scanning
    # features_list for every feature of the listing.
    code_of = {}
    for code, subfeat in features_list:
        code_of.setdefault(subfeat, code)

    coded = [0] * n_features
    for f in present:
        if f in code_of:
            coded[code_of[f]] = 1

    f_h_prob = math.exp(f_h_prob)
    f_not_h_prob = math.exp(f_not_h_prob)
    f_m_prob = math.exp(f_m_prob)
    f_not_m_prob = math.exp(f_not_m_prob)

    # A small random jitter is added to each posterior, as in the original.
    h_val = np.random.uniform(0.0, 0.0005) + (f_h_prob * h_prob) / (f_h_prob * h_prob + f_not_h_prob * not_h_prob)
    m_val = np.random.uniform(0.0, 0.0005) + (f_m_prob * m_prob) / (f_m_prob * m_prob + f_not_m_prob * not_m_prob)

    return pd.Series([h_val, m_val] + coded, index=labels)


def add_feature_prob_columns(data, features_prob, features_list, training=False):
    """Add the feature-derived columns to ``data`` (modified in place).

    Every row's ``features`` list is encoded with
    ``calc_features_probability``; the result is stored as
    ``high_prob_features``, ``medium_prob_features`` and
    ``feat0`` ... ``feat<n_features-1>``.

    Parameters
    ----------
    training : bool
        When True the class priors are taken from ``data.interest_level``;
        otherwise a uniform prior (1 of 3 for each class) is used.

    Returns ``data``.
    """
    if training:
        t = len(data)
        h = len(data.loc[data.interest_level=='high'])
        m = len(data.loc[data.interest_level=='medium'])
    else:
        t = 3
        h = m = 1

    encoded = data.features.apply(lambda fl:
                                  calc_features_probability(features_prob,
                                                            features_list, fl,
                                                            h, m, t))

    feat_cols = ['feat' + str(i) for i in range(n_features)]
    source_cols = ['h', 'm'] + feat_cols
    target_cols = ['high_prob_features', 'medium_prob_features'] + feat_cols

    # Pair columns by NAME. Assigning the whole frame to a list of columns
    # pairs them by position, which is only correct if the encoder emits its
    # values in exactly the expected order.
    for src, dst in zip(source_cols, target_cols):
        data[dst] = encoded[src]

    return data

    
def add_feature_prob_training(train_df, features_list):
    """Add out-of-fold feature-probability columns to the training data.

    The rows are split into 5 stratified folds on ``interest_level``; the
    per-feature likelihoods are estimated on four folds and applied to the
    held-out one.

    Returns the held-out pieces concatenated in fold order.
    """
    kfold = StratifiedKFold(n_splits=5)
    folds = []

    for itrain, itest in kfold.split(np.zeros(len(train_df)), train_df['interest_level']):
        f_prob, _ = features_probability(train_df.iloc[itrain])
        folds.append(add_feature_prob_columns(train_df.iloc[itest], f_prob, features_list, True))

    # One concat at the end instead of DataFrame.append inside the loop:
    # append copies the accumulated frame every time and no longer exists in
    # pandas 2.
    return pd.concat(folds) if folds else pd.DataFrame()


In [10]:
# Dividing the area to small squares and number them 
def add_location_features(data):
    """Bucket listings into a 100 x 100 grid and add per-cell statistics.

    Latitude/longitude are clipped to a fixed bounding box (``data`` is
    modified in place for these two columns and for the new ``area``
    column). ``area`` is ``lat_index * 100 + lon_index`` with both indices
    in 0..99. Per-area mean price, listing count, mean bedrooms and mean
    price per bedroom are merged in, along with each listing's difference
    from its area's means.

    Requires the columns ``latitude``, ``longitude``, ``price``,
    ``building_id``, ``bedrooms`` and ``price_per_bedroom``.

    Returns a new merged DataFrame with ``area``, ``price_avg``,
    ``area_density``, ``bedrooms_avg``, ``price_per_bedroom_avg``,
    ``price_diff``, ``bedrooms_diff`` and ``price_per_bedroom_diff`` added.
    """
    lat_max = 40.925
    lat_min = 40.491
    lon_max = -73.705
    lon_min = -74.251

    data.loc[data.latitude > lat_max, 'latitude'] = lat_max
    data.loc[data.latitude < lat_min, 'latitude'] = lat_min
    data.loc[data.longitude > lon_max, 'longitude'] = lon_max
    data.loc[data.longitude < lon_min, 'longitude'] = lon_min

    lon = np.floor((data.longitude.values - lon_min) / (lon_max - lon_min) * 100)
    lat = np.floor((data.latitude.values - lat_min) / (lat_max - lat_min) * 100)

    # A coordinate sitting exactly on the upper bound (every clipped outlier
    # does) would get index 100. With `lat * 100 + lon` that collides with
    # cell 0 of the next row, so fold it into the last cell instead.
    lon = np.minimum(lon, 99)
    lat = np.minimum(lat, 99)

    data['area'] = lat * 100 + lon

    area_stats = data.groupby('area', as_index=False).agg({'price':'mean',
                                                           'building_id':'count',
                                                           'bedrooms':'mean',
                                                           'price_per_bedroom':'mean'
                                                          })

    area_stats = area_stats.rename(columns={'price':'price_avg',
                                            'building_id':'area_density',
                                            'bedrooms':'bedrooms_avg',
                                            'price_per_bedroom':'price_per_bedroom_avg'
                                             })

    data = data.merge(area_stats, on='area', how='left')

    data['price_diff'] = data['price'].values - data['price_avg'].values
    data['bedrooms_diff'] = data['bedrooms'].values - data['bedrooms_avg'].values
    data['price_per_bedroom_diff'] = data['price_per_bedroom'].values - data['price_per_bedroom_avg'].values
    return data


In [11]:
# Prepare variables for training the model
def prepare_data_first(train_df, test_df):
    """Build all model features on the training data and fit the helpers.

    Adds the engineered columns to ``train_df``, stores every fitted
    artifact the test-time ``prepare_data`` needs in the module-level
    ``models_dict``, and appends the generated column names to the
    module-level ``all_features`` list.

    ``test_df`` is only read: its ``display_address`` values extend the
    address coding list and its categorical values are included when the
    label encoders are fitted.

    Note: ``all_features`` is extended on every call, so calling this
    function twice in one kernel session duplicates entries in that list.

    Returns the transformed training DataFrame. It is a new object, because
    the merge/concat steps below rebuild the frame.
    """
    
    global models_dict
    print 'Extracting numerical features...'
    # Ratios fall back to the numerator when the denominator is zero.
    train_df.loc[:, 'price_per_bedroom'] = train_df.apply(lambda r: r['price'] / float(r['bedrooms']) 
                                               if r['bedrooms']>0 else r['price'], axis=1)

    train_df.loc[:, 'bedroom_per_bathroom'] = train_df.apply(lambda r: r['bedrooms'] / float(r['bathrooms']) 
                                               if r['bathrooms']>0 else r['bedrooms'], axis=1)

    train_df.loc[:, 'price_per_all_rooms'] = train_df.apply(lambda r: r['price'] / float(r['bathrooms']+r['bedrooms']) 
                                               if r['bedrooms']+r['bathrooms']>0 else r['price'], axis=1)

    # Size features: word count of the description, number of listed
    # features, number of photos.
    train_df.loc[:,'desc_length'] = train_df.description.apply(lambda x: len(x.split()))
    train_df.loc[:,'features_length'] = train_df.features.apply(len)
    train_df.loc[:,'photos_no'] = train_df.photos.apply(len)

    train_df.loc[:,'created'] = pd.to_datetime(train_df.loc[:, 'created'])
    train_df.loc[:,'day'] = train_df.created.apply(lambda d: d.day)
    train_df.loc[:,'hour'] = train_df.created.apply(lambda d: d.hour)
    
    print 'Building coding list for display address...'
    # The coding list covers train AND test addresses, so test rows can be
    # coded with the same list later.
    addr_coding_list = create_address_list(list(train_df.display_address) + list(test_df.display_address))
    models_dict['addr_coding_list'] = addr_coding_list
    
    print 'Encoding and extracting features from "display address"...'
    train_df['disp_addr'] = train_df.display_address.apply(lambda r : code_addr(r, addr_coding_list))
    
    print 'Adding location features...'
    train_df = add_location_features(train_df)
    
    print 'Pairing features...'
    # Price bucketed to units of 100 (rounded up), used to build the
    # combined categorical keys below.
    train_df['t_price'] = np.ceil(train_df.price / 100)
    
    train_df['manager_id_area'] = train_df.manager_id + '-' + train_df.area.astype('str') 
    train_df['manager_id_building_id'] = train_df.manager_id + '-' + train_df.building_id
    train_df['manager_id_display_address'] = train_df.manager_id + '-' + train_df.display_address
    train_df['manager_id_price'] = train_df.manager_id + '-' + train_df.t_price.astype('str')
    
    train_df['disp_addr_price'] = train_df.disp_addr.astype('str') + '-' + train_df.t_price.astype('str')
    train_df['display_address_price'] = train_df.display_address + '-' + train_df.t_price.astype('str')
    train_df['area_price'] = train_df.area.astype('str') + '-' + train_df.t_price.astype('str')
    train_df['building_id_price'] = train_df.building_id + '-' + train_df.t_price.astype('str')
    
    train_df['manager_id_disp_addr_price'] = train_df.manager_id + '-' + train_df.disp_addr.astype('str') + '-' + train_df.t_price.astype('str')
    train_df['manager_id_building_id_price'] = train_df.manager_id + '-' + train_df.building_id + '-' + train_df.t_price.astype('str')
    train_df['manager_id_area_price'] = train_df.manager_id + '-' + train_df.area.astype('str') + '-' + train_df.t_price.astype('str')
    
    # Class priors, used as the fallback in the smoothed probability
    # encoding further down.
    total_count = len(train_df)
    high_prior_prob = len(train_df.loc[train_df.interest_level == 'high']) / float(total_count)
    medium_prior_prob = len(train_df.loc[train_df.interest_level == 'medium']) / float(total_count)
    
    # One-hot indicator columns for the three interest levels.
    train_df.loc[:, 'is_high'] = 0
    train_df.loc[train_df.interest_level == 'high', 'is_high'] = 1
    
    train_df.loc[:, 'is_medium'] = 0
    train_df.loc[train_df.interest_level == 'medium', 'is_medium'] = 1
    
    train_df.loc[:, 'is_low'] = 0
    train_df.loc[train_df.interest_level == 'low', 'is_low'] = 1
    
    
    print 'Extracting TFIDF features from "description"...'
    # TF-IDF over the description, then reduced to n_desc_topics LDA topic
    # weights; both fitted models are kept for the test data.
    TF_IDF_vectorizer_desc = TfidfVectorizer(stop_words='english', max_features=500, tokenizer=LemmaTokenizer())
    desc = TF_IDF_vectorizer_desc.fit_transform(train_df.description)
    
    lda_desc = LatentDirichletAllocation(n_topics=n_desc_topics, max_iter=20)
    desc_coded = lda_desc.fit_transform(desc)
    
    for col_idx in range(n_desc_topics):
        train_df.loc[:, 'desc'+ str(col_idx)] = desc_coded[:, col_idx]
        all_features.append('desc'+ str(col_idx))
    
    models_dict['TFIDF_desc'] = TF_IDF_vectorizer_desc
    models_dict['LDA_desc'] = lda_desc

    print 'Creating probability list for "features"...'
    models_dict['features_probability_list'], models_dict['features_list'] = features_probability(train_df)
    
    print 'Encoding features for training data...'
    # Called without training=True, so add_feature_prob_columns uses its
    # uniform class prior here.
    train_df = add_feature_prob_columns(train_df, models_dict['features_probability_list'],
                                        models_dict['features_list'])
    
    all_features.extend(['high_prob_features', 'medium_prob_features'] + 
                        ['feat'+str(i) for i in range(n_features)])

    
    for field in prob_trans_features:
        print 'Transorming "%s"...'%field
        # The encoding fitted on ALL training rows is stored for the test
        # data; the training rows themselves get out-of-fold values.
        models_dict[field+'_trans_df'] = transform_categerical_feature(train_df, field, high_prior_prob, 
                                                                       medium_prior_prob)

        train_df = add_prob_training(train_df, high_prior_prob, medium_prior_prob, field)
        all_features.extend(['high_prob_'+field, 'medium_prob_'+field])
    
    print 'Encoding categorical features...'
    for field in label_encode_features:
        # Encoders are fitted on train + test values so that transforming
        # the test data later cannot hit an unseen label.
        models_dict[field + '_label_encoder'] = LabelEncoder()
        models_dict[field + '_label_encoder'] = models_dict[field + '_label_encoder'].fit(list(train_df[field].values) +
                                                                                       list(test_df[field].values))
        train_df[field + '_encoded'] = models_dict[field + '_label_encoder'].transform(train_df[field].values)
        all_features.append(field + '_encoded')
    
    return train_df

In [12]:
# Prepare variables for testing
def prepare_data(test_df):
    """Build the model features for unseen data.

    Mirrors the feature steps of ``prepare_data_first`` but fits nothing:
    the address coding list, TF-IDF/LDA models, feature probability tables,
    per-field probability tables and label encoders are all read from the
    module-level ``models_dict``, so ``prepare_data_first`` must have run
    first.

    Returns the transformed DataFrame. It is a new object, because the merge
    steps below rebuild the frame.
    """
    global models_dict
        
    print 'Extracting numerical features...'
    # Ratios fall back to the numerator when the denominator is zero.
    test_df.loc[:, 'price_per_bedroom'] = test_df.apply(lambda r: r['price'] / float(r['bedrooms']) 
                                               if r['bedrooms']>0 else r['price'], axis=1)

    test_df.loc[:, 'bedroom_per_bathroom'] = test_df.apply(lambda r: r['bedrooms'] / float(r['bathrooms']) 
                                               if r['bathrooms']>0 else r['bedrooms'], axis=1)

    test_df.loc[:, 'price_per_all_rooms'] = test_df.apply(lambda r: r['price'] / float(r['bathrooms']+r['bedrooms']) 
                                               if r['bedrooms']+r['bathrooms']>0 else r['price'], axis=1)

    test_df.loc[:,'desc_length'] = test_df.description.apply(lambda x: len(x.split()))
    test_df.loc[:,'features_length'] = test_df.features.apply(len)
    test_df.loc[:,'photos_no'] = test_df.photos.apply(len)

    test_df.loc[:,'created'] = pd.to_datetime(test_df.loc[:, 'created'])
    test_df.loc[:,'day'] = test_df.created.apply(lambda d: d.day)
    test_df.loc[:,'hour'] = test_df.created.apply(lambda d: d.hour)
    
    print 'Encoding and extracting features from "display address"...'
    test_df['disp_addr'] = test_df.display_address.apply(lambda r : code_addr(r, models_dict['addr_coding_list']))
    
    print 'Adding location features...'
    # The per-area statistics are computed from the frame passed in, i.e.
    # from the test rows themselves.
    test_df = add_location_features(test_df)
    
    print 'Pairing features...'
    # Price bucketed to units of 100 (rounded up), used to build the
    # combined categorical keys below.
    test_df['t_price'] = np.ceil(test_df.price / 100)
    
    test_df['manager_id_area'] = test_df.manager_id + '-' + test_df.area.astype('str') 
    test_df['manager_id_building_id'] = test_df.manager_id + '-' + test_df.building_id
    test_df['manager_id_display_address'] = test_df.manager_id + '-' + test_df.display_address
    test_df['manager_id_price'] = test_df.manager_id + '-' + test_df.t_price.astype('str')
    
    test_df['disp_addr_price'] = test_df.disp_addr.astype('str') + '-' + test_df.t_price.astype('str')
    test_df['display_address_price'] = test_df.display_address + '-' + test_df.t_price.astype('str')
    test_df['area_price'] = test_df.area.astype('str') + '-' + test_df.t_price.astype('str')
    test_df['building_id_price'] = test_df.building_id + '-' + test_df.t_price.astype('str')
    
    test_df['manager_id_disp_addr_price'] = test_df.manager_id + '-' + test_df.disp_addr.astype('str') + '-' + test_df.t_price.astype('str')
    test_df['manager_id_building_id_price'] = test_df.manager_id + '-' + test_df.building_id + '-' + test_df.t_price.astype('str')
    test_df['manager_id_area_price'] = test_df.manager_id + '-' + test_df.area.astype('str') + '-' + test_df.t_price.astype('str')
    
    print 'Encoding features probability...'
    test_df = add_feature_prob_columns(test_df,  models_dict['features_probability_list'], 
                                       models_dict['features_list'])
    
    print 'Extracting TFIDF features from "description"...'
    # transform() only: the vectorizer and the LDA model were fitted on the
    # training descriptions.
    desc = models_dict['TFIDF_desc'].transform(test_df.description)
    
    desc_coded = models_dict['LDA_desc'].transform(desc)
    
    for col_idx in range(n_desc_topics):
        test_df.loc[:, 'desc'+ str(col_idx)] = desc_coded[:, col_idx]
    
    for field in prob_trans_features:
        print 'Transorming "%s"...'%field
        # Categories missing from the stored table get random probabilities
        # inside add_prob_columns.
        test_df = add_prob_columns(test_df, models_dict[field + '_trans_df'], field)
    
    print 'Encoding categorical features...'
    for field in label_encode_features:
        test_df[field + '_encoded'] = models_dict[field + '_label_encoder'].transform(test_df[field].values)
    

    return test_df

In [13]:
# Oversampling and Undersampling to balance the data
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import TomekLinks

def oversample_bin(df, features, target, ratio):
    """Oversample a binary ``target`` with ADASYN.

    The ``features`` columns plus ``weight`` are resampled together. The
    columns named in ``int_features`` are rounded afterwards; they must all
    be part of ``features``.

    Returns a new DataFrame with ``features + ['weight']`` and ``target``.
    """
    columns = features + ['weight']

    sampler = ADASYN(ratio=ratio, random_state=np.random.randint(1000))
    sampled_x, sampled_y = sampler.fit_sample(df[columns].values, df[target].values)

    new_df = pd.DataFrame(sampled_x, columns=columns)
    new_df[target] = sampled_y

    feat_flags = ['feat'+str(i) for i in range(n_features)]
    int_features = ['bathrooms', 'bedrooms', 'desc_length', 'features_length',
                    'photos_no', 'day', 'hour'] + feat_flags

    for name in int_features:
        new_df.loc[:, name] = np.round(new_df.loc[:, name].values)

    return new_df


def undersample_random(df, features, target, ratio):
    """Randomly undersample a binary ``target`` to the given ``ratio``.

    Returns a new DataFrame with ``features + ['weight']`` and ``target``.
    """
    columns = features + ['weight']

    sampler = RandomUnderSampler(ratio=ratio, random_state=np.random.randint(1000))
    sampled_x, sampled_y = sampler.fit_sample(df[columns].values, df[target].values)

    new_df = pd.DataFrame(sampled_x, columns=columns)
    new_df[target] = sampled_y
    return new_df

def undersample_tomek(df, features, target):
    """Undersample a binary ``target`` with the TomekLinks sampler.

    Returns a new DataFrame with ``features + ['weight']`` and ``target``.
    """
    columns = features + ['weight']

    sampler = TomekLinks(random_state=np.random.randint(1000))
    # Unlike the other samplers, the feature frame is passed as a DataFrame
    # rather than as a plain array.
    sampled_x, sampled_y = sampler.fit_sample(df[columns], df[target].values)

    new_df = pd.DataFrame(sampled_x, columns=columns)
    new_df[target] = sampled_y
    return new_df


In [14]:
def make_prediction(in_data, out_data, model, prefix):
    """Store ``model``'s class probabilities for ``in_data`` in ``out_data``.

    One column is written per entry of ``model.classes_``, named
    ``prefix + str(class_label)``. ``out_data`` is modified in place and
    also returned.
    """
    probabilities = model.predict_proba(in_data)

    # Column i of predict_proba belongs to model.classes_[i].
    for position, label in enumerate(model.classes_):
        out_data.loc[:, prefix + str(label)] = probabilities[:, position]

    return out_data

In [15]:
# Load the train/test JSON from their zip archives. Each archive is opened
# in a `with` block so its file handle is closed once the frame is parsed
# (the original left both archives open).
with ZipFile('./train.json.zip') as f:
    data = pd.read_json(f.open(f.filelist[0]))

with ZipFile('./test.json.zip') as f:
    test = pd.read_json(f.open(f.filelist[0]))

print('Training size: %s' % (data.shape,))
print('Test size: %s' % (test.shape,))

Training size: (49352, 15)
Test size: (74659, 14)


In [16]:
# Build every training feature and fit the helpers stored in models_dict.
# `test` is passed as well because its addresses and categorical values are
# used when the address coding list and the label encoders are built.
# Note: this rebinds `data`, so the cell is not meant to be run twice.
data = prepare_data_first(data, test)

Extracting numerical features...
Building coding list for display address...
Encoding and extracting features from "display address"...
Adding location features...
Pairing features...
Extracting TFIDF features from "description"...
Creating probability list for "features"...
Encoding features for training data...
Transorming "disp_addr"...
Transorming "building_id"...
Transorming "manager_id"...
Transorming "area"...
Transorming "manager_id_area"...
Transorming "area_price"...
Encoding categorical features...


In [17]:
# Show which fitted artifacts prepare_data_first stored for test-time use.
models_dict.keys()

['disp_addr_trans_df',
 'features_probability_list',
 'street_address_label_encoder',
 'manager_id_trans_df',
 'building_id_label_encoder',
 'area_price_trans_df',
 'features_list',
 'LDA_desc',
 'TFIDF_desc',
 'manager_id_area_trans_df',
 'addr_coding_list',
 'manager_id_label_encoder',
 'building_id_trans_df',
 'area_trans_df']

In [18]:
# Stacked model. Converting the problem to binary and coding it in different ways.

class XGB_Bin_OvR:
    """Nine binary one-vs-rest XGBoost models for the stacked ensemble.

    The three classes (high / medium / low) are each modelled three times,
    once per resampling strategy:

      flag 0 (m0-m2): Tomek-links undersampling
      flag 1 (m3-m5): random undersampling
      flag 2 (m6-m8): ADASYN oversampling

    Within each group of three the targets are "is high", "is medium" and
    "is neither" (in that order).
    """

    def __init__(self):
        self.n_codes = 9
        self.models = {}
        # (target column, model name, resampling flag). Stored as a real
        # list because it is iterated again on every fit and predict call.
        self.fields_and_models = list(zip(['m'+str(i) for i in range(self.n_codes)],
                                          ['model'+str(i) for i in range(self.n_codes)],
                                          [0, 0, 0, 1, 1, 1, 2, 2, 2]))

    def fit_one_fold(self, data):
        """Fit all nine models on ``data``.

        Adds the binary target columns ``m0`` ... ``m8`` to ``data`` and
        replaces every entry of ``self.models``. ``data`` must contain the
        ``all_features`` columns, ``interest_level`` and ``weight``.
        """
        code_columns = ['m'+str(i) for i in range(self.n_codes)]
        high_code = [1, 0, 0, 1, 0, 0, 1, 0, 0]
        medium_code = [0, 1, 0, 0, 1, 0, 0, 1, 0]
        low_code = [0, 0, 1, 0, 0, 1, 0, 0, 1]

        # Same coding the original produced row by row with apply(): 'high'
        # and 'medium' get their own code word, anything else gets the 'low'
        # one.
        is_high = (data.interest_level == 'high').values
        is_medium = (data.interest_level == 'medium').values
        for col, h_bit, m_bit, l_bit in zip(code_columns, high_code, medium_code, low_code):
            data[col] = np.where(is_high, h_bit, np.where(is_medium, m_bit, l_bit))

        for cls, model_name, trans_flag in self.fields_and_models:
            print('Training: %s using %d...' % (model_name, trans_flag))

            # Minority/majority size ratio of the current binary target.
            n1 = len(data.loc[data[cls] == 1])
            n2 = len(data) - n1
            r = min([n1, n2]) /float(max([n1, n2]))

            if trans_flag == 0:
                in_data = undersample_tomek(data, all_features, cls)

            elif trans_flag == 1:
                r *= np.random.uniform(1.5, 3) if r < 0.334 else 1.0
                r = max(r, 0.5)  # never request a ratio below 0.5
                in_data = undersample_random(data, all_features, cls, r)

            elif trans_flag == 2:
                r *= (1.5 + np.random.uniform(0, 0.3))
                r = min(r, 0.5)  # never request a ratio above 0.5
                in_data = oversample_bin(data, all_features, cls, r)

            model = XGBClassifier(reg_alpha=0.5, reg_lambda=0.25, #scale_pos_weight=w,
                                  n_estimators=1000, max_depth=4, subsample=0.75, max_delta_step=1,
                                  learning_rate=0.05, colsample_bytree=0.75, objective='binary:logistic')

            self.models[model_name] = model.fit(in_data[all_features].values, in_data[cls].values,
                                                eval_metric='logloss', sample_weight=in_data.weight.values)

    def predict(self, data):
        """Return a DataFrame of class probabilities from every fitted model.

        Columns are named ``'<target>_<class>'``, e.g. ``m0_0`` and ``m0_1``.
        """
        proba_df = pd.DataFrame()
        model_input = data[all_features].values
        for col_name, model_name, _ in self.fields_and_models:
            proba_df = make_prediction(model_input, proba_df, self.models[model_name], col_name + '_')
        return proba_df

    def fit_predict(self, data, kfold):
        """Produce out-of-fold predictions, then refit on all of ``data``.

        Parameters
        ----------
        kfold : dict whose values are ``(train_positions, test_positions)``
            pairs of positional indices into ``data``.

        Returns the out-of-fold probability frames concatenated in fold
        order, each row carrying its true ``interest_level``.
        """
        fold_predictions = []
        for i, (itrain, itest) in enumerate(kfold.values()):
            print('Training fold %d ...' % i)
            self.fit_one_fold(data.iloc[itrain])
            x = self.predict(data.iloc[itest])
            # Assign by POSITION. `x` has its own index, so assigning the
            # Series directly would align on index labels and attach wrong
            # or missing labels.
            x['interest_level'] = data.iloc[itest]['interest_level'].values
            fold_predictions.append(x)
            print('--------------------------------------------------')

        self.fit_one_fold(data)
        return pd.concat(fold_predictions) if fold_predictions else pd.DataFrame()


In [19]:
class XGB_Bin_OvO:
    """One-vs-one stage: one binary XGBoost model per pair of interest levels.

    Each model is trained on the rows that remain after dropping one
    ``interest_level`` value and predicts the paired ``is_*`` indicator column
    (the indicator columns are expected to exist in the training frame).
    Relies on the notebook-level ``all_features``, ``SMOTE_tomek`` and
    ``make_prediction``.
    """

    def __init__(self):
        # model name ('not_<excluded level>') -> fitted XGBClassifier
        self.models = {}
        # (level to drop, binary target column) pairs.  Stored as a list so it
        # can be iterated on every fit/predict call (a bare zip() is a
        # single-use iterator on Python 3).
        self.exclude_list = list(zip(['low', 'high', 'medium'],
                                     ['is_high', 'is_medium', 'is_low']))

    def fit_one_fold(self, data):
        """Fit the three pairwise models on ``data`` (replaces earlier fits)."""
        for exclude, cls in self.exclude_list:
            model_name = 'not_' + exclude
            print('Training: %s...' % model_name)

            # Keep only the two levels this model discriminates between.
            in_data = data.loc[data.interest_level != exclude]

            # Minority/majority ratio of the binary target, passed to the resampler.
            n1 = len(in_data.loc[in_data[cls] == 1])
            n2 = len(in_data) - n1
            r = min([n1, n2]) / float(max([n1, n2]))

            # SMOTE_tomek is defined elsewhere in the notebook; presumably it
            # rebalances the two classes -- verify against its definition.
            in_data = SMOTE_tomek(in_data, all_features, cls, r)

            self.models[model_name] = XGBClassifier(reg_alpha=0.5, reg_lambda=0.25,  # scale_pos_weight=w,
                                                    n_estimators=1000, max_depth=4, subsample=0.75,
                                                    max_delta_step=1, learning_rate=0.05,
                                                    colsample_bytree=0.75, objective='binary:logistic')

            self.models[model_name] = self.models[model_name].fit(in_data[all_features].values,
                                                                  in_data[cls].values,
                                                                  eval_metric='logloss')

    def predict(self, data):
        """Run every pairwise model on ``data[all_features]``.

        Each model's output is added by ``make_prediction`` under the column
        prefix ``'not_<excluded level>_'``.
        """
        proba_df = pd.DataFrame()
        for exclude, cls in self.exclude_list:
            model_input4 = data[all_features].values
            proba_df = make_prediction(model_input4, proba_df, self.models['not_' + exclude],
                                       'not_' + exclude + '_')
        return proba_df

    def fit_predict(self, data, kfold):
        """Produce out-of-fold predictions, then refit on the full data.

        ``kfold`` maps a fold id to an ``(itrain, itest)`` pair of positional
        row indices.  Returns the held-out predictions of every fold stacked
        in fold order with the matching ``interest_level`` labels attached
        (an empty frame if ``kfold`` has no folds).
        """
        fold_frames = []
        for fold_no, (itrain, itest) in enumerate(kfold.values()):
            print('Training fold %d ...' % fold_no)
            self.fit_one_fold(data.iloc[itrain])
            held_out = data.iloc[itest]
            x = self.predict(held_out)
            # predict() works on a bare ``.values`` array, so its rows match
            # ``held_out`` by position only; attach the labels positionally
            # rather than letting pandas align on index labels.
            x['interest_level'] = held_out['interest_level'].values
            fold_frames.append(x)
            print('--------------------------------------------------')

        # Leave the models trained on every row for later predict() calls.
        self.fit_one_fold(data)
        # One concat instead of repeated DataFrame.append (quadratic copying).
        return pd.concat(fold_frames) if fold_frames else pd.DataFrame()


In [20]:
class XGB_Multi:
    """Single multi-class XGBoost model over ``interest_level``.

    Trains on the notebook-level ``all_features`` columns, weighting rows by
    the ``weight`` column of the training frame.
    """

    def __init__(self):
        # Kept for backward compatibility; the methods below use ``self.model``.
        self.models = None
        # Fitted classifier; None until fit_one_fold() has run.
        self.model = None
        self.comb_features = None

    def fit_one_fold(self, data):
        """Fit the classifier on ``data`` (replaces any earlier fit)."""
        # Recorded for inspection only: training below uses ``all_features``.
        self.comb_features = list(data.columns)
        self.comb_features.remove('interest_level')

        self.model = XGBClassifier(reg_alpha=0.5, reg_lambda=0.25,  # scale_pos_weight=w,
                                   n_estimators=2000, max_depth=4, subsample=0.75, max_delta_step=1,
                                   learning_rate=0.03, colsample_bytree=0.75, objective='multi:softprob')
        self.model = self.model.fit(data[all_features].values, data.interest_level.values,
                                    eval_metric='mlogloss', sample_weight=data.weight.values)

    def predict(self, data):
        """Return the frame ``make_prediction`` builds for ``data[all_features]`` (no column prefix)."""
        proba_df = pd.DataFrame()
        model_input4 = data[all_features].values
        proba_df = make_prediction(model_input4, proba_df, self.model, '')
        return proba_df

    def fit_predict(self, data, kfold):
        """Produce out-of-fold predictions, then refit on the full data.

        ``kfold`` maps a fold id to an ``(itrain, itest)`` pair of positional
        row indices.  Returns the held-out predictions of every fold stacked
        in fold order with the matching ``interest_level`` labels attached
        (an empty frame if ``kfold`` has no folds).
        """
        fold_frames = []
        for fold_no, (itrain, itest) in enumerate(kfold.values()):
            print('Training fold %d ...' % fold_no)
            self.fit_one_fold(data.iloc[itrain])
            held_out = data.iloc[itest]
            x = self.predict(held_out)
            # predict() works on a bare ``.values`` array, so its rows match
            # ``held_out`` by position only; attach the labels positionally
            # rather than letting pandas align on index labels.
            x['interest_level'] = held_out['interest_level'].values
            fold_frames.append(x)

        # Leave the model trained on every row for later predict() calls.
        self.fit_one_fold(data)
        # One concat instead of repeated DataFrame.append (quadratic copying).
        return pd.concat(fold_frames) if fold_frames else pd.DataFrame()


In [21]:
class Comb_RF:
    """Random-forest combiner trained on the outputs of earlier models.

    Every column of the training frame except ``interest_level`` is used as a
    feature; the same column names are selected again at prediction time.
    """

    def __init__(self):
        # Kept for backward compatibility; the methods below use ``self.model``.
        self.models = None
        # Fitted forest; None until fit_one_fold() has run.
        self.model = None
        self.comb_features = None

    def fit_one_fold(self, data):
        """Fit the forest on ``data`` (replaces any earlier fit)."""
        # Imported here because the notebook's visible import cell does not
        # bring RandomForestClassifier into scope.
        from sklearn.ensemble import RandomForestClassifier

        self.comb_features = list(data.columns)
        self.comb_features.remove('interest_level')

        # 'sqrt' is what 'auto' meant for classifiers; 'auto' is no longer
        # accepted by recent scikit-learn releases.
        self.model = RandomForestClassifier(max_depth=6, n_estimators=300, max_features='sqrt')
        self.model = self.model.fit(data[self.comb_features].values, data.interest_level.values)

    def predict(self, data):
        """Return the frame ``make_prediction`` builds for the stored feature columns (no prefix)."""
        proba_df = pd.DataFrame()
        model_input4 = data[self.comb_features].values
        proba_df = make_prediction(model_input4, proba_df, self.model, '')
        return proba_df

    def fit_predict(self, data, kfold):
        """Produce out-of-fold predictions, then refit on the full data.

        ``kfold`` maps a fold id to an ``(itrain, itest)`` pair of positional
        row indices.  Returns the held-out predictions of every fold stacked
        in fold order with the matching ``interest_level`` labels attached
        (an empty frame if ``kfold`` has no folds).
        """
        fold_frames = []
        for fold_no, (itrain, itest) in enumerate(kfold.values()):
            print('Training fold %d ...' % fold_no)
            self.fit_one_fold(data.iloc[itrain])
            held_out = data.iloc[itest]
            x = self.predict(held_out)
            # predict() works on a bare ``.values`` array, so its rows match
            # ``held_out`` by position only; attach the labels positionally
            # rather than letting pandas align on index labels.
            x['interest_level'] = held_out['interest_level'].values
            fold_frames.append(x)

        # Leave the forest trained on every row for later predict() calls.
        self.fit_one_fold(data)
        # One concat instead of repeated DataFrame.append (quadratic copying).
        return pd.concat(fold_frames) if fold_frames else pd.DataFrame()


In [22]:
# Build the fold dictionary shared by every fit_predict call below:
# fold number -> (train positions, test positions).
skf = StratifiedKFold(n_splits=5)
# A zero vector is passed as the feature argument; the folds are stratified
# on interest_level.
placeholder_X = np.zeros(len(data))
kfold = dict(enumerate(skf.split(placeholder_X, data['interest_level'])))

In [23]:
def _binary_stage_weight(level):
    """Row weight by interest level for the binary-model stage.

    Only 'high' rows get the random jitter: in the original one-line
    conditional the ``uniform(...) + 2.5`` sum is the 'high' branch as a whole.
    """
    if level == 'high':
        return np.random.uniform(-0.2, 0.2) + 2.5
    if level == 'medium':
        return 1.5
    return 1.1


data['weight'] = data.interest_level.apply(_binary_stage_weight)

In [24]:
# Stage 1a: binary models, out-of-fold predictions for the stacker.
xgb_bin_ovr = XGB_Bin_OvR()
xgb_bin_ovr_out = xgb_bin_ovr.fit_predict(data=data, kfold=kfold)
print('Done.')

Training fold 0 ...
Training: model0 using 0...
Training: model1 using 0...
Training: model2 using 0...
Training: model3 using 1...
Training: model4 using 1...
Training: model5 using 1...
Training: model6 using 2...
Training: model7 using 2...
Training: model8 using 2...
--------------------------------------------------
Training fold 1 ...
Training: model0 using 0...
Training: model1 using 0...
Training: model2 using 0...
Training: model3 using 1...
Training: model4 using 1...
Training: model5 using 1...
Training: model6 using 2...
Training: model7 using 2...
Training: model8 using 2...
--------------------------------------------------
Training fold 2 ...
Training: model0 using 0...
Training: model1 using 0...
Training: model2 using 0...
Training: model3 using 1...
Training: model4 using 1...
Training: model5 using 1...
Training: model6 using 2...
Training: model7 using 2...
Training: model8 using 2...
--------------------------------------------------
Training fold 3 ...
Training: m

In [25]:
# Stage 2a: random-forest combiner over the XGB_Bin_OvR out-of-fold predictions.
comb_ovr = Comb_RF()
ovr_out = comb_ovr.fit_predict(data=xgb_bin_ovr_out, kfold=kfold)

Training fold 0 ...
Training fold 1 ...
Training fold 2 ...
Training fold 3 ...
Training fold 4 ...


In [26]:
# Stage 1b: pairwise binary models, out-of-fold predictions for the stacker.
xgb_bin_ovo = XGB_Bin_OvO()
xgb_bin_ovo_out = xgb_bin_ovo.fit_predict(data=data, kfold=kfold)
print('Done.')

Training fold 0 ...
Training: not_low...
Training: not_high...
Training: not_medium...
--------------------------------------------------
Training fold 1 ...
Training: not_low...
Training: not_high...
Training: not_medium...
--------------------------------------------------
Training fold 2 ...
Training: not_low...
Training: not_high...
Training: not_medium...
--------------------------------------------------
Training fold 3 ...
Training: not_low...
Training: not_high...
Training: not_medium...
--------------------------------------------------
Training fold 4 ...
Training: not_low...
Training: not_high...
Training: not_medium...
--------------------------------------------------
Training: not_low...
Training: not_high...
Training: not_medium...
Done.


In [27]:
# Stage 2b: random-forest combiner over the XGB_Bin_OvO out-of-fold predictions.
comb_ovo = Comb_RF()
ovo_out = comb_ovo.fit_predict(data=xgb_bin_ovo_out, kfold=kfold)

Training fold 0 ...
Training fold 1 ...
Training fold 2 ...
Training fold 3 ...
Training fold 4 ...


In [28]:
def _multi_stage_weight(level):
    """Row weight by interest level for the multi-class model stage.

    Only 'high' rows get the random jitter: in the original one-line
    conditional the ``uniform(...) + 1.5`` sum is the 'high' branch as a whole.
    """
    if level == 'high':
        return np.random.uniform(-0.2, 0.2) + 1.5
    if level == 'medium':
        return 1.2
    return 1.1


# Overwrites the weights set for the earlier stage.
data['weight'] = data.interest_level.apply(_multi_stage_weight)

In [29]:
# Stage 1c: multi-class model, out-of-fold predictions for the stacker.
xgb_multi = XGB_Multi()
xgb_multi_out = xgb_multi.fit_predict(data=data, kfold=kfold)
print('Done.')

Training fold 0 ...
Training fold 1 ...
Training fold 2 ...
Training fold 3 ...
Training fold 4 ...
Done.


In [30]:
def change_col_name(l, suffix):
    """Return a copy of ``l`` with ``suffix`` appended to the class-named entries.

    Entries equal to 'high', 'low' or 'medium' get the suffix; every other
    entry is passed through unchanged.
    """
    class_names = ('high', 'low', 'medium')
    renamed = []
    for name in l:
        if name in class_names:
            renamed.append(name + suffix)
        else:
            renamed.append(name)
    return renamed

# All three frames carry the target; keep only the copy inside ovr_out.
del xgb_multi_out['interest_level']
del ovo_out['interest_level']

# fit_predict stacks the held-out folds in kfold order, so row j of a frame
# that went through it once (xgb_multi_out) is data row oof_pos[j].
# ovr_out / ovo_out went through fit_predict twice with the same kfold (base
# models, then Comb_RF), so their row j is data row oof_pos[oof_pos[j]].
# Re-order the multi-class frame the same way so that every row of comb_in --
# including the interest_level label taken from ovr_out -- describes one listing.
oof_pos = np.concatenate([itest for _, itest in kfold.values()])
xgb_multi_aligned = xgb_multi_out.iloc[oof_pos]

# Indexes are dropped so concat pastes the frames side by side by position
# instead of trying to align their (repeated per-fold) index labels.
# NOTE(review): the three frames appear to share the column names
# high/low/medium, so comb_in ends up with duplicate column labels;
# change_col_name is defined but never applied -- confirm this is intended.
stack_parts = [ovr_out.reset_index(drop=True),
               ovo_out.reset_index(drop=True),
               xgb_multi_aligned.reset_index(drop=True)]
comb_in = pd.concat(stack_parts, axis=1)

In [31]:
# Final stacker: one random forest over the combined second-level outputs,
# fitted once on the whole frame (no cross-validation at this level).
comb_model = Comb_RF()
comb_model.fit_one_fold(data=comb_in)

In [32]:
# Run the test set through prepare_data and report the model-input shape.
d2 = prepare_data(test)
test_matrix_shape = d2[all_features].shape
print(test_matrix_shape)

Extracting numerical features...
Encoding and extracting features from "display address"...
Adding location features...
Pairing features...
Encoding features probability...
Extracting TFIDF features from "description"...
Transorming "disp_addr"...
Transorming "building_id"...
Transorming "manager_id"...
Transorming "area"...
Transorming "manager_id_area"...
Transorming "area_price"...
Encoding categorical features...
(74659, 114)


In [33]:
# First-level predictions on the prepared test frame, each passed straight
# through its own random-forest combiner.
ovr_prob_df = comb_ovr.predict(xgb_bin_ovr.predict(d2))
ovo_prob_df = comb_ovo.predict(xgb_bin_ovo.predict(d2))

# The multi-class model has no combiner of its own.
multi_prob_df = xgb_multi.predict(d2)

# Same side-by-side layout the final stacker was fitted on.
stacked_inputs = [ovr_prob_df, ovo_prob_df, multi_prob_df]
comb_in = pd.concat(stacked_inputs, axis=1)

prob_df = comb_model.predict(comb_in)

# Predictions are positional, so the ids are attached as a plain array.
prob_df["listing_id"] = d2.listing_id.values

In [35]:
# Write the submission file: listing id followed by the three class probabilities.
submission_columns = ['listing_id', 'high', 'low', 'medium']
prob_df[submission_columns].to_csv('Submission.csv', index=False)