In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import os
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict, KFold
from sklearn.pipeline import make_pipeline
from pylab import rcParams
from sklearn.metrics import confusion_matrix
import pyltr

#import data_preprocessing
%matplotlib inline
# Shared plot styling: 14pt axis labels, titles, body text and legend; 13pt x ticks, 14pt y ticks.
# 'font.size' replaces the legacy 'text.fontsize' key: current matplotlib rejects unknown
# rcParams keys with a KeyError, which made this whole cell fail.
params = {'axes.labelsize': 14, 'axes.titlesize': 14, 'font.size': 14, 'legend.fontsize': 14,
          'xtick.labelsize': 13, 'ytick.labelsize': 14}

# Default figure size (width, height) in inches.
rcParams['figure.figsize'] = 6.5, 4

matplotlib.rcParams.update(params)

In [None]:
#Training file
# The location can be overridden through the VU_DM_TRAINING_CSV environment variable so the
# notebook also runs on other machines; when it is not set, the original local path is used.
training_csv_path = os.environ.get(
    'VU_DM_TRAINING_CSV',
    'C:/Users/John/Desktop/DM/Data Mining VU data/training_set_VU_DM_2014.csv')
data = pd.read_csv(training_csv_path)

In [None]:
# Only the first 5000 rows of `data` are used, cut by row position into 3500 / 1000 / 500 rows.
# The boundaries are contiguous: the previous slices started at 3501 and 4501 and therefore
# silently dropped rows 3500 and 4500.
# Each subset is an independent copy so that adding feature columns later does not write
# into a slice of `data` (which triggers pandas' SettingWithCopyWarning).
# NOTE(review): cutting by row position can place rows of one srch_id in two different
# subsets; consider cutting on srch_id boundaries instead.

#Training dataset - 70%
trainset = data.iloc[:3500].copy()

#Validation dataset - 20%
valset = data.iloc[3500:4500].copy()

#Test dataset - 10%
testset = data.iloc[4500:5000].copy()

In [None]:
#Efi's code for data cleaning

def remove_outliers(df):
    """Placeholder for outlier removal; currently hands `df` back untouched.

    Planned approach (not implemented yet): filter on standard deviation when a
    column is normally distributed, and on percentiles otherwise.
    """
    return df


def convert_type(df):
    """Placeholder for dtype conversion; currently returns `df` exactly as received."""
    return df


def create_composite_features(df):
    """Add per-search rank features and a value-for-money ratio to `df`.

    The new columns are written onto `df` itself (no copy is made) and the same
    object is returned.

    Columns added
    -------------
    price_rank : dense rank of `price_usd` within each `srch_id` (lowest price = 1).
    star_rank : dense rank of `prop_starrating` within each `srch_id` (lowest rating = 1).
    value_for_money : `prop_review_score / price_usd`; an infinite ratio (division by a
        zero price) is stored as NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `srch_id`, `price_usd`, `prop_starrating` and
        `prop_review_score`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the three columns added.
    """
    # TODO: a season feature derived from `date_time` was sketched here but never enabled.

    # Rank within the same srch id
    df['price_rank'] = df.groupby('srch_id')['price_usd'].rank(method='dense')
    # This used to rank price_usd a second time, which made star_rank a duplicate of price_rank.
    # NOTE(review): assumes the star-rating column is named `prop_starrating` - confirm
    # against the CSV header.
    df['star_rank'] = df.groupby('srch_id')['prop_starrating'].rank(method='dense')

    # Review score per dollar. The earlier price/score assignment was overwritten
    # immediately and has been removed.
    score_per_dollar = df['prop_review_score'] / df['price_usd']
    # A zero price yields +/-inf; keep it as "missing" so later imputation can handle it.
    df['value_for_money'] = score_per_dollar.replace([np.inf, -np.inf], np.nan)

    return df


def normalize_within_group(df):
    """Min-max scale `price_usd` inside each `srch_id` group.

    Writes the result to a new `price_usd_normalized` column on `df` itself
    (no copy is made) and returns the same object.

    Within a search the lowest price maps to 0 and the highest to 1. When every
    price in a search is identical (including single-row searches) the range is
    zero; those rows get 0 instead of the NaN that 0/0 would produce. A missing
    `price_usd` stays NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns `srch_id` and `price_usd`.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with `price_usd_normalized` added.
    """
    prices_by_search = df.groupby('srch_id')['price_usd']
    # Built-in aggregations broadcast back to row level; no Python lambda per group.
    lowest = prices_by_search.transform('min')
    price_range = prices_by_search.transform('max') - lowest

    # Divide by 1 where the range is zero so that constant-price groups come out as 0.
    safe_range = price_range.where(price_range != 0, 1)
    df['price_usd_normalized'] = (df['price_usd'] - lowest) / safe_range
    return df


def missing_values(df):
    """Placeholder for missing-value handling; currently returns `df` unchanged.

    Separate treatment of continuous and discrete columns is planned but not
    implemented yet.
    """
    return df

In [None]:
#Using Efi's code for data cleaning for each datasubset
# The two steps are chained so the second one receives the first one's return value;
# previously it was handed the original subset again and only worked because both
# functions happen to modify their argument in place.
# Each subset is copied first so the raw splits stay untouched and the cell can be
# re-run safely.

train = normalize_within_group(create_composite_features(trainset.copy()))

val = normalize_within_group(create_composite_features(valset.copy()))

test = normalize_within_group(create_composite_features(testset.copy()))

In [None]:
##Unorthodox Implementation - vectors or features need to be reduced/rearranged

#Select the same features from each dataset
selected_features = ['value_for_money','price_usd_normalized', 'star_rank']
features_to_keep = ['booking_bool', 'srch_id', 'click_bool', 'prop_id']
all_possible_features = features_to_keep + selected_features

#lambdaMART wants the target variable and the IDs in the first and second column respectively,
#followed by the features
ranking_columns = ['relevance', 'srch_id', 'prop_id'] + selected_features


def _build_ranking_frame(cleaned):
    """Return a new frame holding the relevance target, the IDs and the selected features.

    Relevance = booking + clicking, i.e. `booking_bool + click_bool`. The two source
    columns are not part of the result because the target replaces them.

    Parameters
    ----------
    cleaned : pandas.DataFrame
        Must contain every column listed in `all_possible_features`.

    Returns
    -------
    pandas.DataFrame
        Independent copy with the columns in `ranking_columns` order.
    """
    # .loc replaces the .ix indexer, which was removed in pandas 1.0; the copy keeps the
    # added column from being written into `cleaned`.
    frame = cleaned.loc[:, all_possible_features].copy()
    frame['relevance'] = frame['booking_bool'] + frame['click_bool']
    # Selecting ranking_columns is what drops booking_bool / click_bool (the earlier
    # .drop() calls discarded their result and had no effect).
    return frame[ranking_columns].copy()


df_to_train = _build_ranking_frame(train)
df_to_val = _build_ranking_frame(val)
df_to_test = _build_ranking_frame(test)

#Data cleaning
# Missing feature values are filled with the TRAINING medians in all three sets, so no
# statistic computed on validation/test data enters the pipeline. fillna replaces the
# chained-indexing assignment, which pandas does not guarantee to write through.
training_medians = df_to_train[selected_features].median()
for frame in (df_to_train, df_to_val, df_to_test):
    frame[selected_features] = frame[selected_features].fillna(training_medians)

In [None]:
#From dataframes to arrays
def _as_ranking_arrays(frame):
    """Split a ranking frame into (relevance, feature matrix, srch_id) numpy arrays."""
    return (
        np.array(frame['relevance']),
        np.array(frame[selected_features]),
        np.array(frame.srch_id),
    )


# Training arrays
trrelevance_arr, trfeature_arr, trid_arr = _as_ranking_arrays(df_to_train)

# Validation arrays
vrelevance_arr, vfeature_arr, vid_arr = _as_ranking_arrays(df_to_val)

# Test arrays
terelevance_arr, tefeature_arr, teid_arr = _as_ranking_arrays(df_to_test)

In [None]:
# Evaluation metric: nDCG with a cutoff of k=10 (the cutoff was picked arbitrarily).
metric = pyltr.metrics.NDCG(k=10)

# Validation monitor built on the validation arrays; stop_after=250 is likewise an
# arbitrary choice. It is only required when fitting with validation
# (early stopping & trimming).
monitor = pyltr.models.monitors.ValidationMonitor(
    vfeature_arr,
    vrelevance_arr,
    vid_arr,
    metric=metric,
    stop_after=250,
)

In [None]:
#Use lambdaMART - have to find the best values for the parameters
# Settings that were tried but are currently switched off:
#   max_features=0.5, query_subsample=0.5, max_leaf_nodes=10, min_samples_leaf=64
lambdamart_settings = {
    'metric': metric,
    'n_estimators': 1000,
    'learning_rate': 0.03,
    'verbose': 1,
}
model = pyltr.models.LambdaMART(**lambdamart_settings)

# Fit on the training arrays, passing the validation monitor along.
model.fit(trfeature_arr, trrelevance_arr, trid_arr, monitor=monitor)

In [None]:
# Score the held-out test rows with the fitted model.
tepred = model.predict(tefeature_arr)

# Compare the metric for a random ordering against the metric for the model's ordering.
random_score = metric.calc_mean_random(teid_arr, terelevance_arr)
print('Random ranking:', random_score)
model_score = metric.calc_mean(teid_arr, terelevance_arr, tepred)
print('Our model:', model_score)

# TODO: attach the search and property IDs to the predictions before exporting them.
tepred