In [None]:
pip install lightgbm

In [1]:
import lightgbm as lgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import gc
from sklearn.preprocessing import StandardScaler,OneHotEncoder,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score
from sklearn.impute import SimpleImputer

In [2]:
def replace_nan_with_mean(df,cols):
    """
    Completing missing numerical values (NaN) from columns. NaN values are replaced by the column's mean
    Args:
        @string_cols:  (list of column indexes) string columns to complete
    """
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [3]:
def replace_nan_with_median(df,cols):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='median')

    imp_mean = imp_mean.fit(df[cols])

    df[cols] = imp_mean.transform(df[cols])

    return df

In [4]:
def replace_nan_with_value(df,col,value):
    imp_mean = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=value)

    imp_mean = imp_mean.fit(df[[col]])

    df[col] = imp_mean.transform(df[[col]])

    return df

In [5]:
def country_score(df, p_id, c_id):
    score = 0 
    hotel_click_ratio = 0
    hotel_booking_ratio = 0
    
    # get the number of clicks and bookings for a specific hotel
    subset_hotel = df[df["prop_id"] == p_id]
    hotel_clicks = len(subset_hotel.loc[subset_hotel['click_bool'] == 1])
    hotel_bookings = len(subset_hotel.loc[subset_hotel['booking_bool'] == 1])
    
    # get the number of clicks and bookings for a specific country
    subset_country = df[df["prop_country_id"] == c_id]
    country_clicks = len(subset_country.loc[subset_country['click_bool'] == 1])
    country_bookings = len(subset_country.loc[subset_country['booking_bool'] == 1])
    
    # avoid devision by zero, compute the ratios 
    if hotel_clicks > 0:
        hotel_click_ratio = hotel_clicks / country_clicks
        
    if hotel_bookings > 0:
        hotel_booking_ratio = hotel_bookings / country_bookings
    
    # compute the country score
    score = (hotel_click_ratio + hotel_booking_ratio) / 2
    
    return score

In [6]:
def get_season(date_time):
    season = 0
    
    date = datetime.datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
    m = date.month
    # spring
    if m in [3,4,5]:
        season = 1
    # summer
    elif m in [6,7,8]:
        season = 2
    # autumn
    elif m in [9,10,11]:
        season = 3
    # winter
    elif m in [12,1,2]:
        season = 4
        
    return season

In [None]:
#test_data  = pd.read_csv("test_set_VU_DM.csv")

In [None]:
#training_data  = pd.read_csv("training_set_VU_DM.csv")

In [None]:
"""  THIS IS ALL REPLACED BY READING THE SAVED TRANSFORMED DATASET


#create new column for NDCG evaluation
training_data['target_col'] = training_data.apply(
    lambda row: (5 if row.booking_bool == 1 else (1 if row.click_bool == 1 else 0)), axis=1)

prop_counter = {}
for prop_id in training_data["prop_id"].unique():
    prop_counter[prop_id] = len(training_data.loc[training_data['prop_id'] == prop_id])
    
#create new column count_prop_id
training_data['count_prop_id'] = training_data.apply(
    lambda row: (prop_counter[row.prop_id]), axis=1)    

# save transformations
training_data.to_csv("training_set_TRANS.csv", index=False)    


"""

In [7]:
training_data = pd.read_csv("training_set_TRANS.csv")

In [None]:
# to libearete memory
#del prop_counter
gc.collect()

In [None]:
#create new colum country_score
training_data['country_score'] = training_data.apply(
    lambda row: country_score(training_data, row.prop_id, row.prop_country_id), axis=1)

In [None]:
aux_training

In [None]:
#create new colum season
training_data['season'] = training_data.apply(
    lambda row: get_season(row.date_time), axis=1)

In [9]:
#aux_training = copy.deepcopy(training_data)
aux_training = copy.copy(training_data) #shallow copy for now because of memory problems
#aux_training.head()

In [None]:
aux_training.head(50)

In [14]:
print("Empty values per column: \n",aux_training.isnull().sum())

Empty values per column: 
 srch_id                              0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating              0
visitor_hist_adr_usd                 0
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                 7364
prop_brand_bool                      0
prop_location_score1                 0
prop_location_score2           1090348
prop_log_historical_price            0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score      4640941
orig_destination_distance      1607782
random_bool                          

---------------------------------------
beginning of pre-processing and data engineering

In [10]:
aux_training = aux_training.drop('date_time', axis=1) # cause me dont care datee

In [11]:
##  drop columns that are not in test set (??? ) 
aux_training = aux_training.drop('click_bool', axis=1)   #replaced by target_col and not in test set 
aux_training = aux_training.drop('booking_bool', axis=1) #replaced by target_col and not in test set 

aux_training = aux_training.drop('position', axis=1)     # not in test set 
aux_training = aux_training.drop('gross_bookings_usd', axis=1) # not in test set


In [16]:
aux_training = replace_nan_with_mean(aux_training,["visitor_hist_starrating","visitor_hist_adr_usd"])
aux_training = replace_nan_with_median(aux_training,["orig_destination_distance"])

col_min = aux_training["srch_query_affinity_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'srch_query_affinity_score',col_min)

col_min = aux_training["prop_starrating"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_starrating',col_min)

col_min = aux_training["prop_review_score"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_review_score',col_min)

col_min = aux_training["prop_location_score1"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_location_score1',col_min)

col_min = aux_training["prop_location_score2"].min() # minimum of col
aux_training = replace_nan_with_value(aux_training,'prop_location_score2',col_min)

In [17]:
print("Empty values per column: \n",aux_training.isnull().sum())

Empty values per column: 
 srch_id                              0
site_id                              0
visitor_location_country_id          0
visitor_hist_starrating              0
visitor_hist_adr_usd                 0
prop_country_id                      0
prop_id                              0
prop_starrating                      0
prop_review_score                    0
prop_brand_bool                      0
prop_location_score1                 0
prop_location_score2                 0
prop_log_historical_price            0
price_usd                            0
promotion_flag                       0
srch_destination_id                  0
srch_length_of_stay                  0
srch_booking_window                  0
srch_adults_count                    0
srch_children_count                  0
srch_room_count                      0
srch_saturday_night_bool             0
srch_query_affinity_score            0
orig_destination_distance            0
random_bool                          

In [None]:
aux_training

In [18]:
feature = aux_training.drop('target_col', axis=1)
target  =  aux_training['target_col']

In [19]:
#X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(feature, target, test_size=0.2, random_state=1)

#print("X_train.shape = ",X_train.shape)
#print("X_test.shape = ",X_test.shape)
#print("\ny_train.shape = ",y_train.shape)
#print("y_test.shape = ",y_test.shape)

In [None]:
#X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

print("X_train.shape = ",X_train.shape)
print("X_val.shape = ",X_val.shape)
print("\ny_train.shape = ",y_train.shape)
print("y_val.shape = ",y_val.shape)

In [20]:
X_train = X_train.sort_index()
y_train = y_train.sort_index()

X_val = X_val.sort_index()
y_val = y_val.sort_index()

#X_test = X_test.sort_index()
#y_test = y_test.sort_index()

In [21]:
# get number of rows of each query (srch_id). Needed to train the model.
qids_train = X_train.groupby("srch_id")["srch_id"].count().to_numpy() 
qids_val   = X_val.groupby("srch_id")["srch_id"].count().to_numpy()
#qids_test   = X_test.groupby("srch_id")["srch_id"].count().to_numpy()


In [None]:
print("query train ids mean: ",qids_train.mean()) 
print("query val   ids mean: ",qids_val.mean()) 

In [None]:
y_test #---->> THIS ARE THE TRUE VALUES WE NEED TO COMPARE AFTER MAKING PREDICTION OVER X_TEST

In [24]:
FIXED_PARAMS={
              'objective'             : 'lambdarank',
              #'objective'               : 'rank_xendcg',
              'metric'                : 'ndcg',
              'boosting_type'         : 'gbdt',
              'is_unbalance'          : True,
              'num_iterations'        : 500,
              'early_stopping_rounds' : 50}

SEARCH_PARAMS = {'learning_rate': 0.15,
                 'max_depth': 15,
                 'num_leaves': 25,
                 'feature_fraction': 0.8,
                 'subsample': 0.2,
                 'n_estimators' : 5000}
#n_estimators = 5000

gbm = lgb.LGBMRanker( 
        objective               = FIXED_PARAMS["objective"],
        metric                  = FIXED_PARAMS["metric"],
        boosting_type           = FIXED_PARAMS["boosting_type"],
        is_unbalance            = FIXED_PARAMS["is_unbalance"],
        num_iterations          = FIXED_PARAMS["num_iterations"],
        early_stopping_rounds   = FIXED_PARAMS["early_stopping_rounds"],
        learning_rate           = SEARCH_PARAMS["learning_rate"],
        max_depth               = SEARCH_PARAMS["max_depth"],
        num_leaves              = SEARCH_PARAMS["num_leaves"],
        feature_fraction        = SEARCH_PARAMS["feature_fraction"],
        subsample               = SEARCH_PARAMS["subsample"],
        n_estimators            = SEARCH_PARAMS["n_estimators"],
)

# gbm = lgb.LGBMRanker( params )



In [None]:
len(X_train)

In [25]:
# Training
gbm.fit(X_train, 
        y_train, 
        group=qids_train,
        eval_set=[(X_val, y_val)], 
        eval_group=[qids_val],
        eval_at=[1,2,3,4,5]
)



[1]	valid_0's ndcg@1: 0.851223	valid_0's ndcg@2: 0.880659	valid_0's ndcg@3: 0.896812	valid_0's ndcg@4: 0.907198	valid_0's ndcg@5: 0.913645
[2]	valid_0's ndcg@1: 0.868405	valid_0's ndcg@2: 0.898582	valid_0's ndcg@3: 0.91399	valid_0's ndcg@4: 0.922419	valid_0's ndcg@5: 0.927539
[3]	valid_0's ndcg@1: 0.871936	valid_0's ndcg@2: 0.902034	valid_0's ndcg@3: 0.917125	valid_0's ndcg@4: 0.925202	valid_0's ndcg@5: 0.930005
[4]	valid_0's ndcg@1: 0.87353	valid_0's ndcg@2: 0.903476	valid_0's ndcg@3: 0.918364	valid_0's ndcg@4: 0.926278	valid_0's ndcg@5: 0.931078
[5]	valid_0's ndcg@1: 0.875028	valid_0's ndcg@2: 0.904944	valid_0's ndcg@3: 0.91971	valid_0's ndcg@4: 0.927632	valid_0's ndcg@5: 0.932242
[6]	valid_0's ndcg@1: 0.87628	valid_0's ndcg@2: 0.906028	valid_0's ndcg@3: 0.920503	valid_0's ndcg@4: 0.928377	valid_0's ndcg@5: 0.932998
[7]	valid_0's ndcg@1: 0.876399	valid_0's ndcg@2: 0.906371	valid_0's ndcg@3: 0.920751	valid_0's ndcg@4: 0.928481	valid_0's ndcg@5: 0.933159
[8]	valid_0's ndcg@1: 0.877222	

[59]	valid_0's ndcg@1: 0.882534	valid_0's ndcg@2: 0.912187	valid_0's ndcg@3: 0.92609	valid_0's ndcg@4: 0.933224	valid_0's ndcg@5: 0.937471
[60]	valid_0's ndcg@1: 0.882648	valid_0's ndcg@2: 0.91226	valid_0's ndcg@3: 0.926184	valid_0's ndcg@4: 0.933309	valid_0's ndcg@5: 0.937491
[61]	valid_0's ndcg@1: 0.882597	valid_0's ndcg@2: 0.912285	valid_0's ndcg@3: 0.926183	valid_0's ndcg@4: 0.93329	valid_0's ndcg@5: 0.937497
[62]	valid_0's ndcg@1: 0.882762	valid_0's ndcg@2: 0.912296	valid_0's ndcg@3: 0.926232	valid_0's ndcg@4: 0.933324	valid_0's ndcg@5: 0.937552
[63]	valid_0's ndcg@1: 0.882762	valid_0's ndcg@2: 0.912294	valid_0's ndcg@3: 0.92626	valid_0's ndcg@4: 0.933327	valid_0's ndcg@5: 0.93755
[64]	valid_0's ndcg@1: 0.882613	valid_0's ndcg@2: 0.912252	valid_0's ndcg@3: 0.926271	valid_0's ndcg@4: 0.933314	valid_0's ndcg@5: 0.937503
[65]	valid_0's ndcg@1: 0.882644	valid_0's ndcg@2: 0.91233	valid_0's ndcg@3: 0.926304	valid_0's ndcg@4: 0.933376	valid_0's ndcg@5: 0.937542
[66]	valid_0's ndcg@1: 0.8

[119]	valid_0's ndcg@1: 0.883777	valid_0's ndcg@2: 0.91358	valid_0's ndcg@3: 0.927259	valid_0's ndcg@4: 0.934367	valid_0's ndcg@5: 0.938407
[120]	valid_0's ndcg@1: 0.883762	valid_0's ndcg@2: 0.913602	valid_0's ndcg@3: 0.927261	valid_0's ndcg@4: 0.934384	valid_0's ndcg@5: 0.938424
[121]	valid_0's ndcg@1: 0.883736	valid_0's ndcg@2: 0.913598	valid_0's ndcg@3: 0.927278	valid_0's ndcg@4: 0.934372	valid_0's ndcg@5: 0.938421
[122]	valid_0's ndcg@1: 0.883711	valid_0's ndcg@2: 0.913568	valid_0's ndcg@3: 0.927289	valid_0's ndcg@4: 0.93435	valid_0's ndcg@5: 0.938413
[123]	valid_0's ndcg@1: 0.883736	valid_0's ndcg@2: 0.913595	valid_0's ndcg@3: 0.927297	valid_0's ndcg@4: 0.934364	valid_0's ndcg@5: 0.938419
[124]	valid_0's ndcg@1: 0.88384	valid_0's ndcg@2: 0.913686	valid_0's ndcg@3: 0.927287	valid_0's ndcg@4: 0.934434	valid_0's ndcg@5: 0.938471
[125]	valid_0's ndcg@1: 0.883814	valid_0's ndcg@2: 0.91365	valid_0's ndcg@3: 0.927296	valid_0's ndcg@4: 0.934414	valid_0's ndcg@5: 0.938461
[126]	valid_0's n

[179]	valid_0's ndcg@1: 0.884476	valid_0's ndcg@2: 0.913952	valid_0's ndcg@3: 0.927652	valid_0's ndcg@4: 0.934722	valid_0's ndcg@5: 0.9388
[180]	valid_0's ndcg@1: 0.88458	valid_0's ndcg@2: 0.913958	valid_0's ndcg@3: 0.927669	valid_0's ndcg@4: 0.934729	valid_0's ndcg@5: 0.938832
[181]	valid_0's ndcg@1: 0.884606	valid_0's ndcg@2: 0.914007	valid_0's ndcg@3: 0.927697	valid_0's ndcg@4: 0.93474	valid_0's ndcg@5: 0.938855
[182]	valid_0's ndcg@1: 0.884606	valid_0's ndcg@2: 0.914032	valid_0's ndcg@3: 0.927698	valid_0's ndcg@4: 0.934745	valid_0's ndcg@5: 0.93886
[183]	valid_0's ndcg@1: 0.884647	valid_0's ndcg@2: 0.914059	valid_0's ndcg@3: 0.927731	valid_0's ndcg@4: 0.934794	valid_0's ndcg@5: 0.93888
[184]	valid_0's ndcg@1: 0.884626	valid_0's ndcg@2: 0.91403	valid_0's ndcg@3: 0.927731	valid_0's ndcg@4: 0.934787	valid_0's ndcg@5: 0.938874
[185]	valid_0's ndcg@1: 0.884616	valid_0's ndcg@2: 0.914064	valid_0's ndcg@3: 0.927747	valid_0's ndcg@4: 0.934793	valid_0's ndcg@5: 0.938878
[186]	valid_0's ndcg

[240]	valid_0's ndcg@1: 0.884606	valid_0's ndcg@2: 0.914216	valid_0's ndcg@3: 0.927955	valid_0's ndcg@4: 0.934965	valid_0's ndcg@5: 0.938998
[241]	valid_0's ndcg@1: 0.884617	valid_0's ndcg@2: 0.914217	valid_0's ndcg@3: 0.927952	valid_0's ndcg@4: 0.934969	valid_0's ndcg@5: 0.938998
[242]	valid_0's ndcg@1: 0.884627	valid_0's ndcg@2: 0.914213	valid_0's ndcg@3: 0.927955	valid_0's ndcg@4: 0.934977	valid_0's ndcg@5: 0.939002
[243]	valid_0's ndcg@1: 0.884632	valid_0's ndcg@2: 0.914194	valid_0's ndcg@3: 0.927957	valid_0's ndcg@4: 0.934983	valid_0's ndcg@5: 0.939011
[244]	valid_0's ndcg@1: 0.88458	valid_0's ndcg@2: 0.914143	valid_0's ndcg@3: 0.927913	valid_0's ndcg@4: 0.934949	valid_0's ndcg@5: 0.938967
[245]	valid_0's ndcg@1: 0.884606	valid_0's ndcg@2: 0.914183	valid_0's ndcg@3: 0.927921	valid_0's ndcg@4: 0.93495	valid_0's ndcg@5: 0.938967
[246]	valid_0's ndcg@1: 0.88459	valid_0's ndcg@2: 0.914181	valid_0's ndcg@3: 0.927911	valid_0's ndcg@4: 0.934946	valid_0's ndcg@5: 0.93896
[247]	valid_0's n

LGBMRanker(early_stopping_rounds=50, feature_fraction=0.8, is_unbalance=True,
           learning_rate=0.15, max_depth=15, metric='ndcg', n_estimators=5000,
           num_iterations=500, num_leaves=25, objective='lambdarank',
           subsample=0.2)

------------------------------------------------------------------------------------------------------------
## prediction over crossover set to get an estimate of NDCG evaluation

In [None]:
#X_test = X_test.drop("predicted_ranking",axis= 1)

In [None]:
# Making prediction over crossover set (ranking)
test_pred = gbm.predict(X_test)

print(test_pred)

In [None]:
X_test.head(50)

In [None]:
X_test["predicted_ranking"] = test_pred
#X_test[["srch_id","prop_id","predicted_ranking"]].head(50)

In [None]:
y_test.head(50)

In [None]:
# This is our own evaluation of the model, before getting real value in kaggle score
print("--> NDCG : ", ndcg_score([y_test],[X_test["predicted_ranking"]]))

------------------------------------------------------------------------------------------
## real prediction over test_set and output for kaggle competition

In [30]:
del training_data, aux_training
gc.collect()

111

In [27]:
test_data  = pd.read_csv("test_set_VU_DM.csv")

In [31]:
prop_counter = {}
for prop_id in test_data["prop_id"].unique():
    prop_counter[prop_id] = len(test_data.loc[test_data['prop_id'] == prop_id])
    
#create new column count_prop_id
test_data['count_prop_id'] = test_data.apply(
    lambda row: (prop_counter[row.prop_id]), axis=1)    



In [33]:
# save transformations
test_data.to_csv("test_set_TRANS.csv", index=False) 

In [None]:
del prop_counter
gc.collect()

In [34]:
test_data = replace_nan_with_mean(test_data,["visitor_hist_starrating","visitor_hist_adr_usd"])
test_data = replace_nan_with_median(test_data,["orig_destination_distance"])

col_min = test_data["srch_query_affinity_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'srch_query_affinity_score',col_min)

col_min = test_data["prop_starrating"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_starrating',col_min)

col_min = test_data["prop_review_score"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_review_score',col_min)

col_min = test_data["prop_location_score1"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_location_score1',col_min)

col_min = test_data["prop_location_score2"].min() # minimum of col
test_data = replace_nan_with_value(test_data,'prop_location_score2',col_min)

In [35]:
# save transformations
test_data.to_csv("test_set_TRANS_2.csv", index=False) 

In [37]:
test_data

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp6_rate,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,count_prop_id
0,1,24,216,3.374933,177.15073,219,3180,3,4.5,1,...,,,,,,,,,,185
1,1,24,216,3.374933,177.15073,219,5543,3,4.5,1,...,,,,,,,,,,252
2,1,24,216,3.374933,177.15073,219,14142,2,3.5,1,...,,,,,,,,,,150
3,1,24,216,3.374933,177.15073,219,22393,3,4.5,1,...,,,,,,,,,,147
4,1,24,216,3.374933,177.15073,219,24194,3,4.5,1,...,,,,,,,,,,214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4959178,332787,24,216,3.374933,177.15073,117,32019,4,3.5,0,...,,,,,,,,,,13
4959179,332787,24,216,3.374933,177.15073,117,33959,4,3.0,1,...,,,,,,,,,,13
4959180,332787,24,216,3.374933,177.15073,117,35240,4,0.0,0,...,,,,,,,,,,2
4959181,332787,24,216,3.374933,177.15073,117,94437,4,0.0,0,...,,,,,,,,,,7


In [36]:
test_data = test_data.drop('date_time', axis=1)

KeyError: "['date_time'] not found in axis"

In [38]:
# Create prediction over the test set
test_pred = gbm.predict(test_data)

print(test_pred)

[-0.63257319 -0.76302679 -0.62844607 ... -1.63347539 -0.90981838
 -0.50561618]


In [39]:
# Add new column "predicted_ranking" to the test_set with predicted values of the model
test_data["predicted_ranking"] = test_pred
test_data.head(15)


Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,count_prop_id,predicted_ranking
0,1,24,216,3.374933,177.15073,219,3180,3,4.5,1,...,,,,,,,,,185,-0.632573
1,1,24,216,3.374933,177.15073,219,5543,3,4.5,1,...,,,,,,,,,252,-0.763027
2,1,24,216,3.374933,177.15073,219,14142,2,3.5,1,...,,,,,,,,,150,-0.628446
3,1,24,216,3.374933,177.15073,219,22393,3,4.5,1,...,,,,,,,,,147,-1.232246
4,1,24,216,3.374933,177.15073,219,24194,3,4.5,1,...,,,,,,,,,214,0.363629
5,1,24,216,3.374933,177.15073,219,28181,3,4.5,1,...,,,,,,,,,381,0.629128
6,1,24,216,3.374933,177.15073,219,34263,3,4.5,1,...,,,,,,,,,202,-0.07059
7,1,24,216,3.374933,177.15073,219,37567,2,4.5,0,...,,,,,,,,,175,-0.545314
8,1,24,216,3.374933,177.15073,219,50162,2,3.5,1,...,,,,,,,,,293,0.060516
9,1,24,216,3.374933,177.15073,219,54937,3,4.0,1,...,,,,,,,,,403,0.844006


In [40]:
# Sort by srch_id and predicted_ranking
test_data = test_data.sort_values(["srch_id","predicted_ranking"], ascending=[True,False])

In [41]:
test_data.head()

Unnamed: 0,srch_id,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,...,comp6_inv,comp6_rate_percent_diff,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,count_prop_id,predicted_ranking
23,1,24,216,3.374933,177.15073,219,99484,3,4.0,1,...,,,,,,,,,368,1.214638
9,1,24,216,3.374933,177.15073,219,54937,3,4.0,1,...,,,,,,,,,403,0.844006
12,1,24,216,3.374933,177.15073,219,61934,3,4.5,1,...,,,,,,,,,377,0.822917
5,1,24,216,3.374933,177.15073,219,28181,3,4.5,1,...,,,,,,,,,381,0.629128
4,1,24,216,3.374933,177.15073,219,24194,3,4.5,1,...,,,,,,,,,214,0.363629


In [42]:
# Drop all columns but srch_id and prop_id for output format
test_data.drop(test_data.columns.difference(['srch_id','prop_id']), 1, inplace=True)
test_data.head()

Unnamed: 0,srch_id,prop_id
23,1,99484
9,1,54937
12,1,61934
5,1,28181
4,1,24194


In [43]:
# Create output file for competition
test_data.to_csv("out4.csv", index=False)    

# -------------------------------------------------------------------------------------------------------------

In [44]:
gc.collect()

165