In [1]:
import numpy as np 
import pandas as pd 
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from datetime import datetime
import os


In [133]:
%%time
train = pd.read_csv('../data/train_2013.csv',
                    dtype={'is_booking':bool,'srch_destination_id':np.int32, 'hotel_cluster':np.int32},
                    usecols=['srch_destination_id','is_booking','hotel_cluster'],
                    chunksize=1000000)

aggs = []
print('-'*38)
for chunk in train:
    agg = chunk.groupby(['srch_destination_id',
                         'hotel_cluster'])['is_booking'].agg(['sum','count'])
    agg.reset_index(inplace=True)
    aggs.append(agg)
    print('.',end='')
    
print('')

aggs = pd.concat(aggs, axis=0)

agg = aggs.groupby(['srch_destination_id','hotel_cluster']).sum().reset_index()
agg.head()

agg['count'] -= agg['sum']
agg = agg.rename(columns={'sum':'bookings','count':'clicks'})
agg['relevance'] = agg['bookings'] + 0.1 * agg['clicks']

print("preprocess end")

def most_popular(group, n_max=5):
    """Return the ``n_max`` most relevant hotel clusters of ``group`` as one string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to rank; must provide ``'relevance'`` and ``'hotel_cluster'`` columns.
    n_max : int, default 5
        Maximum number of cluster ids to return.

    Returns
    -------
    str
        Cluster ids ordered by descending relevance, separated by exactly one
        space. An empty group yields an empty string.
    """
    relevance = group['relevance'].values
    hotel_cluster = group['hotel_cluster'].values
    # argsort is ascending, so reverse it to put the most relevant cluster first.
    top_clusters = hotel_cluster[np.argsort(relevance)[::-1]][:n_max]
    # Join explicitly instead of slicing np.array_str(): the array repr pads
    # narrower numbers (e.g. ' 5 10  3') and wraps long rows, which leaks stray
    # whitespace into the space-separated prediction string.
    return ' '.join(str(cluster) for cluster in top_clusters)

# Popularity lookup: one row per srch_destination_id (the index) holding the
# string of top clusters produced by most_popular().
most_pop = agg.groupby(['srch_destination_id']).apply(most_popular)
most_pop = pd.DataFrame(most_pop).rename(columns={0:'hotel_cluster'})

train_df = pd.read_csv("../data/train_2013.csv", usecols=['srch_destination_id','is_booking','hotel_cluster', 'user_location_city', 'orig_destination_distance'])

test = pd.read_csv('../data/test.csv',
                    dtype={'srch_destination_id':np.int32},
                    usecols=['srch_destination_id', 'user_location_city', 'orig_destination_distance'])

# Left join: test rows whose destination is absent from most_pop keep NaN in
# 'hotel_cluster'.
test = test.merge(most_pop, how='left',left_on='srch_destination_id',right_index=True)

# Model training data: booking rows only, missing values encoded as -1.
train_df = train_df.fillna(-1)
train_df = train_df[train_df["is_booking"] == 1]
np.random.seed(402)
# Cap the sample size: choice(..., replace=False) raises when asked for more
# rows than are available.
sample_size = min(50000, len(train_df))
# `.ix` was removed in pandas 1.0. The sampled values are index labels, so the
# label-based `.loc` is the equivalent row selector.
train_df = train_df.loc[np.random.choice(train_df.index, sample_size, replace=False)]

# Positional column slices (formerly `.ix[:, :3]` / `.ix[:, 4:5]`) become `.iloc`.
# NOTE(review): read_csv returns columns in file order, not `usecols` order;
# presumably the first three columns are the features shared with `test` -- confirm.
train_x = train_df.iloc[:, :3]
train_y = train_df.iloc[:, 4:5]
assert list(train_y.columns) == ['hotel_cluster'], "unexpected column order in train_2013.csv"

# The `%%time` that used to sit here was removed: a cell magic is only valid on
# the first line of a cell. If this section is moved into its own cell, it can
# be restored as that cell's first line.
print("modeling strart")
model = RandomForestClassifier(n_estimators=10, max_depth=7, n_jobs=-1, random_state=777)
print('='*50)
print('# Test shape : {}'.format(test.shape))

# np.ravel hands sklearn the 1-D target it expects, whether train_y is a Series
# or a single-column DataFrame.
model.fit(train_x, np.ravel(train_y))

print("preprocess test data")
# NOTE(review): `test1` was used here without ever being defined in the notebook.
# It is reconstructed as the test rows that received no popularity-based
# prediction; that is the only reading under which the string concatenation of
# hotel_cluster_x and hotel_cluster_y below yields one prediction per row -- confirm.
test1 = test[test["hotel_cluster"].isnull()]
test1 = test1.fillna(-1)
test1 = test1.drop("hotel_cluster", axis=1)

print("printing predict test data")
if len(test1) > 0:
    preds = model.predict_proba(test1)
    # argsort yields column positions of predict_proba, ordered here from most
    # to least probable; translate them to real labels through model.classes_
    # (positions equal labels only when every label 0..k-1 occurs in training).
    preds = model.classes_[np.fliplr(np.argsort(preds, axis=1))]
else:
    # Nothing left to predict: keep an empty ranking so the code below still runs.
    preds = np.empty((0, len(model.classes_)), dtype=model.classes_.dtype)

random_model = pd.DataFrame([ " ".join(row) for row in preds[:,:5].astype(str)], columns=["hotel_cluster"])

# Expose the original row labels as an 'index' column on both frames so the
# model predictions can be matched back to their test rows.
test_sample = test.reset_index()
test1 = test1.reset_index()

# Both frames now share a fresh 0..n-1 index, so the column-wise concat lines up
# row by row.
test1 = pd.concat([test1,random_model], axis=1)

test1 = test1[["index","hotel_cluster"]]

result_df = test_sample.merge(test1, how = "left", on="index")[["hotel_cluster_x", "hotel_cluster_y"]]

# Each row has either a popularity string (x) or a model string (y); the other
# side is NaN, so after filling with "" the concatenation keeps the one present.
result_df = result_df.fillna("")
result_df = result_df["hotel_cluster_x"] + result_df["hotel_cluster_y"]
result_df = result_df.to_frame(name="hotel_cluster")
# NOTE(review): the row position is written as 'id'; this assumes the ids in
# test.csv are 0..n-1 in file order -- confirm.
result_df.index.names = ["id"]

file_name = datetime.now().strftime("result_%Y%m%d%H%M%S") + '.csv'
# Make sure the target directory exists before writing the submission.
os.makedirs('../output', exist_ok=True)
result_df.to_csv(os.path.join('../output',file_name), index=True)

--------------------------------------
............
