In [1]:
import pandas as pd
import numpy as np
import os
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import matplotlib.pyplot as plt
from numpy.linalg import inv
from IPython.display import display
from tqdm import tqdm
from joblib import Parallel, delayed 
import random
import time
import pickle
from copy import deepcopy

# Render every column, and at most 100 rows, when a DataFrame is displayed.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
%matplotlib inline

In [2]:
# The data directory is resolved relative to the current working directory.
ROOT_PATH = os.getcwd()
DATA_PATH = os.path.join(ROOT_PATH, "data")

# Build the two CSV paths in one pass (train first, then test).
train_data_file, test_data_file = (
    os.path.join(DATA_PATH, csv_name) for csv_name in ("train.csv", "test.csv")
)

# Load both splits, preserving the train-then-test read order.
train, test = (
    pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file)
)

In [3]:
def rank_properties(user_id, list_prop, search_query_params=None, rng=None):
    """Return the given properties in a random order.

    ``user_id`` and ``search_query_params`` are accepted but not used by
    this implementation; the ordering is purely random.

    Parameters
    ----------
    user_id
        Identifier the ranking is requested for (unused).
    list_prop
        Copyable sequence of properties to rank (list, tuple, ...). The
        caller's object is never modified.
    search_query_params
        Optional query parameters (unused).
    rng
        Optional object exposing ``shuffle`` (e.g. a ``random.Random``
        instance). When omitted, the module-level ``random`` generator is
        used, so ``random.seed`` controls reproducibility.

    Returns
    -------
    list
        A new list containing deep copies of the input items, shuffled.
    """
    # Deep-copy so the caller's data is untouched, then force a list so the
    # in-place shuffle also works for immutable inputs such as tuples.
    new_prop_list = list(deepcopy(list_prop))

    shuffler = random if rng is None else rng
    shuffler.shuffle(new_prop_list)

    return new_prop_list

In [4]:
def get_search_results(user_id, list_prop, search_query_params=None):
    """Pair ``user_id`` with every property, in ranked order.

    The ordering is delegated to ``rank_properties``; the result is a list of
    ``(user_id, property)`` tuples, one per ranked property.
    """
    ordered_props = rank_properties(user_id, list_prop, search_query_params)

    return [(user_id, prop) for prop in ordered_props]

In [5]:
# Keep only the search id and property key columns of the test set.
relevant_columns = ['srch_id', 'prop_key']
relevant_data = test.loc[:, relevant_columns]

relevant_data.head(n=5)

Unnamed: 0,srch_id,prop_key
0,-1087756044,3075608
1,-1087756044,242706
2,-1087756044,247231
3,-1087756044,258704
4,-1087756044,3407116


In [6]:
# Collapse the rows of each search into one list of property keys per srch_id,
# then order the resulting one-row-per-search frame by srch_id.
sorted_search_ids_data = (
    relevant_data
    .groupby('srch_id')['prop_key']
    .apply(list)
    .reset_index(name='prop_ids')
    .sort_values(by='srch_id')
)

sorted_search_ids_data.head()

Unnamed: 0,srch_id,prop_ids
0,-2147403968,"[358698, 242180, 270584, 248136, 245287, 36386..."
1,-2145422147,"[260096, 282197, 1098965, 342228, 282202, 2479..."
2,-2145205355,"[444331, 248735, 299983, 552966, 247024, 26321..."
3,-2144648286,"[588300, 3270102, 253920, 453199, 3877123, 480..."
4,-2144249235,"[277410, 289047, 349893, 254365, 255737, 27347..."


In [7]:
# Seed the global RNG inside this cell so re-running it always yields the same
# shuffled ranking (rank_properties draws from the ``random`` module).
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Rank each search's properties and flatten to (srch_id, prop_key) pairs.
# Iterating the two columns directly avoids the per-row Series that
# ``iterrows`` constructs and the unused index it yields.
final_results = []
for srch_id, prop_ids in zip(sorted_search_ids_data['srch_id'],
                             sorted_search_ids_data['prop_ids']):
    final_results.extend(get_search_results(user_id=srch_id, list_prop=prop_ids))

In [8]:
# Turn the flat list of (srch_id, prop_key) tuples into a two-column frame.
results = pd.DataFrame(
    data=final_results,
    columns=['srch_id', 'prop_key'],
)
results.head(n=5)

Unnamed: 0,srch_id,prop_key
0,-2147403968,3182870
1,-2147403968,265073
2,-2147403968,3895289
3,-2147403968,3802344
4,-2147403968,287623


In [9]:
# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing (no-op when it is already there).
os.makedirs('output', exist_ok=True)
results.to_csv('output/rresults.csv', index=False)

## _Fin._