## Imports

In [1]:
import math
import pandas as pd
import numpy as np
from score_submission import score_subm as ss

## Cols for submission

In [2]:
# Key columns shared by every recommendation row: calc_recommendation selects
# them, sorts on them and passes them to group_concat as the group-by keys.
GR_COLS = [
    "user_id",
    "session_id",
    "timestamp",
    "step",
]

## Convert string to array 

Code comes from baseline_algorithm

In [3]:
def string_to_array(s):
    """Convert pipe separated string to array.

    :param s: Pipe separated string (e.g. ``"1|2|3"``) or a missing value
        (``None`` or a float NaN)
    :return: List of the substrings between the pipes; an empty list when
        ``s`` is missing
    :raises ValueError: If ``s`` is neither a string nor a missing value
    """

    if isinstance(s, str):
        return s.split("|")

    # math.isnan raises TypeError (e.g. for a list) or OverflowError (for a
    # huge int) on values it cannot convert to float. Those values are simply
    # "not NaN", so fall through to the ValueError below instead of leaking
    # an unrelated exception type.
    try:
        is_missing = s is None or math.isnan(s)
    except (TypeError, OverflowError):
        is_missing = False

    if is_missing:
        return []

    raise ValueError("Value must be either string or nan")

## Explode

Code comes from baseline_algorithm

In [4]:
def explode(df_in, col_expl):
    """Explode column col_expl of array type into multiple rows.

    Every cell of ``col_expl`` is parsed with ``string_to_array`` and each of
    its elements becomes one output row; the values of all other columns are
    repeated accordingly. Rows whose cell parses to an empty list produce no
    output rows.

    :param df_in: Input data frame; it is not modified
    :param col_expl: Name of the column holding pipe separated integer strings
    :return: New data frame with a fresh default index, the remaining columns
        in their original order, and ``col_expl`` as the last column with
        int64 values
    """

    # Parse once up front; nothing is written back to the input, so no copy
    # of the whole frame is needed.
    item_lists = [string_to_array(value) for value in df_in[col_expl]]
    n_items = np.array([len(items) for items in item_lists], dtype=np.int64)

    df_out = pd.DataFrame(
        {col: np.repeat(df_in[col].values, n_items)
         for col in df_in.columns.drop(col_expl)}
    )

    # Build the integer column directly and assign it as a whole column.
    # Writing strings first and then converting through ``.loc[:, col] = ...``
    # can keep the old object dtype on recent pandas, and concatenating the
    # per-row lists with numpy fails when there are no rows at all.
    flat_items = [int(item) for items in item_lists for item in items]
    df_out[col_expl] = np.array(flat_items, dtype=np.int64)

    return df_out

## Concatenate groups

Code comes from baseline_algorithm

In [5]:
def group_concat(df, gr_cols, col_concat):
    """Collapse every group of rows into one row of space separated values.

    :param df: Data frame holding the group columns and the column to join
    :param gr_cols: Column name(s) that define a group
    :param col_concat: Name of the string column whose values are joined
    :return: Data frame with one row per group: the group columns followed by
        ``col_concat`` containing the group's values joined by single spaces
    """

    def _join_with_spaces(values):
        # Values are joined in the order they appear within the group.
        return ' '.join(values)

    joined = df.groupby(gr_cols)[col_concat].apply(_join_with_spaces)

    # Turn the group keys from the index back into ordinary columns.
    return joined.to_frame().reset_index()

## Identify rows with missing click outs

Code comes from baseline_algorithm

In [6]:
def get_submission_target(df):
    """Select the clickout rows whose reference is missing.

    :param df: Data frame with ``reference`` and ``action_type`` columns
    :return: The rows of ``df`` where ``action_type`` is ``"clickout item"``
        and ``reference`` is null, with their original index
    """
    is_clickout = df["action_type"] == "clickout item"
    has_no_reference = df["reference"].isnull()

    return df[has_no_reference & is_clickout]

## Getting popularity

In [7]:
def get_popularity(df, min_ctr=50, price_quantile=0.75):
    """Build a table of frequently clicked, not too expensive items.

    Only ``"clickout item"`` rows of ``df`` are used. For every item that
    appears in an impression list the function counts its appearances and its
    clickouts, derives a click-through rate (CTR, integer percent) and attaches
    the first price seen for the item. Only items with at least one click, a
    CTR of at least ``min_ctr`` and a price not above the cutoff are kept.

    :param df: Data frame with ``action_type``, ``reference``, ``impressions``
        and ``prices`` columns
    :param min_ctr: Minimum CTR in percent an item needs to be kept
    :param price_quantile: Relative position (0..1) in the sorted list of
        unique prices that is used as the price cutoff
    :return: Data frame with columns ``reference``, ``n_apperarances``,
        ``n_clicks``, ``CTR`` and ``Price``, sorted by ``n_clicks`` descending
    """

    # Select clickouts from dataframe
    df_clickout = df[df.action_type == "clickout item"]

    # Count clickouts per clicked item
    df_item_clicks = (
        df_clickout
            .groupby("reference")
            .size()
            .reset_index(name="n_clicks")
            .astype(int)
    )

    # Count how often each item was shown by splitting the impression lists.
    # NOTE: the misspelled column name "n_apperarances" is part of the
    # returned frame and is kept so existing consumers keep working.
    hotels = explode(df_clickout, "impressions")
    hotels_grouped = (
        hotels
            .groupby("impressions")
            .size()
            .reset_index(name="n_apperarances")
            .astype(int)
            .rename(columns={"impressions": "reference"})
    )

    # Sorted list of the distinct prices
    prices = explode(df_clickout, "prices")
    unique_prices_list = prices["prices"].drop_duplicates().sort_values().tolist()

    # Pair every shown item with its price and keep the first price per item.
    # NOTE(review): this assumes each row's impressions and prices lists have
    # the same length, so both exploded frames line up row by row - confirm.
    hotel_prices_df = (
        pd.DataFrame({'reference': hotels.impressions, 'Price': prices.prices})
            .drop_duplicates(["reference"])
    )

    # Attach click counts; items that were shown but never clicked get 0
    hotels_info = pd.merge(hotels_grouped, df_item_clicks, on='reference', how='left')
    hotels_info['n_clicks'] = hotels_info['n_clicks'].fillna(0)

    # Add CTR as a truncated integer percentage
    hotels_info['CTR'] = (
        hotels_info.n_clicks / hotels_info.n_apperarances * 100
    ).astype(int)

    # Add Price
    hotels_info = pd.merge(hotels_info, hotel_prices_df, on='reference')

    # Price at the requested position of the sorted unique prices. The
    # position is clamped to the last element so a quantile of 1.0 is valid;
    # with no prices at all there is nothing to filter on.
    if unique_prices_list:
        cutoff_pos = min(int(len(unique_prices_list) * price_quantile),
                         len(unique_prices_list) - 1)
        price_cuttoff = unique_prices_list[cutoff_pos]
    else:
        price_cuttoff = float("inf")

    # Keep only rows whose CTR is at least min_ctr and whose price is not
    # above the price cutoff. The n_clicks != 0 condition was added by the
    # original author because an item with a single impression and zero
    # clicks ended up with a CTR of 100.
    hotels_info = hotels_info[
        (hotels_info.n_clicks != 0)
        & (hotels_info.CTR >= min_ctr)
        & (hotels_info.Price <= price_cuttoff)
    ]
    hotels_info = hotels_info.sort_values(["n_clicks"], ascending=False)

    return hotels_info


## Calculate recommendations

In [8]:
def calc_recommendation(df_expl, df_pop):
    """Calculate recommendations based on popularity of items.

    The final data frame will have an impression list sorted according to the
    number of clicks per item in a reference data frame.

    :param df_expl: Data frame with exploded impression list; must contain the
        GR_COLS columns and an ``impressions`` column
    :param df_pop: Data frame with items (``reference``) and number of clicks
        (``n_clicks``)
    :return: Data frame with the GR_COLS columns and ``item_recommendations``,
        the space separated impression list sorted according to popularity in
        df_pop
    """

    # Left merge keeps every impression; items missing from df_pop end up
    # with NaN n_clicks, which sort_values places last by default.
    df_expl_clicks = (
        df_expl[GR_COLS + ["impressions"]]
            .merge(df_pop,
                   left_on="impressions",
                   right_on="reference",
                   how="left")
    )

    # One sort direction per key: group columns ascending, clicks descending.
    # Derived from GR_COLS so it stays valid if the key columns change.
    sort_ascending = [True] * len(GR_COLS) + [False]

    df_sorted = (
        df_expl_clicks
            # group_concat joins strings, so items must be converted first
            .assign(impressions=lambda x: x["impressions"].apply(str))
            .sort_values(GR_COLS + ["n_clicks"], ascending=sort_ascending)
    )

    df_out = group_concat(df_sorted, GR_COLS, "impressions")

    return df_out.rename(columns={'impressions': 'item_recommendations'})

## Test

In [9]:
    # Input and output locations
    train_csv, test_csv = './data/train.csv', './data/test.csv'
    ground_truth_csv, subm_csv = './data/groundTruth.csv', './data/submission_popular.csv'

    # Load the training and test data
    df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

    # Popularity table built from the training data
    df_popular = get_popularity(df_train)

    # Test clickouts with a missing reference, one row per impression
    df_target = get_submission_target(df_test)
    df_expl = explode(df_target, "impressions")

    # Rank the impressions and write the submission file
    df_out = calc_recommendation(df_expl, df_popular)
    df_out.to_csv(subm_csv, index=False)

    # Score the written submission against the ground truth file
    ss.main(ground_truth_csv, subm_csv)

Reading ground truth data ./data/groundTruth.csv ...
Reading submission data ./data/submission_popular.csv ...
Mean reciprocal rank: 0.5001593724458934
