In [27]:
import pandas as pd
import datatable as dt
from tqdm import tqdm
import graphviz

ACTIONS_WITH_ITEM_REFERENCE = {
    "search for item",
    "interaction item info",
    "interaction item image",
    "interaction item deals",
    "interaction item rating",
    "clickout item",
}

In [3]:
df = pd.read_csv("../../../data/events_sorted.csv", nrows=100000) 

In [46]:
all_obs = []
for user_id, df_user in tqdm(df.iloc[:100000].groupby("user_id")):
    obs = {}
    obs["user_id"] = user_id
    interacted_ranks = []
    for row in df_user.to_dict(orient='records'):
        
        action_type = row["action_type"].replace(" ", "_")
        
        if action_type == "clickout_item" and row["clickout_step_rev"] > 1:
            impressions = row["impressions"].split("|")
                
        if action_type == "clickout_item" and row["clickout_step_rev"] == 1:
            impressions = row["impressions"].split("|")
            if row["reference"] not in impressions:
                continue
            obs["index_clicked"] = impressions.index(row["reference"])
            for rank, item_id in enumerate(row["impressions"].split("|")):
                obs["item_id_impressions_clickout_item_{:02d}".format(rank+1)] = int(item_id)
            for rank, price in enumerate(row["prices"].split("|")):
                obs["price_clickout_item_{:02d}".format(rank+1)] = int(price)
        
        if row["clickout_step_rev"] <= 10:
            obs["timestamp_{}_{:02d}".format(action_type, row["clickout_step_rev"])] = "TS="+str(row["timestamp"])
            if row["action_type"] in ACTIONS_WITH_ITEM_REFERENCE:
                try:
                    impressions = row["impressions"].split("|")
                    if row["reference"] in impressions:
                        interacted_ranks.append(impressions.index(row["reference"]))
                except:
                    pass
                        
                obs["item_id_{}_{:02d}".format(action_type, row["clickout_step_rev"])] = "ITEM_ID="+str(row["reference"])
                
    for i, rank in enumerate(interacted_ranks[::-1]):
        obs["interacted_rank_{}".format(i)] = rank

    all_obs.append(obs)

100%|██████████| 6241/6241 [00:15<00:00, 412.33it/s]


In [17]:
df_all = pd.DataFrame.from_records(all_obs)

In [18]:
price_cols = [col for col in df_all.columns if col.startswith("price_")]
item_id_cols = [col for col in df_all.columns if col.startswith("item_id_impressions_clickout_item")]
df_all = df_all[df_all["index_clicked"] > 0]
X = df_all[price_cols + item_id_cols].fillna(-1)
y = df_all["index_clicked"]

In [19]:
X.dtypes

price_clickout_item_01                  float64
price_clickout_item_02                  float64
price_clickout_item_03                  float64
price_clickout_item_04                  float64
price_clickout_item_05                  float64
price_clickout_item_06                  float64
price_clickout_item_07                  float64
price_clickout_item_08                  float64
price_clickout_item_09                  float64
price_clickout_item_10                  float64
price_clickout_item_11                  float64
price_clickout_item_12                  float64
price_clickout_item_13                  float64
price_clickout_item_14                  float64
price_clickout_item_15                  float64
price_clickout_item_16                  float64
price_clickout_item_17                  float64
price_clickout_item_18                  float64
price_clickout_item_19                  float64
price_clickout_item_20                  float64
price_clickout_item_21                  

In [38]:
from gplearn.genetic import SymbolicTransformer, SymbolicRegressor
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split

est = make_pipeline(
        SymbolicTransformer(parsimony_coefficient=.001,
                          random_state=1,
                          function_set=['add', 'sub', 'mul', 'div', 'min', 'max'],
                          verbose=True),
        LGBMClassifier()
)

X_tr, X_va, y_tr, y_va = train_test_split(X, y)
est.fit(X_tr, y_tr)

pred = est.predict(X_va)

print((pred == y_va).mean())

    |    Population Average   |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    26.42 0.04248540414570351       63 0.20787176829398604              N/A      7.77s
   1     7.42 0.1065611350014104        5 0.20802811444839048              N/A      8.88s
   2      3.8 0.16186912772494683        3 0.20801685700800113              N/A      7.66s
   3     2.97 0.17053920454932792        3 0.20801685700800113              N/A      6.64s
   4      3.0 0.17226940406588387        3 0.20801685700800113              N/A      5.91s
   5     3.07 0.17249396442292894        5 0.20801894145969121              N/A      5.32s
   6      3.0 0.1684569651675334        3 0.20801685700800113              N/A      4.82s
   7     3.05 0.17002354842603631        3 0.20801685700800113              N/A      4.37s
   8      3.0 0.1723708700993

ValueError: Lengths must match to compare

In [36]:
dot_data = est._program.export_graphviz()
graph = graphviz.Source(dot_data)
graph

AttributeError: 'Pipeline' object has no attribute '_program'

In [44]:
print((pred == y_va.values).mean())

0.10922330097087378


In [43]:
pred

array([ 1.,  1.,  7.,  3.,  5.,  1.,  6.,  5.,  2.,  1.,  9.,  1.,  1.,
        2.,  1.,  1.,  1.,  1.,  1.,  4.,  1.,  1.,  1., 15.,  5.,  2.,
        2.,  1., 19., 19.,  1.,  2.,  6.,  3.,  2.,  1.,  2.,  1.,  1.,
        8.,  1.,  1.,  9.,  7.,  1.,  1.,  1.,  1.,  1.,  2.,  2., 11.,
        4.,  2.,  2.,  1.,  1.,  1.,  1.,  1.,  7.,  1.,  2.,  1.,  6.,
        2.,  1., 18.,  3.,  1.,  1., 11.,  1.,  1.,  2.,  4.,  7.,  1.,
        5.,  5.,  1.,  3.,  1.,  1.,  1.,  7.,  4.,  1., 11.,  1.,  6.,
        1.,  1.,  1.,  6.,  4.,  2.,  2.,  1.,  1.,  1.,  2.,  1.,  7.,
        4.,  1.,  4.,  1.,  7.,  6.,  3.,  4.,  3.,  1.,  2.,  2.,  4.,
       10.,  2., 10.,  3.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  5.,  1.,
        1.,  2.,  1.,  1.,  7.,  6.,  1.,  1.,  2.,  2.,  1.,  2.,  1.,
        1.,  4.,  1.,  3.,  2.,  2.,  1.,  8.,  4.,  1.,  9.,  1.,  1.,
        1.,  2.,  4.,  1.,  1.,  1.,  4., 19.,  1.,  2.,  1.,  6.,  1.,
        2.,  1.,  1.,  1., 13.,  2.,  3.,  2.,  2.,  3., 14.,  1