In [2]:
from datetime import datetime
from modelhub import ModelHub
from bach import display_sql_as_markdown

In [3]:
# Model hub instance used for the rest of the notebook; time aggregation uses the
# '%Y-%m-%d' format (presumably day-level buckets — confirm against ModelHub docs).
modelhub = ModelHub(time_aggregation='%Y-%m-%d')

In [4]:
# Load the Objectiv dataframe starting at 2022-02-02.
df = modelhub.get_objectiv_dataframe(start_date='2022-02-02')
# Add a readable name per event, derived from its location stack.
df['feature_nice_name'] = df.location_stack.ls.nice_name

In [5]:
# Build a `steps` column: per session, a JSON array with the nice names of its events.
df_steps = (
    df.groupby('session_id')['feature_nice_name']
    .to_json_array()
    .reset_index()
    .rename(columns={'feature_nice_name': 'steps'})
)
# Join the per-session arrays back onto the events, keep only the two columns
# of interest and drop the repeated (session_id, steps) rows.
df_steps = df.merge(df_steps, on='session_id')[['session_id', 'steps']].drop_duplicates()
df_steps.to_pandas()

Unnamed: 0_level_0,session_id,steps
event_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6307e080-d6a4-4e7d-b47f-a029b90cf6e1,97,"[Root Location: blog, Media Player: objectiv-i..."
03bfb487-a6ae-4175-bf3a-db127a75a85f,1651,"[Root Location: home, Media Player: 2-minute-v..."
21629b85-c17a-458b-a5ab-ff4c074a963c,2872,"[Root Location: home, Media Player: 2-minute-v..."
0f3b8b89-e4ff-4256-9beb-c3f252216699,2116,[Content: post-meet-objectiv-open-source-produ...
02ad3934-665c-47e9-8d76-c2a7a79f2c60,3600,[Pressable: hamburger located at Root Location...
...,...,...
0f0fc271-a3ca-4952-bed7-90ca7656dbb1,3836,[Pressable: after located at Root Location: ho...
1ba7c155-0521-431d-8f5d-564aa3e8daa6,3852,"[Root Location: home, Link: docs located at Ro..."
0d74fe99-d796-423b-bccb-f7c64a155aa6,3720,[Expandable: Reference located at Root Locatio...
044ec938-81c0-496a-9a11-f35412f81614,3975,"[Root Location: home, Link: docs located at Ro..."


## Custom n-gram function (plain Python)

In [6]:
def func_ngram(data, n):
    """
    Return every contiguous run of ``n`` items from ``data``, in order.

    :param data: a sliceable sequence (list, tuple, str, ...).
    :param n: size of each run; must be at least 1.
    :return: list with ``len(data) - n + 1`` slices of ``data``; an empty list
        when ``data`` holds fewer than ``n`` items.
    :raises ValueError: if ``n`` is smaller than 1, since such a window size
        has no meaningful n-grams.
    """
    if n < 1:
        raise ValueError(f'n must be a positive integer, got {n!r}')
    # A window starting at i covers data[i: i + n]; the last valid start is len(data) - n.
    return [data[i: i + n] for i in range(len(data) - n + 1)]

func_ngram(['a', 'b', 'c', 'd', 'e', 'f'], 3)

[['a', 'b', 'c'], ['b', 'c', 'd'], ['c', 'd', 'e'], ['d', 'e', 'f']]

In [15]:
# Sanity check on the range bound used by func_ngram: for a 6-item list and
# n = 3, len(x) - 3 + 1 is the number of window start positions.
x = ['a', 'b', 'c', 'd', 'e', 'f']
len(x) - 3 + 1

4

In [16]:
# First four items of x (four being the count computed in the previous cell).
x[:4]

['a', 'b', 'c', 'd']

### Goal: apply `func_ngram` to each row of `df_steps.steps`

In [7]:
# Pull the data into pandas so the plain-Python func_ngram can be applied row by row.
df_steps_pandas = df_steps.to_pandas()
n_gram = 3  # this value will be provided by the user

# Reference result: for every row, the list of n_gram-sized sub-lists of its steps.
df_steps_pandas['desired_col'] = df_steps_pandas['steps'].apply(lambda x: func_ngram(x, n_gram))

In [8]:
# Inspect the steps of the first row.
df_steps_pandas['steps'].iloc[0]

['Root Location: blog',
 'Media Player: objectiv-in-2-minutes located at Root Location: blog => Content: post-meet-objectiv-open-source-product-analytics-designed-for-data-sc']

In [9]:
# Matching n-gram result for the first row; a row with fewer than n_gram steps
# yields an empty list.
df_steps_pandas['desired_col'].iloc[0]

[]

In [10]:
# Same computation as the `desired_col` assignment, displayed for all rows.
df_steps_pandas['steps'].apply(lambda x: func_ngram(x, n_gram))

event_id
6307e080-d6a4-4e7d-b47f-a029b90cf6e1                                                   []
03bfb487-a6ae-4175-bf3a-db127a75a85f    [[Root Location: home, Media Player: 2-minute-...
21629b85-c17a-458b-a5ab-ff4c074a963c    [[Root Location: home, Media Player: 2-minute-...
0f3b8b89-e4ff-4256-9beb-c3f252216699    [[Content: post-meet-objectiv-open-source-prod...
02ad3934-665c-47e9-8d76-c2a7a79f2c60    [[Pressable: hamburger located at Root Locatio...
                                                              ...                        
0f0fc271-a3ca-4952-bed7-90ca7656dbb1    [[Pressable: after located at Root Location: h...
1ba7c155-0521-431d-8f5d-564aa3e8daa6    [[Root Location: home, Link: docs located at R...
0d74fe99-d796-423b-bccb-f7c64a155aa6    [[Expandable: Reference located at Root Locati...
044ec938-81c0-496a-9a11-f35412f81614    [[Root Location: home, Link: docs located at R...
110f27ac-183f-44d6-8800-ad8655825699    [[Expandable: Reference located at Root Locati...
N

In [31]:
import bach
from sql_models.util import is_postgres, is_bigquery, DatabaseNotSupportedException

In [None]:
def _bach_func_gram_pg(list_series: bach.SeriesJson, n: int) -> bach.SeriesJson:
    """
    Postgres counterpart of the n-gram sub-list builder.

    TODO: not implemented yet. The body is a bare ``...``, so calling this
    currently returns None instead of a series.
    """
    ...

def _bach_func_gram_bq(list_series: bach.SeriesJson, n: int) -> bach.SeriesJson:
    """
    Build, as a BigQuery SQL expression, the list of all sub-lists of `n`
    consecutive items of each JSON array in `list_series`.

    Steps for generating shifted sublists:
        1. Cast list_series expression to array (JSON_QUERY_ARRAY)
        2. Generate index of items per sublist
        3. Iterate over generated indexes and extract items from original array
        4. Generate expression for unnesting original array and creating array
            with final sub-lists

    :param list_series: series holding one JSON array per row.
    :param n: number of items per sub-list; it is interpolated into the SQL text
        as a literal number.
    :return: a copy of `list_series` whose expression produces, per row, a string
        of the form '[[...], [...], ...]' containing the sub-lists.
    """
    # stmt for converting array type to valid json arrays: joins the array items
    # with ', ' and wraps the result in square brackets
    _ARRAY_TO_STR_STMT = "'[' || ARRAY_TO_STRING(ARRAY({}), ', ') || ']'"
    # SQL alias for the offset at which a sub-list starts in the original array
    _FIRST_ELEMENT_SUBLIST_OFFSET = '__first_sublist_offset'
    # SQL alias for the offset of a single item inside the sub-list being built
    _GENERATED_OFFSET_SUBLIST = '__sublist_item_offset'


    # step 1. cast list string as BQ array
    arr_parsed_list_series = list_series.copy_override(
        expression=bach.expression.Expression.construct(
            'JSON_QUERY_ARRAY({})',
            list_series
        )
    )


    # step 2. offsets of the n items that belong to the sub-list starting at the
    # current offset. `n - 1` is evaluated in Python, so the SQL holds a literal.
    # Example: n = 4 and current item_pos = 2. Expression will generate:
    # [2, 3, 4, 5]
    generated_offsets_expr = bach.expression.Expression.raw(
        f"""
            GENERATE_ARRAY(
                {_FIRST_ELEMENT_SUBLIST_OFFSET}, {_FIRST_ELEMENT_SUBLIST_OFFSET} + {n - 1}
            )
        """
    )

    # step 3. for every generated offset, select the item at that offset from the
    # parsed array
    extracted_sub_list_expr = bach.expression.Expression.construct(
        f"""
        SELECT {{}}[OFFSET({_GENERATED_OFFSET_SUBLIST})]
        FROM UNNEST({{}}) as {_GENERATED_OFFSET_SUBLIST}
        """,
        arr_parsed_list_series,
        generated_offsets_expr
    )

    # step 4. unnest the parsed array with offsets and build one bracketed string
    # per start offset. The WHERE clause keeps only start offsets that still have
    # n items available (offset <= array length - n).
    # The three construct() placeholders are filled, in order, by: the step 3
    # sub-select, the parsed array and the array length.
    sub_array_list_series = list_series.copy_override(
        expression=bach.expression.Expression.construct(
            (
                f'SELECT {_ARRAY_TO_STR_STMT} FROM UNNEST({{}}) WITH OFFSET '
                f'AS {_FIRST_ELEMENT_SUBLIST_OFFSET} \n\t'
                f'WHERE {_FIRST_ELEMENT_SUBLIST_OFFSET} <= {{}} - {n}\n'
            ),
            extracted_sub_list_expr,
            arr_parsed_list_series,
            list_series.json.get_array_length(),
        ),
    )

    # group all sub-lists into one main list
    final_array_list_series = sub_array_list_series.copy_override(
        expression=bach.expression.Expression.construct(
            _ARRAY_TO_STR_STMT, sub_array_list_series,
        )
    )
    return final_array_list_series

    
def bach_func_gram(list_series: bach.SeriesJson, n: int) -> bach.SeriesJson:
    """
    Build the n-item sub-lists of a JSON-array series, dispatching to the
    implementation that matches the series' database engine.

    :param list_series: series holding one JSON array per row.
    :param n: number of items per sub-list.
    :return: whatever the engine-specific implementation returns for
        `list_series` and `n`.
    :raises DatabaseNotSupportedException: if the engine is neither Postgres
        nor BigQuery.
    """
    engine = list_series.engine
    if is_postgres(engine):
        return _bach_func_gram_pg(list_series, n)

    # The helper imported from sql_models.util is `is_bigquery`; the previous
    # spelling `is_big_query` was an undefined name.
    if is_bigquery(engine):
        return _bach_func_gram_bq(list_series, n)

    # Report the unsupported engine itself (previously an undefined name `li`).
    raise DatabaseNotSupportedException(engine)


In [None]:
# Print the SQL generated by the BigQuery implementation for n = 3.
print(_bach_func_gram_bq(df_steps['steps'], n=3).view_sql())