# Netflix recommendations

<div class="alert alert-block alert-warning">
Replace <code>YOUR_GITHUB_TOKEN</code> in the install script. To get your token follow the instructions in the <a href="../README.md">README.md</a>
</div>

## Boilerplate

In [1]:
%pip install pandas
from google.colab import userdata
github_token = userdata.get('github_token')
url = f"https://us-central1-data-359211.cloudfunctions.net/github-proxy/superlinked-0.1.0-py3-none-any.whl?token={github_token}"
!pip install "$url"

Collecting superlinked==0.1.0
  Downloading https://us-central1-data-359211.cloudfunctions.net/github-proxy/superlinked-0.1.0-py3-none-any.whl?token=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 (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.8/57.8 kB[0m [31m59.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jsonpath-ng (from superlinked==0.1.0)
  Downloading jsonpath_ng-1.6.0-py3-none-any.whl (29 kB)
Collecting sentence-transformers (from superlinked==0.1.0)
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ply (from jsonpath-ng->superlinked==0.1.0)
  Downloading ply-3.11-py2.py3-none-any.whl (49 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.6/49.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00

## Imports and constants

In [2]:
import pandas as pd
from datetime import datetime, timezone
import pytz

from superlinked.framework.common.dag.recency_node import PeriodTimeParam
from superlinked.framework.common.schema.schema import schema, String, Timestamp, IdField, Integer
from superlinked.framework.common.parser.dataframe_parser import DataFrameParser
from superlinked.framework.dsl.executor.in_memory.in_memory_executor import InMemoryExecutor, InMemoryApp
from superlinked.framework.dsl.index.index import Index
from superlinked.framework.dsl.query.param import Param
from superlinked.framework.dsl.query.query import Query
from superlinked.framework.dsl.query.result import Result
from superlinked.framework.dsl.source.in_memory_source import InMemorySource
from superlinked.framework.dsl.space.text_similarity_space import TextSimilaritySpace
from superlinked.framework.dsl.space.recency_space import RecencySpace

In [3]:
MODEL = "sentence-transformers/paraphrase-MiniLM-L3-v2"
YEAR_IN_DAYS = 365
TOP_N = 10

### Helpers

In [4]:
def get_ordered_result_tuples(result: Result, top_n: int) -> list[tuple[int]]:
    return [(i+1, int(entity.id_.object_id)) for i, entity in enumerate(result.entities[:top_n])]

def get_events_by_id_list(id_list_tuple: list[tuple[int]], df: pd.DataFrame) -> pd.DataFrame:
    if df.index.name != "id":
        df = df.set_index("id")
    result_df = df.loc[[id_tuple[1] for id_tuple in id_list_tuple]]
    print(result_df)
    result_df["order"] = [id_tuple[0] for id_tuple in id_list_tuple]
    return result_df

def parse_results(result: Result, df: pd.DataFrame, top_n: int = TOP_N) -> pd.DataFrame:
    id_tuples = get_ordered_result_tuples(result=result, top_n=top_n)
    return get_events_by_id_list(id_list_tuple=id_tuples, df=df)

def datetimestr_to_epoch(date_str):
    # Parse the date string to a datetime object
    dt = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    # Convert the datetime object to UTC timezone
    dt = dt.replace(tzinfo=timezone.utc)
    # Get the total seconds since Unix Epoch
    epoch_time = (dt - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds()
    return epoch_time

def datestr_to_epoch(date_str):
    # Parse the date string to a datetime object without time information
    dt = datetime.strptime(date_str, "%Y-%m-%d")
    # Localize the datetime object to UTC timezone
    utc_tz = pytz.utc
    dt = utc_tz.localize(dt)
    # Get the total seconds since Unix Epoch
    epoch_time = (dt - datetime(1970, 1, 1, tzinfo=utc_tz)).total_seconds()
    return epoch_time

## Explore dataset

In [6]:
event_df = pd.read_csv("/content/blox-event-table-data.csv")
event_df = event_df.fillna('')
event_df = event_df[['id', 'date_created', 'date_modified', 'slug', 'date', 'age_limit', 'image', 'venue_id', 'description', 'time1', 'time2', 'time1_type_id', 'time2_type_id', 'private', 'venue_undisclosed', 'title', 'sale_ended', 'metadata_image', 'draft', 'release_date', 'creator_id', 'last_modifier_id', 'white_label_only', 'deleted', 'artist_names', 'promoter_names', 'organization_names', 'club_name', 'club_address', 'creator_email', 'modifier_email']].dropna(how='any', subset=['id'])
event_df['id'] = event_df["id"]
event_df['date_created'] = event_df["date_created"].apply(datetimestr_to_epoch)
event_df['date_modified'] = event_df["date_modified"].apply(datetimestr_to_epoch)
event_df['date'] = event_df["date"].apply(datestr_to_epoch)
event_df.head()
print(event_df)

       id  date_created  date_modified  \
0       1  1.642705e+09   1.643230e+09   
1       2  1.567641e+09   1.583939e+09   
2       3  1.567792e+09   1.579149e+09   
3       4  1.567793e+09   1.583939e+09   
4       5  1.567793e+09   1.583939e+09   
..    ...           ...            ...   
843  1125  1.699896e+09   1.699902e+09   
844  1134  1.700099e+09   1.700099e+09   
845  1135  1.700157e+09   1.700157e+09   
846  1138  1.700428e+09   1.700442e+09   
847  1140  1.700632e+09   1.700633e+09   

                                         slug          date  age_limit  \
0                       primary-presents-melt  1.643328e+09         21   
1    overdrive-w-agent-orange-dj-gettoblaster  1.569629e+09         21   
2               proper-presents-dave-owen-nyc  1.568160e+09         21   
3                  proper-presents-rees-urban  1.568765e+09         21   
4                     proper-presents-dave-uv  1.569370e+09         21   
..                                        ...      

## Set up Superlinked

In [7]:
@schema
class EventSchema:
    id: IdField
    date_created: Timestamp
    date_modified: Timestamp
    slug: String
    date: Timestamp
    age_limit: Integer
    image: String
    description: String
    time1: String
    time2: String
    private: String
    venue_undisclosed: String
    title: String
    release_date: Timestamp
    white_label_only: String
    artist_names: String
    promoter_names: String
    organization_names: String
    club_name: String
    club_address: String
    creator_email: String
    modifier_email: String

In [15]:
print(EventSchema)
print(event)
print(description_space)
print(event_index)

<class 'superlinked.framework.common.schema.schema.DecoratedType'>
<superlinked.framework.common.schema.schema.DecoratedType object at 0x7909d4487070>
<superlinked.framework.dsl.space.text_similarity_space.TextSimilaritySpace object at 0x7909d44b3a60>
<superlinked.framework.dsl.index.index.Index object at 0x790ac8c1e1a0>


In [9]:
event = EventSchema()

In [12]:
description_space = TextSimilaritySpace(text=event.description, model=MODEL)
title_space = TextSimilaritySpace(text=event.title, model=MODEL)
artist_space = TextSimilaritySpace(text=event.artist_names, model=MODEL)
promoter_space = TextSimilaritySpace(text=event.promoter_names, model=MODEL)
venue_space = TextSimilaritySpace(text=event.club_name, model=MODEL)
recency_space = RecencySpace(timestamp=event.date, period_time_param_list=[
    PeriodTimeParam(16 * YEAR_IN_DAYS, negative_filter=0),
    PeriodTimeParam(40 * YEAR_IN_DAYS, negative_filter=0),
    PeriodTimeParam(160 * YEAR_IN_DAYS, negative_filter=0)
])

In [14]:
event_index = Index(spaces=[description_space, title_space, artist_space, promoter_space, venue_space, recency_space])

.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/69.6M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
advanced_query = (
    Query(event_index, weights={
        description_space: Param("description_weight"),
        title_space: Param("title_weight"),
        artist_space: Param("artist_weight"),
        promoter_space: Param("promoter_weight"),
        venue_space: Param("venue_weight"),
        recency_space: Param("recency_weight")
    })
    .find(event)
    .similar(description_space.text, Param("description_query_text"))
    .similar(title_space.text, Param("title_query_text"))
    .similar(artist_space.text, Param("artist_query_text"))
    .similar(promoter_space.text, Param("promoter_query_text"))
    .similar(venue_space.text, Param("venue_query_text"))
)

In [None]:
df_parser = DataFrameParser(schema=event)

In [None]:
source: InMemorySource = InMemorySource(event, parser=df_parser)
executor: InMemoryExecutor = InMemoryExecutor(sources=[source], indices=[event_index])
app: InMemoryApp = executor.run()

In [None]:
source.put([event_df])

## Run queries

### Queries

With the simple query, I can search with my text in all of the fields

In [None]:
# result: Result = app.query(
#     simple_query,
#     query_text="Sci-fi epic action film",
#     description_weight=1,
#     title_weight=1,
#     genre_weight=1,
#     recency_weight=0
# )
# parse_results(result, movie_df, 10)

With the advanced query, I can supply different search terms for each attribute of the event.

In [None]:
result = app.query(
    advanced_query,
    description_query_text="",
    title_query_text="",
    artist_query_text="lesprite",
    promoter_query_text="",
    venue_query_text="arbella",
    description_weight=0,
    title_weight=0,
    artist_weight=10,
    promoter_weight=0,
    venue_weight=10,
    recency_weight=-15,
)
parse_results(result, event_df, 3)

     date_created  date_modified  \
id                                 
957  1.683852e+09   1.683852e+09   
7    1.568033e+09   1.583939e+09   
190  1.557408e+09   1.591714e+09   

                                              slug          date  age_limit  \
id                                                                            
957                       lesprite-nite-at-arbella  1.683936e+09         21   
7    deep-grooves-iv-in-support-of-for-real-spaces  1.571357e+09         21   
190                               deep-grooves-iii  1.562112e+09         21   

                                        image  venue_id  \
id                                                        
957  e868e4b8-2cd1-4238-aac2-8f76eb594b7f.jpg        50   
7        6675a8723fa80cebdc53c0eeb3cf08f8.jpg        50   
190      91c314017cafb5f0de20f4b61dcd5a73.jpg        50   

                                           description     time1     time2  \
id                                               

Unnamed: 0_level_0,date_created,date_modified,slug,date,age_limit,image,venue_id,description,time1,time2,...,white_label_only,deleted,artist_names,promoter_names,organization_names,club_name,club_address,creator_email,modifier_email,order
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
957,1683852000.0,1683852000.0,lesprite-nite-at-arbella,1683936000.0,21,e868e4b8-2cd1-4238-aac2-8f76eb594b7f.jpg,50,,22:00:00,03:00:00,...,False,False,"Brown Sugar, Flower Food, Lesprite",,XXXXXXX,Arbella,"112 W Grand Ave, Chicago, IL 60654, USA",matt@thebloxoffice.com,matt@thebloxoffice.com,1
7,1568033000.0,1583939000.0,deep-grooves-iv-in-support-of-for-real-spaces,1571357000.0,21,6675a8723fa80cebdc53c0eeb3cf08f8.jpg,50,"Deep Grooves is back for its 4th edition, now ...",17:00:00,01:30:00,...,False,False,"Beaux, Cope, Desadeca, JRynecki, Liam Wells, M...",,Abound Productions,Arbella,"112 W Grand Ave, Chicago, IL 60654, USA",matt@thebloxoffice.com,skhaita2@gmail.com,2
190,1557408000.0,1591714000.0,deep-grooves-iii,1562112000.0,21,91c314017cafb5f0de20f4b61dcd5a73.jpg,50,"A night of deep, soulful, punchy grooves in th...",22:00:00,01:30:00,...,False,False,"Desadeca, JRynecki b2b Cyric, Liam Wells",Abound Productions,Abound Productions temp,Arbella,"112 W Grand Ave, Chicago, IL 60654, USA",matt@thebloxoffice.com,skhaita2@gmail.com,3
