In [56]:
import json
import os
import pickle
import re
from typing import Set
from typing import List

import numpy as np
import openai
import pandas as pd
import requests
from apiclient.discovery import build
from nltk.tokenize import sent_tokenize
from transformers import GPT2TokenizerFast

In [4]:
# Credentials are read from the environment so real keys never have to be
# written into the notebook. The original placeholder strings remain the
# fallback, so behaviour is unchanged when a variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")
search_api_key = os.environ.get("GOOGLE_SEARCH_API_KEY", "GOOGLE_SEARCH_API_KEY")

In [49]:
# Fast GPT-2 tokenizer loaded from the "gpt2" pretrained files; count_tokens uses it.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def count_tokens(text: str) -> int:
    """Return how many token ids the module-level ``tokenizer`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

    
def extract_text(snippet: str, title: str, link: str) -> List[tuple]:
    """
    Clean one search hit and package it as a single row.

    A date of the form "Mon D, YYYY" (e.g. "Jan 5, 2023") is looked up in
    ``title`` and removed from it. Then bracketed/parenthesised groups, runs of
    three or more dots (with surrounding whitespace) and quote characters are
    stripped from both the title and the snippet.

    Args:
        snippet: snippet text of the search hit.
        title: title text of the search hit.
        link: URL of the search hit; passed through unchanged.

    Returns:
        A one-element list holding the tuple
        ``(clean_title, clean_snippet, link, token_count, date)``, where
        ``token_count`` is ``count_tokens`` of the cleaned title and snippet
        joined by a space, and ``date`` is the matched date string or ``None``.
    """
    date_regex = r'\b(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s+\d{1,2},\s+\d{4}\b'
    # The bracket alternative is non-greedy on purpose: a greedy ".*" would
    # swallow everything between the first opening and the last closing bracket,
    # deleting ordinary text that sits between two separate bracketed groups.
    noise_regex = r"[\(\[].*?[\)\]]|\s*\.{3,}\s*|['\"]"

    date_match = re.search(date_regex, title)
    date = date_match.group() if date_match is not None else None

    ntitle = re.sub(date_regex, '', title)
    ntitle = re.sub(noise_regex, '', ntitle)
    nsnippet = re.sub(noise_regex, '', snippet)
    return [(ntitle, nsnippet, link, count_tokens(ntitle + " " + nsnippet), date)]
    

In [140]:
# Send the short prompt to the completion model.
prompt_query = "layup drill"
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=prompt_query,
    temperature=0.3,
    max_tokens=300,
    top_p=0.0,
    frequency_penalty=0.0,
    presence_penalty=0.0,
)
# Show the first choice's text with every newline removed.
print(response["choices"][0]["text"].replace("\n", ""))

A layup drill is a basketball drill used to practice the layup shot. It involves a player dribbling the ball up the court and then shooting a layup. The drill can be done with one or two players, and can be modified to include different types of layups, such as reverse layups or floaters. The drill can also be used to practice defensive footwork and positioning.


### Preprocessing + Google Search

In [141]:
# Use the completion text, with newlines removed, as the image-search query.
search_query = "".join(response["choices"][0]["text"].split("\n"))
resource = build("customsearch", 'v1', developerKey=search_api_key).cse()
result = resource.list(q=search_query, cx='CUSTOM_SEARCH_URL', searchType='image').execute()

res = []
# Default to an empty list so a response without an "items" key (no hits)
# yields an empty table instead of a KeyError.
for items in result.get("items", []):
    print(items['snippet'], items['title'], items['link'])
    # Pass by keyword: extract_text is declared as (snippet, title, link), so
    # the positional order (title, snippet, link) swapped the two text columns.
    res += extract_text(snippet=items['snippet'], title=items['title'], link=items['link'])

# One row per hit; drop repeated title/snippet pairs and renumber rows from 0.
df = (
    pd.DataFrame(res, columns=["Title", "Snippet", "Link", "Tokens", "Date"])
    .drop_duplicates(['Title', 'Snippet'])
    .reset_index(drop=True)
)
df.head()
    

Coach Dave Taylor Coach Dave Taylor https://images.squarespace-cdn.com/content/v1/51865a34e4b0046126d95f2c/1414776785084-JI5R5HC6PCARSEKPZWC3/image-asset.png
The Ultimate Lay Up Drill with 5 Essential Lay Ups for Youth and ... The Ultimate Lay Up Drill with 5 Essential Lay Ups for Youth and ... https://www.breakthroughbasketball.com/training/graphics/ultimate-lay-up.jpg
73 Basketball Drills for Players and Coaches (2022 Update) 73 Basketball Drills for Players and Coaches (2022 Update) https://www.basketballforcoaches.com/wp-content/uploads/2020/06/73-basketball-drills.jpg
The Pure Guide to Scoring Inside & Layups in Basketball - Hooper Boost The Pure Guide to Scoring Inside & Layups in Basketball - Hooper Boost https://www.hooperboost.com/wp-content/uploads/2021/01/LAYUPtypes.jpg
22 Simple, Fun & Effective Basketball Drills for Coaches 22 Simple, Fun & Effective Basketball Drills for Coaches https://www.online-basketball-drills.com/wp-content/uploads/2019/07/bunnies2.jpg
The Pure Guid

Unnamed: 0,Title,Snippet,Link,Tokens,Date
0,Coach Dave Taylor,Coach Dave Taylor,https://images.squarespace-cdn.com/content/v1/...,6,
1,The Ultimate Lay Up Drill with 5 Essential Lay...,The Ultimate Lay Up Drill with 5 Essential Lay...,https://www.breakthroughbasketball.com/trainin...,26,
2,73 Basketball Drills for Players and Coaches,73 Basketball Drills for Players and Coaches,https://www.basketballforcoaches.com/wp-conten...,20,
3,The Pure Guide to Scoring Inside & Layups in B...,The Pure Guide to Scoring Inside & Layups in B...,https://www.hooperboost.com/wp-content/uploads...,32,
4,"22 Simple, Fun & Effective Basketball Drills f...","22 Simple, Fun & Effective Basketball Drills f...",https://www.online-basketball-drills.com/wp-co...,24,


In [143]:
# Model family name interpolated into both embedding model names below.
MODEL_NAME = "curie"

# Model names handed to the embeddings API: one for documents, one for queries.
DOC_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-doc-001"
QUERY_EMBEDDINGS_MODEL = f"text-search-{MODEL_NAME}-query-001"

In [144]:
def get_embedding(text: str, model: str):
    """
    Request an embedding for ``text`` from ``model`` via ``openai.Embedding.create``.

    Returns the "embedding" field of the first entry in the response's "data" list.
    """
    api_response = openai.Embedding.create(input=text, model=model)
    first_entry = api_response["data"][0]
    return first_entry["embedding"]

def get_doc_embedding(text: str):
    """Embed ``text`` with the document embeddings model (DOC_EMBEDDINGS_MODEL)."""
    return get_embedding(text, model=DOC_EMBEDDINGS_MODEL)

def get_query_embedding(text: str):
    """Embed ``text`` with the query embeddings model (QUERY_EMBEDDINGS_MODEL)."""
    return get_embedding(text, model=QUERY_EMBEDDINGS_MODEL)

def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed every row of the dataframe with the document embeddings model.

    The text embedded for a row is its Title and Snippet joined by one space.

    Return a dictionary keyed by row index whose values are the embedding vectors.
    """
    embeddings_by_row = {}
    for row_index, row in df.iterrows():
        row_text = row.Title + " " + row.Snippet
        embeddings_by_row[row_index] = get_doc_embedding(row_text)
    return embeddings_by_row

In [145]:
# Show the whole de-duplicated search-result table.
df


Unnamed: 0,Title,Snippet,Link,Tokens,Date
0,Coach Dave Taylor,Coach Dave Taylor,https://images.squarespace-cdn.com/content/v1/...,6,
1,The Ultimate Lay Up Drill with 5 Essential Lay...,The Ultimate Lay Up Drill with 5 Essential Lay...,https://www.breakthroughbasketball.com/trainin...,26,
2,73 Basketball Drills for Players and Coaches,73 Basketball Drills for Players and Coaches,https://www.basketballforcoaches.com/wp-conten...,20,
3,The Pure Guide to Scoring Inside & Layups in B...,The Pure Guide to Scoring Inside & Layups in B...,https://www.hooperboost.com/wp-content/uploads...,32,
4,"22 Simple, Fun & Effective Basketball Drills f...","22 Simple, Fun & Effective Basketball Drills f...",https://www.online-basketball-drills.com/wp-co...,24,
5,Layup - Wikipedia,Layup - Wikipedia,https://upload.wikimedia.org/wikipedia/commons...,8,
6,What to Expect at Basketball Tryouts | PRO TIP...,What to Expect at Basketball Tryouts | PRO TIP...,https://dsgmedia.blob.core.windows.net/pub/201...,32,
7,basketball drills Archives - Teach Hoops,basketball drills Archives - Teach Hoops,https://teachhoops.com/wp-content/uploads/2022...,14,


In [146]:
# Embed every row of df; this makes one embeddings API request per row.
context_embeddings = compute_doc_embeddings(df)    

In [147]:
# Display the row index -> embedding mapping.
# NOTE(review): this dumps every vector in full; consider clearing this output before sharing.
context_embeddings

{0: [-0.001264234771952033,
  -0.008197895251214504,
  -0.002629428869113326,
  -0.0030467272736132145,
  -0.01611759141087532,
  0.019922634586691856,
  -0.014484293758869171,
  -0.025253284722566605,
  0.009692092426121235,
  0.007426117081195116,
  -0.0032665941398590803,
  0.002817886183038354,
  -0.014986846596002579,
  0.03198390454053879,
  0.012761254794895649,
  -0.011576665565371513,
  -0.028394240885972977,
  -0.025558406487107277,
  0.0011711277766153216,
  0.005406931042671204,
  -0.0047204080037772655,
  -0.003769146976992488,
  -0.011370260268449783,
  0.04102985933423042,
  0.006793438922613859,
  0.012931764125823975,
  0.011801020242273808,
  -0.04652204364538193,
  -0.0022491486743092537,
  0.016027849167585373,
  -0.007888286374509335,
  -0.01150487270206213,
  0.014977872371673584,
  0.06558316200971603,
  0.026904530823230743,
  0.03343771770596504,
  0.01067924965173006,
  0.011935632675886154,
  0.0010017405729740858,
  -0.07380349189043045,
  -0.024373818188905

In [148]:
# Peek at the first (key, vector) pair: print the key, the first five values
# of the vector, and the vector's length.
example_entry = list(context_embeddings.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

0 : [-0.001264234771952033, -0.008197895251214504, -0.002629428869113326, -0.0030467272736132145, -0.01611759141087532]... (4096 entries)


In [149]:
# Re-key the embeddings by a (title, snippet, link) tuple instead of the row index
document_embeddings = {
    (row['Title'], row['Snippet'], row['Link']): context_embeddings[key]
    for key, row in df.iterrows()
}
    

In [150]:
def vector_similarity(x: List[float], y: List[float]):
    """
    Score how similar two vectors are using their dot product.

    Cosine similarity would be an alternative; in practice the choice has been
    found to make little difference.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Embed the query and score it against each pre-computed document embedding
    in ``contexts`` (a mapping of document index -> embedding).

    Return a list of (similarity, document index) tuples sorted in descending order,
    so the most relevant sections come first.
    """
    query_vector = get_query_embedding(query)

    scored_sections = []
    for section_index, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_index))

    scored_sections.sort(reverse=True)
    return scored_sections

In [157]:
# Rank the result rows against the search query; each element is a
# (similarity, row index) tuple, highest similarity first.
images = order_document_sections_by_query_similarity(search_query, context_embeddings)


In [163]:
# Our final images, printed in ranked order.
# The second element of each tuple is a row *label* taken from df.iterrows(),
# so the row is looked up by label (.loc) rather than by position (.iloc);
# the two only coincide while df carries a freshly reset 0..n-1 index.
for image in images:
    print(df.loc[image[1], 'Link'])

https://www.breakthroughbasketball.com/training/graphics/ultimate-lay-up.jpg
https://www.hooperboost.com/wp-content/uploads/2021/01/LAYUPtypes.jpg
https://www.online-basketball-drills.com/wp-content/uploads/2019/07/bunnies2.jpg
https://www.basketballforcoaches.com/wp-content/uploads/2020/06/73-basketball-drills.jpg
https://teachhoops.com/wp-content/uploads/2022/09/shutterstock_2060154047.jpg
https://dsgmedia.blob.core.windows.net/pub/2017/09/WhatToExpectBasketballTryouts1.jpg
https://upload.wikimedia.org/wikipedia/commons/thumb/8/8c/20130103_Trey_Burke_Layup_%282%29.JPG/1200px-20130103_Trey_Burke_Layup_%282%29.JPG
https://images.squarespace-cdn.com/content/v1/51865a34e4b0046126d95f2c/1414776785084-JI5R5HC6PCARSEKPZWC3/image-asset.png
