In [1]:
import requests
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import trange, tqdm
import openai

from datetime import datetime, timedelta

In [2]:
def get_urls(date, timeout=30):
    """Collect article links from the TechCrunch archive page of a single day.

    Args:
        date: Object exposing ``strftime`` (e.g. ``datetime``); it is formatted
            as ``YYYY/MM/DD`` and appended to the site root to build the URL.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The ``href`` of every ``<a class="loop-card__title-link">`` element on
        the page, in document order.

    Raises:
        requests.exceptions.RequestException: on network failure or timeout.
    """
    url = 'https://techcrunch.com/' + date.strftime('%Y/%m/%d')
    # Without a timeout a stalled connection would block the notebook forever.
    content = requests.get(url, timeout=timeout).text
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between environments.
    soup = BeautifulSoup(content, 'html.parser')
    return [
        a['href']
        for a in soup.find_all('a', {'class': 'loop-card__title-link'})
    ]

In [3]:
# Gather the links for today and the six preceding days into one flat list.
urls = [
    link
    for days_back in trange(7)
    for link in get_urls(datetime.now() - timedelta(days=days_back))
]
len(urls)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:13<00:00,  1.91s/it]


102

In [4]:
def get_article(url, timeout=30):
    """Download one article and return the text of its top-level paragraphs.

    Args:
        url: Address of the article page.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        List with the text of each ``<p>`` that is a direct child of the first
        ``<div class="entry-content">`` on the page.

    Raises:
        IndexError: if the page has no ``<div class="entry-content">``.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    # Without a timeout a stalled connection would block the notebook forever.
    content = requests.get(url, timeout=timeout).text
    # Explicit parser keeps parsing identical across environments.
    soup = BeautifulSoup(content, 'html.parser')
    article = soup.find('div', {'class': 'entry-content'})
    if article is None:
        # Same exception type as indexing an empty result list, but the
        # message now says which page was the problem.
        raise IndexError(f"no <div class='entry-content'> found at {url}")
    # recursive=False: only direct children, so paragraphs nested inside
    # other elements of the container are skipped.
    return [p.text for p in article.find_all('p', recursive=False)]

In [5]:
# One row per scraped URL; the 'article' cell holds the list of paragraph
# strings returned by get_article for that page.
article_texts = [get_article(url) for url in tqdm(urls)]
articles = pd.DataFrame({'url': urls, 'article': article_texts})

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 102/102 [01:10<00:00,  1.44it/s]


In [6]:
# Keep only paragraphs with strictly more than this many whitespace-separated
# words.
min_words = 10

# One row per paragraph; each row keeps the index label and url of the article
# it came from, so index labels repeat.
paragraphs = (
    articles.explode('article')
    .rename(columns={'article': 'paragraph'})
)
# .str.len() yields NaN for missing values (an article with an empty paragraph
# list explodes to a single NaN row); NaN > min_words is False, so such rows
# are dropped instead of raising inside len().
word_counts = paragraphs['paragraph'].str.split().str.len()
# .copy() detaches the filtered frame so that adding columns to it later does
# not trigger SettingWithCopyWarning.
paragraphs = paragraphs[word_counts > min_words].copy()
paragraphs

Unnamed: 0,url,paragraph
0,https://techcrunch.com/2023/01/21/india-blocks...,The Indian government has ordered YouTube and ...
0,https://techcrunch.com/2023/01/21/india-blocks...,India’s Ministry of Information and Broadcasti...
0,https://techcrunch.com/2023/01/21/india-blocks...,The ministry issued the directions under the I...
0,https://techcrunch.com/2023/01/21/india-blocks...,Gupta called the BBC documentary a “hateful pr...
0,https://techcrunch.com/2023/01/21/india-blocks...,BBC aired the first episode of the two-part do...
...,...,...
101,https://techcrunch.com/2023/01/15/environmenta...,"Historically, environmental health and safety ..."
101,https://techcrunch.com/2023/01/15/environmenta...,EHS software acts as a data management system ...
101,https://techcrunch.com/2023/01/15/environmenta...,Verdantix’s Green Quadrant: EHS Software 2023 ...
101,https://techcrunch.com/2023/01/15/environmenta...,"“Over the past two years, the market landscape..."


In [7]:
# The OpenAI key is read from a local file named 'api_key'; surrounding
# whitespace (such as a trailing newline) is stripped.
with open('api_key', 'r') as key_file:
    raw_key = key_file.read()
openai.api_key = raw_key.strip()

In [8]:
def get_embedding(texts, model='text-embedding-ada-002'):
    """Embed a batch of texts with the OpenAI embeddings endpoint.

    Args:
        texts: Iterable of strings (a list or a pandas Series both work).
        model: Name of the embedding model to request.

    Returns:
        List of embedding vectors, one per input text, in input order.
        An empty input returns an empty list without calling the API.
    """
    # Newlines are replaced by spaces before the texts are sent.
    texts = [text.replace('\n', ' ') for text in texts]
    if not texts:
        # Nothing to embed; avoid sending a request with an empty input list.
        return []
    response = openai.Embedding.create(input=texts, model=model)
    # Each item carries the position of its input in 'index'; sort on it so
    # the output order does not depend on the order of the response payload.
    ordered = sorted(response['data'], key=lambda item: item['index'])
    return [item['embedding'] for item in ordered]

In [9]:
# Embed the paragraphs in fixed-size chunks, one API call per chunk, and
# collect the vectors in paragraph order.
batch_size = 100
embeddings = []

for start in trange(0, len(paragraphs), batch_size):
    batch = paragraphs['paragraph'].iloc[start:start + batch_size]
    embeddings.extend(get_embedding(batch))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:31<00:00,  2.61s/it]


In [10]:
# Attach the vectors as a new column. assign() builds a new frame instead of
# writing into one that was obtained by filtering, which avoids pandas'
# SettingWithCopyWarning. A length mismatch between the list and the frame
# still raises ValueError.
paragraphs = paragraphs.assign(embedding=embeddings)

In [11]:
# The question to ask; replace the placeholder before running the cells below.
query = 'TODO'  # keep in mind we scraped only a sample of articles from the last week
# get_embedding works on batches, so wrap the query in a list and unwrap the
# single result.
query_vectors = get_embedding([query])
query_embedding = query_vectors[0]

In [None]:
# Cosine similarity between the query and every paragraph, computed in one
# vectorised pass instead of a Python lambda per row (which also recomputed
# the query norm for every paragraph).
embedding_matrix = np.array(paragraphs['embedding'].tolist(), dtype=float)
query_vector = np.asarray(query_embedding, dtype=float)

similarities = (embedding_matrix @ query_vector) / (
    np.linalg.norm(embedding_matrix, axis=1) * np.linalg.norm(query_vector)
)

# argmax gives a position, not an index label, hence the .iloc lookup below.
best_idx = int(similarities.argmax())

best_paragraph = paragraphs.iloc[best_idx]['paragraph']

In [None]:
# Build the prompt: the best-matching paragraph as context, then the question,
# then the style instruction on its own line. The newline after the query is
# needed so the question does not run straight into the instruction.
prompt = (
    "Here's a piece of text:\n" +
    best_paragraph + '\n\n' +
    'I have a question about this text: ' + query + '\n' +
    'Please answer in a concise manner'
)

print(prompt)