In [1]:
import json
from transcribe.config import OPENAI_API_KEY
from transcribe.db import init_db
import transcribe.db.embedding as db_embedding
import transcribe.db.transcription as db_transcription
from typing import Optional
from gpt_index import GPTSimpleVectorIndex, GPTListIndex, Document, GPTPineconeIndex
import openai
import os
from yaspin import yaspin

In [2]:
import pinecone
# Read the Pinecone credentials from the environment rather than embedding
# them in the notebook, where they are committed and shared with every copy.
# NOTE: the key that was previously hardcoded here should be treated as
# compromised and rotated.
# PINECONE_API_KEY is required (a missing variable raises KeyError);
# PINECONE_ENVIRONMENT is optional and defaults to the region this notebook
# was originally written against.
api_key = os.environ["PINECONE_API_KEY"]
pinecone.init(
    api_key=api_key,
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-east1-gcp"),
)
pindex = pinecone.Index("quickstart")

  from tqdm.autonotebook import tqdm


In [3]:
def get_ycombinator_videos(path: str = 'ycombinator_videos.json'):
    """Load the video catalogue from a JSON file.

    Args:
        path: Location of the JSON file. Defaults to
            'ycombinator_videos.json', resolved against the current
            working directory.

    Returns:
        The decoded JSON document, returned as-is.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode explicitly as UTF-8: without an encoding, open() falls back to
    # the locale's codec, which can mis-decode or reject non-ASCII titles on
    # platforms whose default is not UTF-8.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

In [4]:
def get_link_from_id(id: str) -> str:
    """Build the YouTube watch URL for the given video id."""
    return 'https://www.youtube.com/watch?v={}'.format(id)

In [5]:
def get_index_for_video(id: str, db) -> Optional[GPTSimpleVectorIndex]:
    """Rebuild the stored vector index for one video.

    The video's link is derived from ``id``; the stored embedding record and
    the summary are both looked up by that link.

    Args:
        id: Video id, turned into a link via ``get_link_from_id``.
        db: Database handle passed through to the ``db_embedding`` and
            ``db_transcription`` lookups.

    Returns:
        The index loaded from the record's 'embedding_json' value, with its
        doc id set to the link and its text set to the summary. ``None``
        (after printing a message) when either the embedding record or the
        summary is missing.
    """
    video_url = get_link_from_id(id)

    stored = db_embedding.get_embeddings_for_link(db, video_url)
    if not stored:
        print("no embedding for link:", video_url)
        return None

    video_index = GPTSimpleVectorIndex.load_from_string(stored['embedding_json'])
    video_index.set_doc_id(video_url)

    video_summary = db_transcription.get_summary_for_link(db, video_url)
    if video_summary:
        video_index.set_text(video_summary)
        return video_index

    print("no summary for link:", video_url)
    return None

In [6]:
# Load the video list and print its first entry as a quick sanity check.
videos = get_ycombinator_videos()
print(videos[0])

{'id': 'ycKU-ebeE24', 'title': 'The best way to have startup ideas is to just notice them organically.'}


In [7]:
# Expose the OpenAI key both on the openai module and through the
# OPENAI_API_KEY environment variable.
openai.api_key = OPENAI_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
# Database handle; get_pinecone_indexes reads this as a global.
db = init_db()

In [22]:
def get_pinecone_indexes(vids):
    """Build a GPTPineconeIndex from the stored transcriptions of ``vids``.

    For each entry the link is derived from its 'id' and the transcription
    record is fetched using the module-level ``db`` handle. Entries with no
    record, or with an empty 'result', are reported and skipped. Otherwise the
    'result' JSON is decoded and its 'transcription' text becomes a Document
    whose doc id is the link.

    Args:
        vids: Iterable of mappings that each carry an 'id' key.

    Returns:
        A GPTPineconeIndex built from the collected documents and backed by
        the module-level ``pindex``.
    """
    documents = []
    skipped = 0
    for entry in vids:
        link = get_link_from_id(entry['id'])
        record = db_transcription.get_transcription_by_link(db, link)
        if record and record['result']:
            payload = json.loads(record['result'])
            transcript = payload['transcription']
            documents.append(Document(transcript, doc_id=link))
        else:
            skipped += 1
            print(f"no result for link: {link}, skipped {skipped}")
    # Exactly one document is appended per processed video.
    done = len(documents)
    print(f"done {done} videos!")
    return GPTPineconeIndex(documents, pinecone_index=pindex)

In [23]:
# Build the index over every video that has a stored transcription, then save
# it to disk. The recorded run of this cell logged roughly 2.6M embedding
# tokens, so re-running it is expensive.
ndx = get_pinecone_indexes(videos)
ndx.save_to_disk("ycombinator_pinecone_index.json")

no result for link: https://www.youtube.com/watch?v=qh8sHetf-Nk, skipped 1
no result for link: https://www.youtube.com/watch?v=vqgnifnlLMI, skipped 2
no result for link: https://www.youtube.com/watch?v=K8tcouVhtI8, skipped 3
no result for link: https://www.youtube.com/watch?v=Octm_7llbGA, skipped 4
no result for link: https://www.youtube.com/watch?v=euZH0tVotPQ, skipped 5
no result for link: https://www.youtube.com/watch?v=5fmDKGV0TnQ, skipped 6
no result for link: https://www.youtube.com/watch?v=3xU050kMbHM, skipped 7
no result for link: https://www.youtube.com/watch?v=IYLVhk7yaaw, skipped 8
no result for link: https://www.youtube.com/watch?v=KWNNmPCF-Xs, skipped 9
no result for link: https://www.youtube.com/watch?v=tzsmJtKZ2No, skipped 10
no result for link: https://www.youtube.com/watch?v=sM2reZib2RY, skipped 11
no result for link: https://www.youtube.com/watch?v=jwXlo9gy_k4, skipped 12
no result for link: https://www.youtube.com/watch?v=VIWiEzO9KMM, skipped 13
no result for link: h

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 2615815 tokens


In [24]:
# Reload the saved index from disk, attached to the same Pinecone index handle.
# NOTE(review): ask() queries `ndx`, not this reloaded object — confirm which
# of the two is meant to be used.
from_disk_pinecone_ndx = GPTPineconeIndex.load_from_disk(
    "ycombinator_pinecone_index.json",
    pinecone_index=pindex,
) 

In [34]:
def ask(question):
    """Query the module-level ``ndx`` index and print the answer and sources.

    A "thinking..." spinner is shown while the query runs. The response is
    printed first, followed by its formatted sources.

    Args:
        question: The question passed to ``ndx.query``.
    """
    spinner = yaspin(text="thinking...")
    with spinner:
        answer = ndx.query(question)
    print(answer)
    sources = answer.get_formatted_sources()
    print(sources)

In [35]:
# Example query: choice of programming language.
ask("what programming language is the best to use for a startup?")

⠦ thinking... 

INFO:root:> [query] Total LLM token usage: 3109 tokens
INFO:root:> [query] Total embedding token usage: 12 tokens


              

The best programming language to use for a startup will depend on the specific needs of the startup. It is important to optimize everything you do for learning and to be passionate about what you are working on. Popular programming languages for startups include Python, JavaScript, Java, and C++. Ultimately, the best programming language to use for a startup will depend on the specific goals and requirements of the startup.
> Source (Doc id: None): doc_id: https://www.youtube.com/watch?v=ypLoGFaKdbU
text: We got to the bagel store and my phone ...


In [36]:
# Example query: where startup ideas come from.
ask("how do you get a startup idea?")

⠏ thinking... 

INFO:root:> [query] Total LLM token usage: 669 tokens
INFO:root:> [query] Total embedding token usage: 8 tokens


              
The best way to get a startup idea is to notice them organically. Look at the YC-TOP 100 companies and observe that at least 70% of them had their startup ideas organically, rather than by explicitly trying to think of a startup idea. To put yourself in a position to have organic startup ideas in the future, become an expert on something valuable, go work at a startup, and if you're a programmer, build things that you find interesting.
> Source (Doc id: None): doc_id: https://www.youtube.com/watch?v=ycKU-ebeE24
text:  Let's talk about how to come up with s...


In [37]:
# Example query: a broad definitional question.
ask("what is a startup?")

⠹ thinking... 

INFO:root:> [query] Total LLM token usage: 4791 tokens
INFO:root:> [query] Total embedding token usage: 5 tokens


              

A startup is a business venture that is typically in the early stages of development and growth. It is usually founded by entrepreneurs who are looking to develop a product or service that can be sold to customers. Startups often involve a high degree of risk and uncertainty, as they are typically funded by venture capital and require a great deal of effort to succeed. Starting a successful startup is a life-changing endeavor that requires dedication and hard work. It is not a game of tricks or shortcuts, but rather a process of creating something that users love and then telling them about it. It is an all-consuming endeavor that can take up years of your life, and even if you are successful, the problems you face will never get any easier. It is similar to having kids in that it is a button you press that changes your life irrevocably, and while it is honestly the best thing in the world, it is important to remember that there are a lot of things that are easier to do