In [3]:
import pdfplumber
import re

In [3]:
# Extract the text of every page of a PDF with pdfplumber.
def extract_text_from_pdf(pdf_path, page_separator=''):
    """Return the text of all pages of the PDF at ``pdf_path`` as one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.
        page_separator: String inserted between consecutive pages. Defaults to
            ``''`` so the result matches plain page-by-page concatenation.

    Returns:
        The page texts joined by ``page_separator``. Pages for which
        ``extract_text()`` yields nothing contribute an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None for a page without extractable text
        # (e.g. a scanned image); `or ''` keeps the join from raising TypeError.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    return page_separator.join(page_texts)

text = extract_text_from_pdf('Startup Playbook.pdf')

In [4]:
# Read 'output.txt' into `text`, replacing the value extracted from the PDF above.
# NOTE(review): nothing in the visible notebook writes 'output.txt' — confirm where
# it comes from, otherwise this cell fails on a fresh Restart & Run All.
with open('output.txt', 'r') as file:
    text = file.read()

In [5]:
import re

# Break the text into sentences: cut at any whitespace run that follows '.', '?' or '!'.
# The lookbehind leaves the punctuation attached to the sentence it terminates.
sentence_boundary = re.compile(r'(?<=[.?!])\s+')
single_sentences_list = sentence_boundary.split(text)
print(f"{len(single_sentences_list)} senteneces were found")

455 senteneces were found


In [6]:
# One dict per sentence: the raw text plus its position in the document.
sentences = [dict(sentence=text_piece, index=position) for position, text_piece in enumerate(single_sentences_list)]

In [7]:
def combine_sentences(sentences, buffer_size=1):
    """Add a 'combined_sentence' entry to every sentence dict.

    The combined text is the sentence itself surrounded by up to
    ``buffer_size`` neighbours on each side (fewer at the list edges),
    joined with single spaces.

    Args:
        sentences: List of dicts, each with a 'sentence' string. The dicts are
            updated in place.
        buffer_size: Number of neighbouring sentences to include before and
            after each sentence.

    Returns:
        The same ``sentences`` list that was passed in.
    """
    total = len(sentences)
    for pos, entry in enumerate(sentences):
        # Neighbours before the current sentence, skipping positions left of the start.
        before = [sentences[k]['sentence'] for k in range(pos - buffer_size, pos) if k >= 0]
        # Neighbours after the current sentence, skipping positions past the end.
        after = [sentences[k]['sentence'] for k in range(pos + 1, pos + 1 + buffer_size) if k < total]
        entry['combined_sentence'] = ' '.join(before + [entry['sentence']] + after)
    return sentences

sentences = combine_sentences(sentences)

In [8]:
sentences[:3]

[{'sentence': 'Startup Playbook\nWritten by Sam Altman · Illustrated by Gregory Koberger · Spanish translation (https://platzi.com/startup-\nplaybook/)\nWe spend a lot of time advising startups.',
  'index': 0,
  'combined_sentence': 'Startup Playbook\nWritten by Sam Altman · Illustrated by Gregory Koberger · Spanish translation (https://platzi.com/startup-\nplaybook/)\nWe spend a lot of time advising startups. Though one-on-one advice will always be crucial,\nwe thought it might help us scale Y Combinator if we could distill the most generalizable parts\nof this advice into a sort of playbook we could give YC and YC Fellowship companies.'},
 {'sentence': 'Though one-on-one advice will always be crucial,\nwe thought it might help us scale Y Combinator if we could distill the most generalizable parts\nof this advice into a sort of playbook we could give YC and YC Fellowship companies.',
  'index': 1,
  'combined_sentence': 'Startup Playbook\nWritten by Sam Altman · Illustrated by Gregor

In [4]:
from dotenv import load_dotenv
import os
import voyageai

# Load the environment variables from .env file
load_dotenv()

# Read the Voyage key and hand it to the client explicitly, so the variable is
# actually used and the dependency on the key is visible in this cell.
# If the variable is unset (None) the client falls back to its own key lookup.
VOYAGE_API_KEY = os.getenv('VOYAGE_API_KEY')
vo = voyageai.Client(api_key=VOYAGE_API_KEY)

In [55]:
result = vo.count_tokens([x['combined_sentence'] for x in sentences])
result

32849

In [11]:
embeds = vo.embed([x['combined_sentence'] for x in sentences[:3]], model="voyage-large-2", input_type="document")

In [12]:
embeds.embeddings

[[-0.01960401050746441,
  0.00952412560582161,
  0.029937220737338066,
  0.028779219835996628,
  0.020263994112610817,
  0.018185047432780266,
  -0.002794843865558505,
  -0.011948895640671253,
  -0.03194594755768776,
  0.011232119053602219,
  -0.0039060316048562527,
  0.019316615536808968,
  -0.021202653646469116,
  -0.039265573024749756,
  0.006924385204911232,
  -0.034959301352500916,
  0.015047195367515087,
  -0.018160764127969742,
  -0.04380667954683304,
  0.042163725942373276,
  0.02861500345170498,
  0.0728130042552948,
  -0.03810025006532669,
  -0.021707763895392418,
  0.018393084406852722,
  -0.006779320538043976,
  -0.001975221326574683,
  -0.03141256421804428,
  0.02604994736611843,
  0.036515459418296814,
  -0.0014987196773290634,
  -0.02858075499534607,
  -0.003778189653530717,
  -0.050665102899074554,
  -0.003182005835697055,
  0.014508194290101528,
  -0.015082728117704391,
  0.01819777302443981,
  0.043141599744558334,
  -0.021107787266373634,
  0.0417252853512764,
  0.00

In [13]:
# Embed every combined sentence, sending the texts to Voyage in fixed-size batches.
EMBED_BATCH_SIZE = 128  # texts per vo.embed request
combined_embeddings = []
for batch_start in range(0, len(sentences), EMBED_BATCH_SIZE):
    batch_texts = [x['combined_sentence'] for x in sentences[batch_start:batch_start + EMBED_BATCH_SIZE]]
    combined_embeddings += vo.embed(batch_texts, model="voyage-large-2", input_type="document").embeddings

# Embeddings are matched to sentences by position, so the counts must agree.
assert len(combined_embeddings) == len(sentences)

In [14]:
len(combined_embeddings)

455

In [15]:
# Attach each embedding to its sentence dict by position: combined_embeddings[i]
# is stored under 'combined_sentence_embedding' on sentences[i] (mutates the dicts in place).
for i, sentence in enumerate(sentences):
    sentence['combined_sentence_embedding'] = combined_embeddings[i]

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
def calculate_cosine_distances(sentences):
    """Compute the cosine distance between each sentence and the one after it.

    Args:
        sentences: List of dicts, each holding an embedding under
            'combined_sentence_embedding'. Every dict except the last gets a
            'distance_to_next' entry written in place.

    Returns:
        A tuple ``(distances, sentences)``: the list of ``1 - cosine similarity``
        values for consecutive pairs (one fewer than the number of sentences),
        and the same ``sentences`` list that was passed in.
    """
    distances = []
    # Walk consecutive pairs; the final sentence has no successor and is left untouched.
    for current, following in zip(sentences, sentences[1:]):
        vec_current = current['combined_sentence_embedding']
        vec_following = following['combined_sentence_embedding']

        # cosine_similarity returns a 1x1 matrix for two single-row inputs.
        gap = 1 - cosine_similarity([vec_current], [vec_following])[0][0]

        distances.append(gap)
        current['distance_to_next'] = gap

    return distances, sentences

In [17]:
distances, sentences = calculate_cosine_distances(sentences)

In [57]:
import numpy as np

# Percentile of the consecutive-sentence distances used as the cut-off.
# Distances strictly above the resulting value are treated as chunk breakpoints
# by the chunking cell, so a higher percentile produces fewer, larger chunks.
breakpoint_percentile_threshold = 80
breakpoint_distance_threshold = np.percentile(distances, breakpoint_percentile_threshold)

In [58]:
# Split the sentence list into chunks, cutting after every sentence whose
# distance to the next sentence is above the threshold.
indices_above_thresh = [i for i, x in enumerate(distances) if x > breakpoint_distance_threshold]  # positions of the breakpoints
# Count derived from the list above instead of filtering `distances` a second time.
# (Variable name, including its spelling, is kept so existing references still resolve.)
num_distances_above_theshold = len(indices_above_thresh)

# Position of the first sentence of the chunk currently being built
start_index = 0

# Text of each chunk, in document order
chunks = []

# The loop variable is deliberately not called `index`: the notebook binds
# `index` to the Pinecone index handle, and re-running this cell must not
# replace that handle with an integer.
for breakpoint_index in indices_above_thresh:
    # A chunk runs from start_index up to and including the breakpoint sentence.
    group = sentences[start_index:breakpoint_index + 1]
    chunks.append(' '.join([d['sentence'] for d in group]))

    # The next chunk begins right after the breakpoint.
    start_index = breakpoint_index + 1

# Sentences after the last breakpoint (if any) form the final chunk.
if start_index < len(sentences):
    chunks.append(' '.join([d['sentence'] for d in sentences[start_index:]]))

# `chunks` now contains the chunked sentence text

In [59]:
print(len(chunks))

92


In [61]:
print(chunks[0])

Startup Playbook
Written by Sam Altman · Illustrated by Gregory Koberger · Spanish translation (https://platzi.com/startup-
playbook/)
We spend a lot of time advising startups. Though one-on-one advice will always be crucial,
we thought it might help us scale Y Combinator if we could distill the most generalizable parts
of this advice into a sort of playbook we could give YC and YC Fellowship companies. Then we thought we should just give it to everyone. This is meant for people new to the world of startups. Most of this will not be new to people
who have read a lot of what YC partners have written—the goal is to get it into one place. There may be a part II on how to scale a startup later—this mostly covers how to start one. Part I: The Idea Part II: A Great Team Part III: A Great Product
Part IV: Great Execution Closing Thought
Your goal as a startup is to make something users love. If you do that, then you have to
figure out how to get a lot more users. But this first part is critic

In [6]:
from pinecone import Pinecone

# Read the Pinecone key from the environment and open a handle to the
# "rag-chat" index; `index` is the handle the upsert cell writes to.
# NOTE(review): depends on `os` and load_dotenv() from the earlier Voyage setup cell.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("rag-chat")

In [63]:
# Embed the chunks in batches rather than in one request, mirroring the
# 128-texts-per-call batching used for the sentence embeddings. A single call
# only works while the number of chunks stays small.
# NOTE(review): 128 is taken from the sentence-embedding cell — confirm against
# the per-request limits of the Voyage model in use.
CHUNK_EMBED_BATCH_SIZE = 128
chunk_embeddings = []
for batch_start in range(0, len(chunks), CHUNK_EMBED_BATCH_SIZE):
    chunk_batch = chunks[batch_start:batch_start + CHUNK_EMBED_BATCH_SIZE]
    chunk_embeddings += vo.embed(chunk_batch, model="voyage-large-2", input_type="document").embeddings

In [64]:
len(chunk_embeddings)

92

In [65]:
# Build one Pinecone record per chunk and upsert them all.
# Record ids are the document id followed by the chunk position ("default0", "default1", ...);
# the chunk text and the document id travel along as metadata.
id_name = "default"
vectors = [
    {
        "id": f"{id_name}{position}",
        "values": chunk_embeddings[position],
        "metadata": {"text": chunk_text, "pdf_id": id_name},
    }
    for position, chunk_text in enumerate(chunks)
]
index.upsert(vectors)

{'upserted_count': 92}

In [49]:
fullchunks = " ".join(chunks)

In [54]:
print(chunks[40])

Watch your cash flow obsessively. Although it sounds unbelievable, we’ve seen founders run
out of money without being aware it was happening a number of times (and read Paul
Graham’s essay (http://paulgraham.com/aord.html)). Most startups raise money at some point. You should raise money when you need it or when it’s available on good terms. Be careful
not to lose your sense of frugality or to start solving problems by throwing money at them. Not having enough money can be bad, but having too much money is almost always bad.


In [22]:
import dspy
import voyageai
import numpy as np
# Load the environment variables
load_dotenv()
from openai import OpenAI

# Initialize the Pinecone client and index
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)
PINECONE_INDEX = pc.Index("rag-chat")  # handle queried by PineconeRM.forward

# OpenAI and dspy setup
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
gpt4 = dspy.OpenAI(model="gpt-4", api_key=OPENAI_API_KEY)
# NOTE(review): gpt4_turbo is not referenced anywhere else in the visible code.
gpt4_turbo = dspy.OpenAI(model="gpt-4-turbo", api_key=OPENAI_API_KEY)
dspy.configure(lm=gpt4)  # gpt-4 is the LM dspy is configured with for the predictors below
# NOTE(review): `client` is not used in the visible code — confirm it is needed.
client = OpenAI()

# DSPy signature for the query-rewriting step: one input (`query`), one output (`new_query`).
# NOTE(review): DSPy signatures use the class docstring and the field `desc` strings as
# prompt text for the LM, so treat edits to them as prompt changes — confirm against the
# installed dspy version.
class QueryReformatter(dspy.Signature):
    """Given a query, make it more detailed by asking implied subquestions for a vector search.
    If there isn't a clear way to make the query more detailed, return the original query."""

    query = dspy.InputField()
    new_query = dspy.OutputField(desc="The more detailed version of the query, assuming it is known the information is contained in the writing. ONLY GIVE THE QUERY, no additional text.")

class PineconeRM(dspy.Retrieve):
    """
    DSPy retriever backed by the module-level Pinecone index.

    Each call rewrites the query with the QueryReformatter signature, embeds the
    rewritten query with Voyage, and returns the text of the top-k matching
    passages from Pinecone.

    Args:
        id: Optional document id. When non-empty, results are restricted to
            vectors whose "pdf_id" metadata equals this value.
        k: Number of passages to retrieve.
    """
    def __init__(self, id:str = "", k:int = 3):
        super().__init__(k=k)
        self.id = id
        # Build the rewriting predictor once instead of on every forward() call.
        self.reformat_query = dspy.Predict(QueryReformatter)

    def forward(self, query:str) -> dspy.Prediction:
        """Retrieve passages for ``query``.

        Returns:
            A ``dspy.Prediction`` whose ``passages`` attribute is the list of
            "text" metadata strings of the Pinecone matches.
        """
        # NOTE(review): this resets the globally configured LM to gpt4 on every call.
        dspy.configure(lm=gpt4)
        # Fall back to the caller's query if the rewrite comes back empty,
        # so an empty string is never sent to the embedding service.
        query_redone = self.reformat_query(query=query).new_query or query
        # Pass a one-element list: embeddings[0] below is then the vector for that text.
        voyage_call = vo.embed([query_redone], model="voyage-large-2", input_type="query")
        query_vector = voyage_call.embeddings[0]

        # Single query call; the metadata filter is added only when an id was given.
        query_kwargs = {
            "vector": query_vector,
            "top_k": self.k,
            "include_metadata": True,
        }
        if self.id:
            query_kwargs["filter"] = {"pdf_id": self.id}
        result = PINECONE_INDEX.query(**query_kwargs)

        text_strings = [match["metadata"]["text"] for match in result["matches"]]
        return dspy.Prediction(
            passages=text_strings
        )

# DSPy signature for the answering step: inputs are the retrieved context, the question and
# a `filter` string (RAG.forward passes its `voice` argument here); output is the answer.
# NOTE(review): the docstring and `desc` strings below are prompt text for the LM in DSPy,
# so they are left exactly as written — confirm before rewording.
class GenerateAnswer(dspy.Signature):
    """Answer questions with as ground-truth information as possible, with the added filter. 
    If no filter is provided, just give your response as usual."""

    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    filter = dspy.InputField(desc="the filter through which the answer should be generated.")
    answer = dspy.OutputField(desc="complete, detailed answer to the question in max 3 sentences. just the answer, no additional text.")

class RAG(dspy.Module):
    """Retrieval-augmented generation pipeline: fetch passages with PineconeRM,
    then answer the question with the GenerateAnswer signature.

    Args:
        num_passages: Number of passages the retriever returns (its ``k``).
        id: Optional document id forwarded to PineconeRM to restrict retrieval.
    """
    def __init__(self, num_passages=2, id:str = ""):
        super().__init__()

        self.retrieve = PineconeRM(id=id, k=num_passages)
        self.generate_answer = dspy.Predict(GenerateAnswer)
    
    def forward(self, question, voice=""):
        # Returns a Prediction carrying both the retrieved passages (`context`)
        # and the generated `answer`; `voice` is passed to the signature's `filter` input.
        context = self.retrieve(question).passages
        prediction = self.generate_answer(context=context, question=question, filter=voice)
        return dspy.Prediction(context=context, answer=prediction.answer)

def rag_qa(id, query, voice):
    """
    Answer ``query`` with a freshly built RAG pipeline.

    Args:
        id: Document id passed to RAG to restrict retrieval.
        query: The question to answer.
        voice: Style/filter string forwarded to the pipeline call.

    Returns:
        Whatever the RAG pipeline call returns for this question.
    """
    pipeline = RAG(id=id)
    return pipeline(question=query, voice=voice)

In [24]:
rag_qa("default","how does sam altman feel about fundraising?","")

Prediction(
    context=['Investors that only invest a small amount usually\ndon’t do anything for you (i.e., beware party rounds). Great board members are one of the best outside forcing functions for a company other than\nusers, and outside forcing functions are worth more than most founders think. Be willing to\naccept a lower valuation to get a great board member who is willing to be very involved. I think this essay by Paul Graham (http://paulgraham.com/fr.html) is the best thing out there\non fundraising. Remember that at least a thousand people have every great idea.', 'The “really successful” part is important\n—because investors’ returns are dominated by the big successes, if an investor believes you\nhave a 100% chance of creating a $10 million company but almost no chance of building a\nlarger company, he/she will still probably not invest even at a very low valuation. Always\nexplain why you could be a huge success. Investors are driven by the dual fears of missing the next