In [1]:
import os
import sys
import openai
import dspy
import re
from werkzeug.utils import secure_filename
from dotenv import load_dotenv
from tqdm import tqdm
import cohere
import weaviate
import weaviate.classes.config as wvcc
from dspy.retrieve.weaviate_rm import WeaviateRM
from dotenv import load_dotenv
from DSPyevaluate import *
import matplotlib.pyplot as plt
import numpy as np
import json
import pymupdf

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pull settings from a local .env file into the process environment.
load_dotenv()

# AI71 credentials and endpoint; each is None when the variable is unset.
AI71_API_KEY = os.environ.get("AI71_API_KEY")
AI71_BASE_URL = os.environ.get("AI71_BASE_URL")

In [3]:
# Function to check allowed file extensions
def allowed_file(filename):
    """Tell whether `filename` carries an extension found in ALLOWED_EXTENSIONS.

    The extension is whatever follows the last dot, compared in lowercase.
    Names without any dot are rejected outright.

    NOTE(review): ALLOWED_EXTENSIONS is not defined in the visible part of
    this notebook; presumably it comes from another import -- confirm.
    """
    if '.' not in filename:
        return False
    extension = filename.rsplit('.', 1)[1]
    return extension.lower() in ALLOWED_EXTENSIONS

# Text-cleaning helpers used when processing PDF files
def replace_ligatures(text: str) -> str:
    """Expand typographic ligature characters into plain letter sequences.

    Args:
        text: Input text, e.g. as extracted from a PDF page.

    Returns:
        A copy of `text` in which every listed ligature has been replaced
        by its multi-letter equivalent; all other characters are untouched.
    """
    expansions = (
        ("ﬁ", "fi"), ("ﬂ", "fl"), ("ﬃ", "ffi"), ("ﬄ", "ffl"),
        ("ﬅ", "ft"), ("ﬆ", "st"), ("Ꜳ", "AA"), ("Æ", "AE"), ("ꜳ", "aa"),
    )
    cleaned = text
    for glyph, plain in expansions:
        if glyph in cleaned:
            cleaned = cleaned.replace(glyph, plain)
    return cleaned

def remove_footnotes(text: str) -> str:
    """Strip numeric footnote markers such as ``[12]`` or ``(3)`` from text.

    Any run of digits wrapped in square brackets or parentheses is removed,
    so a parenthesised number that is not a footnote is dropped as well.
    """
    marker = re.compile(r'\[\d+\]|\(\d+\)')
    return marker.sub('', text)

def data_cleaning(text: str) -> str:
    """Normalise raw text for downstream use.

    Steps, in order:
      1. Delete anything starting with ``http`` up to the next whitespace.
      2. Replace every character other than ASCII letters, digits, space,
         ``%`` and ``$`` with a single space.
      3. Lowercase the result.
    """
    url_pattern = re.compile(r'http\S+')
    disallowed_pattern = re.compile(r"[^a-zA-Z0-9 %$]")

    without_links = url_pattern.sub('', text)
    only_allowed = disallowed_pattern.sub(" ", without_links)
    return only_allowed.lower()

def process_pdf(filepath: str) -> str:
    """Extract and clean the text of a single PDF file.

    Each page's text has footnote markers removed; the combined text then has
    ligatures expanded and goes through `data_cleaning`.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        The cleaned text of all pages, concatenated in page order.

    Raises:
        FileNotFoundError: If `filepath` does not exist.
    """
    if not os.path.exists(filepath):
        raise FileNotFoundError(f"File not found: {filepath}")

    # The context manager releases the file handle even if extraction of a
    # page raises; previously the document was never closed.
    with pymupdf.open(filepath) as document:
        page_texts = [remove_footnotes(page.get_text()) for page in document]

    # Join once instead of growing a string page by page.
    text_data = replace_ligatures("".join(page_texts))
    return data_cleaning(text_data)

def evaluate_pitch_deck(content):
    """Run the EvaluatePitchDeck signature on pitch-deck text via dspy.Predict.

    Args:
        content: Pitch-deck text, supplied as the `pitchdeck_content` input.

    Returns:
        The `evaluation_results` field of the prediction.
        NOTE(review): the signature asks the model for JSON with scores and
        feedback; whether this arrives as a dict or as raw text is not
        established here -- confirm before indexing into it.
    """
    predictor = dspy.Predict(EvaluatePitchDeck, n=1)
    prediction = predictor(pitchdeck_content=content)
    return prediction.evaluation_results


In [35]:
# Name of the pitch-deck PDF to analyse (base name plus extension).
filename = (
    "Carta-Pitch-Deck-Business-Insider-February-2019"
    ".pdf"
)

In [36]:
# Absolute location of the deck inside the local uploads folder.
filepath = "".join(("C:\\Users\\amagd\\projects\\falcon\\uploads\\", filename))

In [37]:
# Display the assembled path so it can be checked by eye.
print(filepath)

C:\Users\amagd\projects\falcon\uploads\Carta-Pitch-Deck-Business-Insider-February-2019.pdf


In [38]:
# Extract and clean the text of the selected PDF (see process_pdf).
pdf_content = process_pdf(filepath)

In [39]:
# Show the cleaned deck text returned by process_pdf.
print(pdf_content)

carta investor pitch 1 the problem $20k $7 private public the solution build the central registry of asset ownership henry $100 shares mary $50 options bob $200 warrant bill $1000 debt sam $400 membership units how  create a simple way for issuers to transfer securities online issuer owner ownership we issue electronic shares  options  debt   and derivatives and track ownership  restrictions  and shareholder information we automate  their approval   and compliance including new issues   transfers  and settlement we track the cap table and all issuer liabilities we track the portfolio and all the owner s assets venture capital is our first network fractional ownership industries have strong  network effects e company vc vc vc stocks e e options e our second network is limited partners membership in funds is a larger and more  valuable network effect company vc vc vc stocks company warrants banks lp lp lp lp lp lp lp company e e e e membership units options sell software and financial pr

In [40]:
# Read the startup JSON document from disk and show the parsed result.
json_file_path = 'C:/Users/amagd/projects/falcon/startup.json'
with open(json_file_path) as file:
    startup_data = json.loads(file.read())

print(startup_data)

{'Info': {'questions': ['What is the Startup Name?', 'When was the startup Date Started?', 'What is the Registration Type?', 'What is the Registration Country?', 'What is the Contact Info?'], 'dataFields': {'Startup Name': 'string', 'Date Started': 'date', 'Registration Type': 'string', 'Registration Country': 'string', 'Contact Info': 'string'}}, 'Team': {'questions': ['How many team members do you have?', 'List the team members with the following details:', {'Name': "What is the team member's name?", 'Title': "What is the team member's title?", 'Availability Per Week': 'How many hours per week is the team member available?', 'Involved Since': 'Since when has the team member been involved?', 'Equity %': 'What percentage of equity does the team member hold?', 'Salary %': 'What percentage of salary does the team member receive?', 'Years of Experience': 'How many years of experience does the team member have?', 'Academic Degree': {'Undergraduate': 'Does the team member have an undergradu

In [41]:
# Credentials for Cohere and the Weaviate cluster, read from the environment.
# Each value is None when the corresponding variable is not set.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")

In [42]:
# Cohere client authenticated with the key read from the environment.
co = cohere.Client(COHERE_API_KEY)

In [43]:
# Open a connection to the Weaviate cluster. The Cohere key is forwarded as a
# request header alongside the Weaviate API-key credentials.
weaviate_client = weaviate.connect_to_wcs(
    cluster_url=WEAVIATE_URL,
    auth_credentials=weaviate.auth.AuthApiKey(WEAVIATE_API_KEY),
    headers={'X-Cohere-Api-Key': COHERE_API_KEY},
)

In [44]:
# Create the "senu" collection on the first run; later runs reuse it.
#
# The client is intentionally NOT closed here: the cells that follow (the
# retriever and the insert) use the same `weaviate_client`, and closing it
# right after creation would break them on a fresh run.
if not weaviate_client.collections.exists("senu"):
    collection = weaviate_client.collections.create(
        name="senu",
        vectorizer_config=wvcc.Configure.Vectorizer.text2vec_cohere(),
        # NOTE(review): "embed-multilingual-v3.0" looks like an embedding
        # model name rather than a generative one -- confirm the intended
        # generative model.
        generative_config=wvcc.Configure.Generative.cohere(
            model="embed-multilingual-v3.0"
        ),
        properties=[
            wvcc.Property(name="startup", data_type=wvcc.DataType.TEXT),
            wvcc.Property(name="text", data_type=wvcc.DataType.TEXT),
            # Embedding components are floats, so an integer array cannot
            # hold them; use a number (float) array instead.
            wvcc.Property(name="embedding", data_type=wvcc.DataType.NUMBER_ARRAY),
        ],
    )
    print("Weaviate class 'Senu' created.")
else:
    print("Weaviate class 'Senu' already exists.")

Weaviate class 'Senu' already exists.


In [45]:
# The collection keeps its passage text in the "text" property, so point the
# retriever at that key rather than relying on its "content" default.
weaviate_rm = WeaviateRM("senu", weaviate_client, weaviate_collection_text_key="text")

In [46]:
# Flatten the startup data to a single string and embed it with Cohere.
startup_str = str(startup_data)

embeds = co.embed(
    texts=[startup_str],
    model="multilingual-22-12",
).embeddings
print(embeds)

unknown field: parameter compress is not a valid field


[[0.36914062, 0.101989746, 0.010253906, 0.5576172, 0.18066406, 0.14208984, 0.20532227, -0.3154297, 0.13000488, 0.14428711, -0.24621582, -0.5991211, -0.21313477, -0.025253296, -0.7685547, 0.1463623, 0.25756836, -0.12866211, 0.020050049, 0.04159546, 0.2376709, 0.27416992, 0.37573242, 0.09631348, 0.20458984, -0.33984375, -0.29589844, 0.36816406, -0.59277344, 0.31811523, 0.15612793, 0.041503906, -0.30371094, 0.48486328, -0.74072266, 0.19995117, -0.22363281, 0.1003418, -0.32348633, 0.3720703, -0.16882324, 0.105041504, -0.16064453, -0.13305664, 0.43554688, -0.11895752, -0.040740967, 0.25317383, -0.05657959, 0.028335571, -0.2175293, -0.43041992, -0.0597229, -0.31298828, -0.5058594, 0.47021484, -0.06488037, 0.20239258, 0.07342529, -0.070373535, -0.07910156, -0.21569824, -0.14709473, 0.13378906, -0.12768555, 0.096191406, -0.2536621, 0.4428711, 0.10028076, 0.3190918, -0.15490723, -0.020904541, 0.6118164, -0.0048065186, 0.15075684, 0.060638428, 0.15270996, -0.05895996, 0.2541504, -0.22216797, 0.2

In [47]:
# Process embeddings for the text
for embedding in tqdm(embeds):
    # Create a Weaviate object with the text and embedding
    object_data = {
        "startup": filename,
        "text": startup_str,
        "embedding": json.dumps(embedding)
    }

try: 
    # Add the object to the Weaviate collection
    collection = weaviate_client.collections.get("Senu")
    collection.data.insert(object_data)
finally:
    weaviate_client.close

100%|██████████| 1/1 [00:00<00:00, 1000.31it/s]


UnexpectedStatusCodeError: Object was not added! Unexpected status code: 422, with response body: {'error': [{'message': "invalid object: invalid number array property 'embedding' on class 'Senu': not an integer array, but string"}]}.

In [48]:
# Configure DSPy
falcon_lm = dspy.OpenAI(model="tiiuae/falcon-180b-chat",max_tokens=2000, api_base=AI71_BASE_URL, api_key=AI71_API_KEY)
dspy.configure(lm=falcon_lm)

In [49]:
# DSPy signature describing the pitch-deck evaluation task.
# NOTE(review): DSPy signatures normally use the class docstring as the task
# instructions sent to the model, so treat the docstring below as prompt text,
# not documentation -- editing it changes runtime behavior.
# NOTE(review): the docstring first mentions "a list of feedback items" but the
# example JSON uses separate feedback_1..feedback_5 keys; confirm which shape
# downstream code expects.
class EvaluatePitchDeck(dspy.Signature):
    """
    Evaluate the provided pitch deck content across all sections: Team, Fundraising, Market, Business Model, Product, and Traction.
    The output is a dictionary with each section containing a score (1-10) and a list of feedback items.

I need you to provide feedback in the following JSON format. Each section should include a score and five feedback items. The startup name should also be included in the JSON output. Below is an example of the format I need:

{
    "startup_name": "Example Startup",
    "team": {
        "score": 8,
        "feedback_1": "The team has strong experience and a clear vision, but they need more diversity in skills.",
        "feedback_2": "The team's track record is impressive, but they lack experience in scaling businesses.",
        "feedback_3": "There is a strong leadership team, but more emphasis on technical skills is needed.",
        "feedback_4": "The team has a clear vision but needs better execution plans.",
        "feedback_5": "The team should work on improving communication strategies within the group."
    },
    "fundraising": {
        "score": 7,
        "feedback_1": "The startup has secured initial funding but needs to outline a clearer path for future rounds.",
        "feedback_2": "Funding sources are diversified, but there is a need for more detailed financial projections.",
        "feedback_3": "The pitch to investors is strong but needs better risk management strategies.",
        "feedback_4": "Current funding is sufficient for initial growth but not for scaling.",
        "feedback_5": "Consider exploring alternative funding options like grants or strategic partnerships."
    },
    "business_model": {
        "score": 9,
        "feedback_1": "The business model is solid with clear revenue streams and customer acquisition strategies.",
        "feedback_2": "There is a well-defined value proposition and revenue model.",
        "feedback_3": "The model is scalable and has potential for high margins.",
        "feedback_4": "Consider refining the pricing strategy to maximize revenue.",
        "feedback_5": "The business model is competitive but should anticipate market changes."
    },
    "market": {
        "score": 6,
        "feedback_1": "The market size is promising but the startup should provide more data on target demographics.",
        "feedback_2": "Market research needs to be more comprehensive to support growth projections.",
        "feedback_3": "Competitive analysis is lacking; include more details on market positioning.",
        "feedback_4": "There is potential in the market, but customer needs need to be better defined.",
        "feedback_5": "Market entry strategy is good but should address potential barriers to entry."
    },
    "product": {
        "score": 8,
        "feedback_1": "The product is well-developed and addresses key customer needs, though some additional features would be beneficial.",
        "feedback_2": "Product design is strong, but user experience could be improved.",
        "feedback_3": "Consider adding more functionality based on customer feedback.",
        "feedback_4": "The product has good potential but needs a more robust testing phase.",
        "feedback_5": "Ensure the product is adaptable to future market trends."
    },
    "traction": {
        "score": 5,
        "feedback_1": "There is some initial traction, but significant growth is needed to prove the business's potential.",
        "feedback_2": "Customer acquisition numbers are low; focus on scaling marketing efforts.",
        "feedback_3": "Early results are promising but need to be sustained over a longer period.",
        "feedback_4": "Traction metrics should include more detailed customer feedback.",
        "feedback_5": "Consider strategies to accelerate growth and increase user engagement."
    }
}

Please format your feedback in this exact JSON structure, including five feedback items and a score for each section. Ensure that the 'startup_name' field is included at the top level of the JSON object.

    """
    # Input: the pitch-deck text to evaluate.
    pitchdeck_content = dspy.InputField(desc="Content of the startup pitch deck.")
    # Output: the model's evaluation, requested in the JSON layout shown in the docstring.
    evaluation_results = dspy.OutputField(desc="A dictionary in the given Json format containing scores and feedback for all sections of the pitch deck.")

In [50]:
# Score the extracted deck text with a chain-of-thought predictor built on
# the EvaluatePitchDeck signature, then show the raw prediction.
evaluate = dspy.ChainOfThought(EvaluatePitchDeck, n=1)
response = evaluate(pitchdeck_content=pdf_content)
print(response)

InternalServerError: Error code: 500