In [1]:
import pickle
import nltk
import PIL
from PIL import Image
import torch
import torchvision.models as models
import torchvision.transforms as transforms
import concurrent.futures
import json
import os
import re
import warnings
from collections import defaultdict
from datetime import datetime
from functools import lru_cache
from typing import List, Tuple, Dict
from urllib.parse import urlparse, urljoin

import faiss
import numpy as np
import openai
import PyPDF2
import requests
import spacy
import tldextract
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from pptx import Presentation
from tenacity import retry, stop_after_attempt, wait_random_exponential
from tkinter import Tk, filedialog
from transformers import GPT2Tokenizer
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

from readppt import read_ppt
import urlscrape

# Load environment variables (e.g. OPENAI_API_KEY) via python-dotenv before they are read below.
load_dotenv()
# Suppress all Python warnings for the rest of the process.
warnings.filterwarnings("ignore")
# The key is read from the environment; os.getenv returns None when it is not set.
openai.api_key = os.getenv('OPENAI_API_KEY')

# Chat model name passed to openai.ChatCompletion.create in base_gptcall.
MODEL = "gpt-4"
# Default per-chunk token budget for split_text; call_gpt also compares it
# against the prompt's character length to decide whether to split.
CHUNK_SIZE=7000

# Persist the FAISS index together with its id -> path lookup tables
def save_index_and_paths(index, text_paths, image_paths, index_file, paths_file):
    """Write ``index`` to ``index_file`` and pickle both path tables to ``paths_file``.

    The two dictionaries are stored as a single ``(text_paths, image_paths)``
    tuple, which is the layout ``load_index_and_paths`` unpacks.
    """
    faiss.write_index(index, index_file)
    lookup_tables = (text_paths, image_paths)
    with open(paths_file, "wb") as out:
        pickle.dump(lookup_tables, out)

# Restore the FAISS index together with its id -> path lookup tables
def load_index_and_paths(index_file, paths_file):
    """Read back what ``save_index_and_paths`` wrote.

    Returns:
        ``(index, text_paths, image_paths)``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserialising, so ``paths_file`` must come from a trusted source.
    """
    restored_index = faiss.read_index(index_file)
    with open(paths_file, "rb") as src:
        tables = pickle.load(src)
    text_paths, image_paths = tables
    return restored_index, text_paths, image_paths

def clean_text(text):
    """Normalise scraped or extracted text before chunking.

    Steps, in order: collapse whitespace, strip ``http...`` URLs and
    ``<script>``/``<style>`` elements, collapse whitespace again, drop every
    character outside a small ASCII allow-list, remove a few boilerplate
    keywords (plus up to ten characters following each match), and squeeze
    the whitespace left behind into single spaces.
    """
    def squeeze(value):
        # Collapses any run of whitespace to one space and trims both ends.
        return " ".join(value.split())

    result = squeeze(text)

    # Patterns whose matches are deleted outright, applied in this order.
    markup_patterns = (
        (r'http\S+', 0),
        (r'<script.*?>.*?</script>', re.DOTALL),
        (r'<style.*?>.*?</style>', re.DOTALL),
    )
    for pattern, flags in markup_patterns:
        result = re.sub(pattern, '', result, flags=flags)

    result = squeeze(result)
    result = re.sub(r'\s+', ' ', result)
    for control_char in ("\n", "\r", "\t"):
        result = result.replace(control_char, " ")

    # Keep only letters, digits, common punctuation and whitespace.
    result = re.sub(r'[^a-zA-Z0-9.,!?/:;()%$@&\s]', '', result)
    # Case-insensitive boilerplate removal; each match also swallows up to
    # ten trailing characters.
    result = re.sub(r'(?i)(terms\s*and\s*conditions|privacy\s*policy|copyright|blog|legal|careers|cdn*).{0,10}', '', result)
    return re.sub(r'\s+', ' ', result)

def split_text(text: str, max_tokens=CHUNK_SIZE) -> List[str]:
    """Split ``text`` into sentence-aligned chunks of at most ``max_tokens`` tokens.

    Sentences come from ``nltk.sent_tokenize`` and are measured with the
    module-level GPT-2 ``tokenizer``. Sentences are packed greedily and joined
    with single spaces. A single sentence that is longer than ``max_tokens``
    cannot be split further and is emitted as its own (oversized) chunk.

    Args:
        text: The text to split.
        max_tokens: Token budget per chunk.

    Returns:
        The list of chunks; empty when ``text`` contains no sentences. No
        chunk is ever the empty string.
    """
    chunks = []
    current_chunk = []
    current_tokens = 0

    for sentence in nltk.sent_tokenize(text):
        # The GPT-2 tokenizer adds no [CLS]/[SEP]-style special tokens, so the
        # raw id count is the real token count (no "- 2" correction).
        sentence_token_count = len(
            tokenizer(sentence, add_special_tokens=False)["input_ids"]
        )

        # Flush only a non-empty chunk; otherwise an oversized first sentence
        # would emit a spurious "" chunk.
        if current_chunk and current_tokens + sentence_token_count > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_tokens = 0

        current_chunk.append(sentence)
        current_tokens += sentence_token_count

    # Add the last chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def preprocess_documents(root_folder: str) -> Tuple[List[Tuple[str, str]], List[Tuple[str, PIL.Image.Image]]]:
    """Walk ``root_folder`` and collect text chunks and images for indexing.

    PDF and PPTX files are read through ``analyze_input``, cleaned with
    ``clean_text`` and chunked with ``split_text``. JPG/JPEG/PNG files are
    loaded as PIL images. Extensions are matched case-insensitively; all
    other files are ignored.

    Args:
        root_folder: Directory to scan recursively.

    Returns:
        ``(text_documents, image_documents)`` where ``text_documents`` is a
        list of ``(file_path, chunk)`` pairs and ``image_documents`` is a
        list of ``(file_path, image)`` pairs.
    """
    text_extensions = ('pdf', 'pptx')
    image_extensions = ('jpg', 'png', 'jpeg')

    text_documents = []
    image_documents = []

    for subdir, _dirs, files in os.walk(root_folder):
        for file in files:
            file_path = os.path.join(subdir, file)
            # Lower-case so that "DECK.PDF" or "photo.JPG" are not skipped.
            input_type = file.split('.')[-1].lower()

            if input_type in text_extensions:
                text = analyze_input(input_type, None, file_path)
                text_documents.extend(
                    (file_path, chunk) for chunk in split_text(clean_text(text))
                )
            elif input_type in image_extensions:
                # Image.open is lazy and keeps the file open; copy the pixel
                # data and close the handle so a large folder does not exhaust
                # file descriptors.
                with PIL.Image.open(file_path) as opened:
                    image = opened.copy()
                image_documents.append((file_path, image))

    return text_documents, image_documents

def read_pdf(file):
    """Return the concatenated text of every page in an open PDF file object.

    The file cursor is rewound first. Encrypted PDFs are reported on stdout
    and yield an empty string.

    Raises:
        ValueError: If the PDF contains no pages.
    """
    file.seek(0)  # always parse from the start of the stream
    reader = PyPDF2.PdfReader(file)

    # Encrypted documents are skipped rather than decrypted.
    if reader.is_encrypted:
        print("Encrypted PDF file detected. Skipping...")
        return ""

    pages = reader.pages
    if len(pages) == 0:
        raise ValueError("PDF file is empty")

    return "".join(page.extract_text() for page in pages)

# For image embedding
@lru_cache(maxsize=1)
def _load_resnet():
    """Build the pre-trained ResNet-18 once, in eval mode, and reuse it."""
    resnet = models.resnet18(pretrained=True)
    resnet.eval()
    return resnet


def embed_image(image: Image.Image) -> np.ndarray:
    """Embed a PIL image with a pre-trained ResNet-18.

    The image is converted to RGB, resized to 256, centre-cropped to 224x224
    and normalised with the mean/std constants given below before being
    passed through the network.

    Args:
        image: The image to embed. Any PIL mode is accepted; it is converted
            to RGB so that the 3-channel normalisation applies.

    Returns:
        The network output for a batch of one, as a NumPy array.
    """
    # Loading the weights is expensive; the cached helper does it only once
    # instead of on every call.
    resnet = _load_resnet()

    # Define the preprocessing transformations
    transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    # RGBA / greyscale / palette images would not have 3 channels and would
    # make Normalize fail, so force RGB first.
    rgb_image = image.convert("RGB")
    transformed_image = transform(rgb_image).unsqueeze(0)

    # Extract the features using the ResNet model
    with torch.no_grad():
        features = resnet(transformed_image).numpy()

    return features

# Embed the text
def embed_text(text: str) -> np.ndarray:
    """Return the ``text-embedding-ada-002`` embedding of ``text``.

    Args:
        text: The text to embed.

    Returns:
        A 1-D ``float32`` NumPy array. float32 is used because that is the
        vector dtype FAISS indexes operate on; a float64 array is rejected or
        silently converted depending on the FAISS version.
    """
    response = openai.Embedding.create(input=text, model="text-embedding-ada-002")
    return np.array(response['data'][0]['embedding'], dtype=np.float32)


def index_embeddings(text_documents: List[Tuple[str, str]], image_documents: List[Tuple[str, PIL.Image.Image]]) -> Tuple[faiss.Index, Dict[int, Tuple[str, str]], Dict[int, str]]:
    """Embed all documents and add them to a single flat L2 FAISS index.

    Every text document is split with ``split_text`` and each chunk is
    embedded with ``embed_text``; images are embedded with ``embed_image``.
    The index dimension is taken from the first vector that is added. A
    vector whose dimension differs from the index (e.g. an image embedding
    next to text embeddings of another size) cannot be stored in the same
    index; it is reported and skipped instead of aborting the whole run.

    Args:
        text_documents: ``(path, text)`` pairs.
        image_documents: ``(path, image)`` pairs.

    Returns:
        ``(index, text_paths, image_paths)``. ``text_paths`` maps an index id
        to ``(path, chunk)`` and ``image_paths`` maps an index id to ``path``.
        Ids are assigned in insertion order, matching FAISS' sequential ids.

    Raises:
        ValueError: If nothing could be embedded, so no index can be built.
    """
    text_paths = {}
    image_paths = {}
    index = None
    current_id = 0

    def as_row(embedding):
        # FAISS expects a contiguous float32 matrix of shape (n, d).
        return np.ascontiguousarray(embedding, dtype=np.float32).reshape(1, -1)

    # Add the text embeddings to the index (all documents are treated alike;
    # none is special-cased without chunking).
    for path, text in text_documents:
        for chunk in split_text(text):
            print(f"Embedding chunk with {len(chunk)} characters")
            row = as_row(embed_text(chunk))
            if index is None:
                # Initialize the index with the dynamic embedding dimension
                index = faiss.IndexFlatL2(row.shape[1])
            index.add(row)
            text_paths[current_id] = (path, chunk)  # Store the path and chunk together
            current_id += 1

    # Add the image embeddings to the index
    for path, image in image_documents:
        print("Embedding image")
        row = as_row(embed_image(image))
        if index is None:
            index = faiss.IndexFlatL2(row.shape[1])
        elif row.shape[1] != index.d:
            print(f"Skipping image {path}: embedding dimension {row.shape[1]} does not match index dimension {index.d}")
            continue
        index.add(row)
        image_paths[current_id] = path
        current_id += 1

    if index is None:
        raise ValueError("No documents to index")

    return index, text_paths, image_paths

def search(query: str, index: faiss.Index, text_paths: Dict[int, Tuple[str, str]], image_paths: Dict[int, str], k: int = 10) -> List[Tuple[str, str, float]]:
    """Find the ``k`` indexed items nearest to ``query``.

    Args:
        query: Free-text query; embedded with ``embed_text``.
        index: FAISS index to search.
        text_paths: Index id -> ``(path, chunk)`` for text entries.
        image_paths: Index id -> ``path`` for image entries.
        k: Number of neighbours to request (defaults to 10).

    Returns:
        ``(path, chunk, score)`` tuples in the order FAISS returned them.
        ``chunk`` is ``""`` for images and ``score`` is the distance reported
        by the index. Ids found in neither table are dropped.
    """
    # Embed the query as a (1, d) float32 matrix, the layout FAISS expects.
    query_embedding = np.asarray(embed_text(query), dtype=np.float32).reshape(1, -1)

    # Search the index
    D, I = index.search(query_embedding, k=k)

    # Get the paths and scores of the search results
    results = []
    for raw_id, score in zip(I[0], D[0]):
        doc_id = int(raw_id)
        if doc_id < 0:
            # FAISS pads with -1 when fewer than k vectors are available.
            continue
        if doc_id in text_paths:
            path, chunk = text_paths[doc_id]
            results.append((path, chunk, score))
        elif doc_id in image_paths:
            results.append((image_paths[doc_id], "", score))  # Add an empty string for the chunk in the case of images

    return results

# Extract raw content from a URL or a local PDF/PPTX file
def analyze_input(input_type, company, url):
    """Return the content for ``url`` according to ``input_type``.

    Args:
        input_type: ``"url"``, ``"pdf"`` or ``"pptx"``.
        company: Not used by this function.
        url: The URL to scrape when ``input_type`` is ``"url"``; otherwise
            the path of the local file to read.

    Returns:
        Whatever ``urlscrape.link`` returns for URLs, the output of
        ``read_pdf`` for PDFs, or the output of ``read_ppt`` (given the raw
        file bytes) for PPTX files.

    Raises:
        ValueError: If ``input_type`` is none of the supported values.
    """
    if input_type == "url":  # Placeholder for url scraping
        return urlscrape.link(url)

    if input_type not in ("pdf", "pptx"):
        raise ValueError("Invalid input type")

    with open(url, "rb") as handle:
        if input_type == "pdf":
            return read_pdf(handle)
        return read_ppt(handle.read())

def base_gptcall(prompt):
    """Send ``prompt`` as a single system message and return the reply text.

    Uses the module-level ``MODEL`` with temperature 0.1 and returns the
    stripped content of the first choice.
    """
    completion = openai.ChatCompletion.create(
        model=MODEL,
        messages=[{"role": "system", "content": prompt}],
        temperature=0.1,
    )
    first_choice = completion.choices[0]
    return first_choice['message']['content'].strip()

@retry(wait=wait_random_exponential(min=2, max=20), stop=stop_after_attempt(3), reraise=True)
def call_gpt(prompt):
    """Run ``prompt`` through ``base_gptcall``, splitting it when it is long.

    A prompt of at most ``CHUNK_SIZE`` characters is sent as-is. A longer one
    is cut with ``split_text``; each piece is sent separately and the answers
    are joined with single spaces. The whole call is retried up to three
    times with randomised exponential back-off, re-raising the last error.
    """
    if len(prompt) <= CHUNK_SIZE:
        return base_gptcall(prompt)

    return ' '.join(base_gptcall(piece) for piece in split_text(prompt))

def recursive_analyze(text):
    """Extract insights from ``text`` chunk by chunk, then summarise them.

    The text is cleaned with ``clean_text`` and split with ``split_text``.
    Each chunk is sent to ``call_gpt`` in parallel threads with an
    insight-extraction prompt; the per-chunk answers are joined with
    newlines (in chunk order) and summarised by one final ``call_gpt`` call.

    Args:
        text: Raw document text.

    Returns:
        The summary returned by the final ``call_gpt`` call.
    """
    # Split the *cleaned* text; previously the cleaned result was overwritten
    # and the raw text was chunked instead.
    text_chunks = split_text(clean_text(text))
    print("The total length of all text chunks is: ")
    print(len(text_chunks))
    # Use ThreadPoolExecutor to parallelize GPT calls
    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(call_gpt, f"Extract all insights, names and facts from the following text as would be useful for an investment memo:\n\n{chunk}")
            for chunk in text_chunks
        ]
        # Collect in submission order so the insights follow the document.
        insights_lists = [future.result() for future in futures]
    combined_insights = "\n".join(insights_lists)
    prompt = f"Please summarise. If no useful information is present, please reply with 'info not available':\n\n{combined_insights}"
    summary = call_gpt(prompt)
    return summary

def main():
    """Index a folder of decks, run one example query and summarise the top hit.

    Prints every search result, then reads the best-ranked PDF/PPTX result
    in full and prints a summary produced by ``recursive_analyze``. Exits
    early with a message when the search returns nothing or when no result
    is a readable text document.
    """
    openai.api_key = os.getenv("OPENAI_API_KEY")

    root_folder = '/Users/rohit/Library/CloudStorage/OneDrive-Personal/# Backup/Venture Capital/Decks - funds'
    text_documents, image_documents = preprocess_documents(root_folder)
    index, text_paths, image_paths = index_embeddings(text_documents, image_documents)

    query = "This investor has held several CSO positions and is a top professional with dealflow."
    results = search(query, index, text_paths, image_paths)

    if not results:
        # results[0] below would raise IndexError on an empty result list.
        print("No results found.")
        return

    for path, chunk, score in results:
        print(f"Path: {path}, Chunk: {chunk}, Score: {score}")

    # Pick the best-ranked result that analyze_input can actually read; an
    # image hit would otherwise make it raise "Invalid input type".
    readable_types = ('pdf', 'pptx')
    top_result_path = next(
        (path for path, _chunk, _score in results
         if path.split('.')[-1].lower() in readable_types),
        None,
    )
    if top_result_path is None:
        print("No text document among the results to summarise.")
        return

    # Lower-case so "DECK.PDF" is accepted by analyze_input.
    input_type = top_result_path.split('.')[-1].lower()
    text = analyze_input(input_type, None, top_result_path)
    summary = recursive_analyze(text)

    print("\nSummary of the top result:")
    print(summary)

# Run the index/search/summarise demo only when executed as a script, not on import.
if __name__ == "__main__":
    main()

ModuleNotFoundError: No module named 'readppt'