In [1]:
import os
from dotenv import load_dotenv
from PIL import Image
from io import BytesIO
import base64
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.document_loaders import JSONLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from lida import Manager, TextGenerationConfig, llm
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders import UnstructuredPowerPointLoader
from langchain.chat_models import ChatOpenAI
import openai

# Load environment variables via python-dotenv, then pass the OpenAI key
# (None when OPENAI_API_KEY is unset) to the openai module.
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [2]:
from lida import Manager, TextGenerationConfig , llm  
from dotenv import load_dotenv
import os
import openai
from PIL import Image
from io import BytesIO
import re 
import csv
import re
from pptx import Presentation
import base64

In [3]:
# Import LIDA's provider factory under a private alias. A later cell rebinds the
# global name `llm` to a ChatOpenAI instance, so relying on the bare name here
# would break this cell if it were re-run after that one.
from lida import llm as lida_llm

# LIDA manager backed by the OpenAI text-generation provider.
lida = Manager(text_gen=lida_llm("openai"))

# "gpt-3.5-turbo-0301" was a dated snapshot that OpenAI has retired; use the
# maintained "gpt-3.5-turbo" alias, which is also the model the ChatOpenAI cell
# further down is configured with.
textgen_config = TextGenerationConfig(
    n=1,
    temperature=0.5,
    model="gpt-3.5-turbo",
    use_cache=True,
)

In [4]:
def base64_to_image(base64_string):
    """Decode a base64-encoded image and open it with PIL.

    Args:
        base64_string: Base64 text (or bytes) holding the encoded image data.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    raw_bytes = base64.b64decode(base64_string)
    # Image.open expects a file-like object, so wrap the bytes in a buffer.
    buffer = BytesIO(raw_bytes)
    return Image.open(buffer)

In [5]:
# Relative path to the PowerPoint deck; read below by both python-pptx
# (Presentation) and the LangChain UnstructuredPowerPointLoader.
file_path = "data/Powerpoint_covid.pptx"

In [6]:
from pptx import Presentation
import pandas as pd
import re

def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; empty when ``text`` holds only whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Open the deck referenced by `file_path`.
ppt = Presentation(file_path)

# One record (dict) per extracted item; converted to a DataFrame in a single
# step once every slide has been visited.
data_for_df = []

for position, slide in enumerate(ppt.slides, start=1):  # 1-based slide numbers
    for shape in slide.shapes:
        # Shapes exposing a `text` attribute are recorded as a single text row.
        if hasattr(shape, "text"):
            data_for_df.append({
                'Slide Number': position,
                'Content': clean_text(shape.text),
                'Content Type': 'Text'
            })
            continue

        # Anything else is only of interest when it carries a table.
        if not shape.has_table:
            continue

        # One record per table row, with the cell texts joined by " | ".
        data_for_df.extend(
            {
                'Slide Number': position,
                'Content': ' | '.join(clean_text(cell.text) for cell in table_row.cells),
                'Content Type': 'Table'
            }
            for table_row in shape.table.rows
        )

# Build the DataFrame from the collected records.
df = pd.DataFrame(data_for_df)


### Summary Generation

In [7]:
# Descriptive analysis: have LIDA summarise the extracted slide DataFrame
# using its "default" summary method.
summary = lida.summarize(
    df,
    summary_method="default",
    textgen_config=textgen_config,
)
summary

{'name': '',
 'file_name': '',
 'dataset_description': '',
 'fields': [{'column': 'Slide Number',
   'properties': {'dtype': 'int64',
    'samples': [1, 18, 16],
    'num_unique_values': 21,
    'semantic_type': '',
    'description': ''}},
  {'column': 'Content',
   'properties': {'dtype': 'string',
    'samples': ['KEY FINDINGS: UNI & BI-VARIABLE ANALYSIS',
     '',
     'Large | 2539 (0.39%) | 515,152 (16.6%)'],
    'num_unique_values': 82,
    'semantic_type': '',
    'description': ''}},
  {'column': 'Content Type',
   'properties': {'dtype': 'category',
    'samples': ['Table', 'Text'],
    'num_unique_values': 2,
    'semantic_type': '',
    'description': ''}}],
 'field_names': ['Slide Number', 'Content', 'Content Type']}

In [8]:
# Goals/objectives of the analysis: request four goals from LIDA based on the
# dataset summary produced above.
goals = lida.goals(
    summary,
    n=4,
    textgen_config=textgen_config,
)

# Show each goal through the notebook's rich display.
for goal in goals:
    display(goal)


### Goal 0
---
**Question:** What is the distribution of slide numbers?

**Visualization:** `Histogram of Slide Number`

**Rationale:** This visualization will show the frequency distribution of slide numbers in the dataset. It will help us understand the range of slide numbers in the dataset and identify any potential outliers or gaps in the data.



### Goal 1
---
**Question:** What is the most common content type in the dataset?

**Visualization:** `Bar chart of Content Type`

**Rationale:** This visualization will show the frequency distribution of content types in the dataset. It will help us identify the most common content type and understand the overall composition of the dataset.



### Goal 2
---
**Question:** What is the relationship between slide number and content type?

**Visualization:** `Scatter plot of Slide Number vs. Content Type`

**Rationale:** This visualization will show the relationship between slide number and content type. It will help us identify any patterns or trends in the data and understand if there is any correlation between slide number and content type.



### Goal 3
---
**Question:** What are the key findings from the uni and bi-variable analysis?

**Visualization:** `Table of Content where Content includes 'KEY FINDINGS'`

**Rationale:** This visualization will show the key findings from the uni and bi-variable analysis. It will help us understand the main insights and conclusions from the analysis and identify any important trends or patterns in the data.


### Query / Critique the Document

In [9]:
# Question about the document; passed to the RAG chain via rag_chain.invoke(user_query).
user_query = "Comment on the data collection methodology employed in the study"

In [10]:
# Load the same PowerPoint file as LangChain documents for the retrieval pipeline.
loader = UnstructuredPowerPointLoader(file_path)
docs = loader.load()
# NOTE(review): echoing `docs` prints the full extracted text into the cell output;
# consider showing only a count or a short preview.
docs

[Document(page_content='PRESS RELEASE\n\nCOVID-19 RAPID SURVEYS \x0bBUSINESS TRACKER \x0bHIGHLIGHTS OF FINDINGS\x0bFIRST EDITION OF WAVE I\n\n3 August 2020\n\nAugust 3 2020\n\n\n\nScope\n\nMotivation\n\nNotes on Methods\n\nHighlights of findings\n\nUpcoming Activities\n\n2\n\nAugust 3 2020\n\n\n\nMotivation  - 1\\2\n\nCOVID-19 from both a health and non-health perspective has unquestionably affected business operations and performance\n\nIn what dimensions have been businesses been affected?\n\nSome of which include:\n\nDisruptions in labour supply as a result of COVID-19 related morbidity, mortality, self exclusion from work and disarranged telework engagement\n\nBusiness closures in view of partial lockdown of Greater Accra and Kumasi\n\nSlow down and disturbances in demand of goods and services due to diverse reasons including wider effects of partial lockdown, anxiety among consumers and border closures\n\nInterruptions in supply of inputs and access to finance given mobility restr

In [12]:
# Split the loaded document(s) into chunks (chunk_size=1000, chunk_overlap=5).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=5)
docs = text_splitter.split_documents(docs)

# Create the open-source embedding function.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Deterministic, position-based ids. With ids=None a fresh random id is generated
# for every chunk on every run, so re-executing this cell against the persistent
# directory kept inserting the same chunks again and retrieval returned duplicates.
# Stable ids make a re-run overwrite the existing entries instead.
# NOTE(review): entries written by earlier runs under random ids (or by a run that
# produced more chunks) are not removed; clear ./chroma_db/pptx once if the
# collection already contains duplicates.
chunk_ids = [f"pptx-chunk-{position}" for position in range(len(docs))]

# Load the chunks into Chroma with a persistent directory.
db = Chroma.from_documents(
    docs,
    embedding_function,
    ids=chunk_ids,
    collection_name="langchain-pptx",
    persist_directory="./chroma_db/pptx",
)
db.persist()

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange
  warn_deprecated(


In [11]:
# Chat model used by the retrieval-augmented QA chain.
# NOTE: this rebinds the global name `llm`, which the import cell bound to LIDA's
# `llm` factory; after this cell runs, `llm` refers to the chat model.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.2,
)

  warn_deprecated(


In [17]:
# Expose the Chroma store as a retriever so it can be piped into the chain.
retriever = db.as_retriever()

In [18]:
def format_docs(docs):
    """Join the ``page_content`` of each document, separated by blank lines.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string with the contents separated by two newlines; empty when
        ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


In [19]:
from langchain_core.prompts import PromptTemplate

# Prompt for the RAG chain: fixed instructions followed by the `{context}` and
# `{question}` placeholders that the prompt template fills in at invocation time.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Always say "thanks for asking!" at the end of the answer.
You are a helpful data analyst that can help answer question about data.
You critically analyze the data and provide insights into the data.
You answer only questions from the data. You don't answer generic questions outside the document.
You provide the answer in a bullet point format. You use chain of thought to answer questions.
You make relevant recommendations based on the data. You don't summarize. 
You quote the figures as they are

{context}

Question: {question}

Helpful Answer:"""
# Build the prompt template object from the raw template string.
custom_rag_prompt = PromptTemplate.from_template(template)

# Assemble the RAG pipeline:
#   1. in parallel, retrieve documents for the input and format them as `context`,
#      while passing the raw input through unchanged as `question`;
#   2. fill the prompt template;
#   3. call the chat model;
#   4. parse the model output into a plain string.
rag_chain = (
    RunnableParallel(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# Run the chain on the question defined earlier in the notebook.
result = rag_chain.invoke(user_query)

NameError: name 'user_query' is not defined

In [None]:
from IPython.display import Markdown
# Render the chain's answer (`result`) as Markdown for a more readable display.
display(Markdown(result))