In [56]:
import os
from dotenv import load_dotenv
from PIL import Image
from io import BytesIO
import base64
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_community.document_loaders import JSONLoader
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from lida import Manager, TextGenerationConfig, llm
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.chat_models import ChatOpenAI
from langchain_community.document_loaders import UnstructuredFileLoader
import openai

# Pull variables from a .env file (if one is found) into the process
# environment, then hand the OpenAI key to the client library. The key is
# None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [2]:
# Function to convert base64 string to image
def base64_to_image(base64_string):
    """Decode a base64-encoded image and open it with PIL.

    Args:
        base64_string: Base64 text (or bytes) holding the encoded image file.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes.
    """
    # Wrap the decoded bytes in an in-memory file so PIL can read from it.
    image_buffer = BytesIO(base64.b64decode(base64_string))
    return Image.open(image_buffer)

In [35]:
# LIDA manager backed by the "openai" text-generation provider.
lida = Manager(text_gen=llm("openai"))

# Shared generation settings for LIDA calls that take a `textgen_config`.
textgen_config = TextGenerationConfig(
    model="gpt-3.5-turbo-16k-0613",
    temperature=0.5,
    n=1,
    use_cache=True,
)

In [38]:
# PDF read by the pdfplumber and PyMuPDF cells below; the path is relative,
# so it is resolved against the kernel's working directory.
file_path = "data/Amazon-2021-Annual-Report.pdf"

### Summary Generation

In [39]:
# Descriptive analysis: extract the text of every PDF page into a DataFrame
# (one row per page, columns `page_number` and `content`) and summarize it
# with LIDA.
import pdfplumber
import pandas as pd

pdf_path = file_path
data = {'pages': []}

with pdfplumber.open(pdf_path) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        # extract_text() may give a falsy value (None or '') for a page with
        # no extractable text. Normalize that to '' so `content` holds only
        # strings; a None here would surface as NaN in later `.str`
        # operations on the column and break boolean filtering.
        text = page.extract_text() or ""
        # Replace non-breaking spaces with plain spaces.
        text = text.replace(u'\xa0', u' ')
        # Round-trip through UTF-8, substituting anything that cannot be encoded.
        text = text.encode('utf-8', 'replace').decode('utf-8')
        data['pages'].append({
            'page_number': page_number,
            'content': text
        })

# Build the page-level DataFrame from the records collected above.
df = pd.json_normalize(data['pages'])

# Summarize the DataFrame with LIDA; later cells consume `summary`.
summary = lida.summarize(df)


In [40]:
summary

{'name': '',
 'file_name': '',
 'dataset_description': '',
 'fields': [{'column': 'page_number',
   'properties': {'dtype': 'int64',
    'samples': [76, 1, 71],
    'num_unique_values': 86,
    'semantic_type': '',
    'description': ''}},
  {'column': 'content',
   'properties': {'dtype': 'string',
    'samples': ['Net sales are attributed to countries primarily based on country-focused online and physical stores or, for AWS purposes,\nthe selling entity. Net sales attributed to countries that represent a significant portion of consolidated net sales are as follows (in\nmillions):\nYear Ended December 31,\n2019 2020 2021\nUnited States $ 193,636 $ 263,520 $ 314,006\nGermany 22,232 29,565 37,326\nUnited Kingdom 17,527 26,483 31,914\nJapan 16,002 20,461 23,071\nRest of world 31,125 46,035 63,505\nConsolidated $ 280,522 $ 386,064 $ 469,822\nTotal segment assets exclude corporate assets, such as cash and cash equivalents, marketable securities, other long-term\ninvestments, corporate faci

In [41]:
# Goals/objectives of the analysis: ask LIDA for four goals derived from the
# dataset summary, then render each one in the cell output.
goals = lida.goals(
    summary,
    n=4,
    textgen_config=textgen_config,
)

for goal in goals:
    display(goal)


### Goal 0
---
**Question:** What is the trend in net sales for the United States, Germany, United Kingdom, Japan, and Rest of world from 2019 to 2021?

**Visualization:** `Line chart showing the net sales for each country over time`

**Rationale:** By visualizing the net sales for each country over time, we can identify the trends and growth rates in different regions.



### Goal 1
---
**Question:** How do the total segment assets compare between North America, International, AWS, and Corporate in 2019, 2020, and 2021?

**Visualization:** `Stacked bar chart comparing the total segment assets for each category over time`

**Rationale:** A stacked bar chart will allow us to compare the total segment assets for each category over time and identify any changes in their proportions.



### Goal 2
---
**Question:** What is the distribution of property and equipment net by segment in 2021?

**Visualization:** `Bar chart showing the property and equipment net for each segment in 2021`

**Rationale:** A bar chart will provide a clear comparison of the property and equipment net for each segment in 2021 and help identify any significant differences.



### Goal 3
---
**Question:** How do the total net additions to property and equipment compare between North America, International, AWS, and Corporate from 2019 to 2021?

**Visualization:** `Line chart showing the total net additions to property and equipment for each category over time`

**Rationale:** By visualizing the total net additions to property and equipment for each category over time, we can identify the trends and growth rates in different regions.


### Data Visualization

In [48]:
# Generate the chart(s) for the first goal. The shared `textgen_config` is
# passed explicitly so this call uses the same model / temperature / cache
# settings as `lida.goals(...)` instead of LIDA's default configuration.
charts = lida.visualize(
    summary=summary,
    goal=goals[0],
    textgen_config=textgen_config,
    library="matplotlib",
)


```python
import matplotlib.pyplot as plt
import pandas as pd

# plan -
def plot(data: pd.DataFrame):
    # Filter the data for the required countries
    countries = ['United States', 'Germany', 'United Kingdom', 'Japan', 'Rest of world']
    filtered_data = data[data['content'].str.contains('|'.join(countries))]
    
    # Extract the year and net sales columns
    filtered_data['year'] = filtered_data['content'].str.extract(r'(\d{4})')
    filtered_data['net_sales'] = filtered_data['content'].str.extract(r'\$ (\d+,\d+,\d+)')
    filtered_data['net_sales'] = filtered_data['net_sales'].str.replace(',', '').astype(int)
    
    # Group the data by country and year and calculate the sum of net sales
    grouped_data = filtered_data.groupby(['year', 'content']).sum().reset_index()
    
    # Pivot the data to have years as columns and countries as rows
    pivoted_data = grouped_data.pivot(index='content', columns='year', values='net_sales')
    
    # Plot the line chart
    pivoted_dat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [44]:
# Generate charts for every goal. Each call's result is also appended to
# `charts_by_goal` (same order as `goals`); previously `charts` was simply
# overwritten on each iteration, so only the last goal's result survived.
# `charts` still ends up holding the result for the last goal.
charts_by_goal = []
for goal in goals:
    # Pass the shared config so these calls match the settings used by
    # `lida.goals(...)` rather than LIDA's defaults.
    charts = lida.visualize(
        summary=summary,
        goal=goal,
        textgen_config=textgen_config,
        library="matplotlib",
    )
    charts_by_goal.append(charts)

len(goals)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


```python
import matplotlib.pyplot as plt
import pandas as pd

# plan -
def plot(data: pd.DataFrame):
    # Filter the data for the required countries
    countries = ['United States', 'Germany', 'United Kingdom', 'Japan', 'Rest of world']
    filtered_data = data[data['content'].str.contains('|'.join(countries))]
    
    # Extract the year and net sales columns
    filtered_data['year'] = filtered_data['content'].str.extract(r'(\d{4})')
    filtered_data['net_sales'] = filtered_data['content'].str.extract(r'\$ (\d+,\d+,\d+)')
    filtered_data['net_sales'] = filtered_data['net_sales'].str.replace(',', '').astype(int)
    
    # Group the data by country and year and calculate the sum of net sales
    grouped_data = filtered_data.groupby(['year', 'content']).sum().reset_index()
    
    # Pivot the data to have years as columns and countries as rows
    pivoted_data = grouped_data.pivot(index='content', columns='year', values='net_sales')
    
    # Plot the line chart
    pivoted_dat

4

### Visualize Based On User Query

In [45]:
# Free-text question that the next cell passes to lida.visualize as the goal.
user_query = "What's the distribution of sales by country?"

In [52]:
# Visualize directly from the free-text `user_query`. The shared
# `textgen_config` is passed explicitly so this call uses the same model /
# temperature / cache settings as `lida.goals(...)` instead of LIDA's defaults.
charts = lida.visualize(
    summary=summary,
    goal=user_query,
    textgen_config=textgen_config,
    library="matplotlib",
)

```python
import matplotlib.pyplot as plt
import pandas as pd

# plan -
def plot(data: pd.DataFrame):
    # Group the data by country and calculate the sum of net sales
    sales_by_country = data.groupby('country')['net_sales'].sum().reset_index()

    # Sort the data by net sales in descending order
    sales_by_country = sales_by_country.sort_values('net_sales', ascending=False)

    # Create a bar plot to visualize the distribution of sales by country
    plt.bar(sales_by_country['country'], sales_by_country['net_sales'])

    # Add labels and title
    plt.xlabel('Country')
    plt.ylabel('Net Sales')
    plt.title('Distribution of Sales by Country')

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45)

    return plt

chart = plot(data)
```
****
 'country'


### Critique the Document

In [53]:
# Question sent to the retrieval chain below. NOTE: this rebinds `user_query`,
# which earlier held the visualization question.
user_query = "Provide a brief forecast for the new year based on the data given?"

In [65]:
from langchain_community.document_loaders import PyMuPDFLoader

In [67]:
# Load the PDF at `file_path` with the PyMuPDF loader; `docs` holds whatever
# the loader returns and is split into chunks in the next cell.
loader = PyMuPDFLoader(file_path)
docs = loader.load()

In [68]:
# Split the loaded documents into chunks (target size 1000 characters,
# 5 characters of overlap).
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=5)
docs = text_splitter.split_documents(docs)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Deterministic ids, one per chunk. With ids=None the store generates fresh
# random ids on every call, so each re-run of this cell against the same
# persistent directory appends another full copy of the chunks and retrieval
# starts returning duplicates. Stable ids let a re-run overwrite the existing
# entries instead.
chunk_ids = [f"{file_path}::chunk-{index}" for index in range(len(docs))]

# load it into Chroma with a persistent directory
db = Chroma.from_documents(
    docs,
    embedding_function,
    ids=chunk_ids,
    collection_name="langchain-pdf",
    persist_directory="./chroma_db/pdf",
)
db.persist()

  warn_deprecated(


In [69]:
# Chat model used by the retrieval chain.
# NOTE: this rebinds `llm`, which the import cell bound to lida's `llm`
# factory. After this cell runs, re-running the `Manager(text_gen=llm("openai"))`
# cell no longer calls lida's factory.
llm = ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo")

# Retriever view over the Chroma store built above.
retriever = db.as_retriever()

  warn_deprecated(


In [70]:
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values joined by a blank line, or an empty
        string when ``docs`` is empty.
    """
    page_texts = [doc.page_content for doc in docs]
    return "\n\n".join(page_texts)


In [71]:
from langchain_core.prompts import PromptTemplate

# Prompt for the retrieval chain: {context} receives the retrieved text and
# {question} receives the user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Always say "thanks for asking!" at the end of the answer.
You are a helpful data analyst that can help answer question about data.
You critically analyze the data and provide insights into the data.
You answer only questions from the data. You don't answer generic questions outside the document.
You provide the answer in a bullet point format. You use chain of thought to answer questions.
You make relevant recommendations based on the data. You don't summarize. 
You quote the figures as they are

{context}

Question: {question}

Helpful Answer:"""
custom_rag_prompt = PromptTemplate.from_template(template)

# Fan the incoming question out into the two prompt variables: `context` is
# the retriever output run through format_docs, `question` is passed through
# unchanged.
rag_inputs = RunnableParallel(
    context=retriever | format_docs,
    question=RunnablePassthrough(),
)

# question -> prompt variables -> prompt -> chat model -> plain string
rag_chain = rag_inputs | custom_rag_prompt | llm | StrOutputParser()

result = rag_chain.invoke(user_query)

In [72]:
from IPython.display import Markdown

# Render the chain's answer as Markdown so the bullet points display as a list.
rendered_answer = Markdown(result)
display(rendered_answer)

- Net sales for the first quarter of 2022 are expected to be between $112.0 billion and $117.0 billion, with a growth rate of 3% to 8% compared to the first quarter of 2021, anticipating an unfavorable impact of approximately 150 basis points from foreign exchange rates.
- Operating income for the first quarter of 2022 is expected to be between $3.0 billion and $6.0 billion, compared to $8.9 billion in the first quarter of 2021, including lower depreciation expense due to increases in the estimated useful lives of servers and networking equipment.
- The company's financial focus is on long-term, sustainable growth in free cash flows, driven by increasing operating income and efficiently managing accounts receivable, inventory, accounts payable, and cash capital expenditures.
- Variability in inventory turnover over time is expected due to factors like product mix and sales mix, impacting the cash-generating operating cycle.

Thanks for asking!