# Data analysis using LLM models 

### reading the essential packages

In [62]:

import sys
# !{sys.executable} -m pip install langchain
# !{sys.executable} -m pip install langchain_community
# !{sys.executable} -m pip install pandasai
# !{sys.executable} -m pip install ollama
# !{sys.executable} -m pip install chromadb
# !{sys.executable} -m pip install pysqlite3-binary
!{sys.executable} -m pip install -U langchain-nomic 
!{sys.executable} -m pip install -U tiktoken 
!{sys.executable} -m pip install -U langchainhub 
!{sys.executable} -m pip install -U langgraph

Collecting langchain-nomic
  Downloading langchain_nomic-0.0.2-py3-none-any.whl.metadata (1.3 kB)
Collecting nomic<4.0.0,>=3.0.12 (from langchain-nomic)
  Downloading nomic-3.0.25.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting jsonlines (from nomic<4.0.0,>=3.0.12->langchain-nomic)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting loguru (from nomic<4.0.0,>=3.0.12->langchain-nomic)
  Downloading loguru-0.7.2-py3-none-any.whl.metadata (23 kB)
Collecting pyarrow (from nomic<4.0.0,>=3.0.12->langchain-nomic)
  Downloading pyarrow-16.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyjwt (from nomic<4.0.0,>=3.0.12->langchain-

In [1]:
import pandas as pd  
from pandasai import SmartDataframe
from pandasai import Agent
# Instantiate a LLM
from pandasai.llm import OpenAI
from pandasai.llm.local_llm import LocalLLM
from langchain_community.llms import Ollama
import ollama
import bs4
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import dataframe
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
__import__('pysqlite3')
import sys
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')

In [2]:
# Reading the data 
data = pd.read_csv('../data/population.csv')
data.sample(n=10)

Unnamed: 0,Country,Population
10,Bangladesh,172954319
16,DR Congo,102262808
12,Mexico,128455567
0,United States,339996563
5,Indonesia,277534122
3,Turkey,85816199
18,Iran,89172767
1,China,1425671352
19,Thailand,71801279
7,Nigeria,223804632


In [24]:
# init an llm model
# llm = LocalLLM(api_base="http://localhost:11434/v1",model='llama3')
gpt4_llm = OpenAI(api_token="OPENAIKEY",model="gpt-4")

llm = Ollama(model = 'llama3')
sdf = SmartDataframe(data, config={"llm":gpt4_llm})

In [25]:
sdf.chat('Which country which has max population and what is the value?')

Unnamed: 0,Country,Population,doc
8,India,1428627663,country: India population: str(1428627663)


In [26]:
print(sdf.last_code_generated)

max_population_country = dfs[0][dfs[0]['Population'] == dfs[0]['Population'].max()]
result = {'type': 'dataframe', 'value': max_population_country}


In [60]:
sdf.chat("what is the popluation of India? ")

1428627663

In [61]:
print(sdf.last_code_generated)

population_india = None
for df in dfs:
    if 'India' in df['Country'].values:
        population_india = df.loc[df['Country'] == 'India', 'Population'].values[0]
        break
result = {'type': 'number', 'value': population_india}


# RAG system

In [12]:
def create_doc (country, population):
    return f"country:  {country}  population: str({population })"

create_doc('india',3444444) 

'country:  india  population: str(3444444)'

In [13]:
data['doc'] = data.apply(lambda x: create_doc(x['Country'],x['Population']),axis=1)

In [29]:
docs = dataframe.DataFrameLoader(data,page_content_column='doc').load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)


In [30]:
# Create Ollama embeddings and vector store
embeddings = OllamaEmbeddings(model="llama3")
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [31]:
# Create the retriever
retriever = vectorstore.as_retriever()

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [32]:
# Define the Ollama LLM function
def ollama_llm(question, context):
    formatted_prompt = f"Question: {question}\n\nContext: {context} , give the answer with respect to context"
    response = ollama.chat(model='llama3', messages=[{'role': 'user', 'content': formatted_prompt}])
    return response['message']['content']

In [33]:
# Define the RAG chain
def rag_chain(question):
    retrieved_docs = retriever.invoke(question)
    formatted_context = format_docs(retrieved_docs)
    return ollama_llm(question, formatted_context) , retrieved_docs

In [56]:
result , evm_docs =rag_chain("the population str of country Russia?")

In [57]:
print(result)


I see what you're getting at!

You want me to extract the population figures from the given contexts, right?

Alright, let's do that:

**Russia**: The population of Russia is approximately **144.44 million** (as per your context).

**Japan**: And, according to your context, the population of Japan is approximately **12.32945 million**.

Please note that these figures might not be up-to-date or accurate, as they are based on the provided contexts and not actual census data.


In [58]:
print(evm_docs)

[Document(page_content='country:  Russia  population: str(144444359)', metadata={'Country': 'Russia', 'Population': 144444359}), Document(page_content='country:  Russia  population: str(144444359)', metadata={'Country': 'Russia', 'Population': 144444359}), Document(page_content='country:  Japan  population: str(123294513)', metadata={'Country': 'Japan', 'Population': 123294513}), Document(page_content='country:  Japan  population: str(123294513)', metadata={'Country': 'Japan', 'Population': 123294513})]
