In [None]:
from langchain_groq import ChatGroq
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper

import os 
os.environ["GROQ_API_KEY"] = ""

import requests
import wikipedia
from bs4 import BeautifulSoup

import os
import time
import pickle
import streamlit as st
from datetime import datetime
from streamlit_chat import message

from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.schema import AIMessage, HumanMessage, SystemMessage





In [4]:
llm = ChatGroq(
    temperature=0, 
    model_name="llama-3.3-70b-versatile"
)
response = llm.invoke("The first person to land on moon was ...")
print(response.content)

The first person to land on the moon was Neil Armstrong. He stepped out of the lunar module Eagle and onto the moon's surface on July 20, 1969, during the Apollo 11 mission. Armstrong famously declared, "That's one small step for man, one giant leap for mankind," as he became the first human to set foot on the moon.


In [None]:
import chromadb
import uuid


def get_wiki(search):
    # set language to English (default is auto-detect)
    lang = "en"

    """
    fetching summary from wikipedia
    """
    # set language to English (default is auto-detect)
    #summary = wikipedia.summary(search, sentences = 5)

    """
    scrape wikipedia page of the requested query
    """

    # create URL based on user input and language
    url = f"https://{lang}.wikipedia.org/wiki/{search}"

    # send GET request to URL and parse HTML content
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # extract main content of page
    content_div = soup.find(id="mw-content-text")

    # Find all headers (h2, h3, etc.)
    headers = content_div.find_all(['h3'])

    chroma_client = chromadb.PersistentClient('vectorstore')
    collection = chroma_client.get_or_create_collection(name=search)
    for header in headers:
            collection.add(documents=header.get_text(strip=True),
                                ids=[str(uuid.uuid4())])
    selected_headers_names = collection.query(query_texts="Spots that could be interesting for a tourist. The tourist claims to have following interest : {}".format("I like history"), n_results=2).get('documents', [])
    selected_text = ""
    print(selected_headers_names)
    for header in headers:
        headline_text = header.get_text(strip=True)
        if headline_text in selected_headers_names[0]:
            section_text = [headline_text+" : "]
            sibling_count = 0
            for sibling in header.find_parent().find_next_siblings():
                if (sibling.name and sibling.name.startswith('h')) or sibling_count>1:
                    print(sibling_count)
                    break  # Stop if another header is found
                if sibling.name == 'p':
                    print("---")
                    section_text.append(sibling.get_text())
                    if sibling.get_text() != "":
                        sibling_count+=1

            selected_text = selected_text.join(section_text)

    return selected_text




In [17]:
page_data = get_wiki("Valencia")
print(page_data)


[['Tourism', 'Tourism', 'Tourism', 'Tourism', 'Tourism']]
---
---
2
Tourism : Starting in the mid-1990s, Valencia, formerly an industrial centre, saw rapid development that expanded its cultural and tourism possibilities, and transformed it into a newly vibrant city. Many local landmarks were restored, including the medieval Torres de Serranos and Quart Towers and the Monasterio de San Miguel de los Reyes, which now holds a conservation library. Whole sections of the old city, for example the Carmen Quarter, have been extensively renovated. The Passeig Marítim, a 4 km (2 mi) long palm tree-lined beach promenade, was constructed along the beaches of the north side of the port (Platja de Les Arenes, Platja del Cabanyal and Platja de la Malva-rosa).
Valencia boasts a highly active and diverse nightlife, with bars, dance bars, beach bars and nightclubs staying open well past midnight.[105] The city has numerous convention centres and venues for trade events, among them the Institución Feri

In [26]:
from langchain_core.prompts import PromptTemplate

prompt_extract = PromptTemplate.from_template(
        """
        ### SCRAPED TEXT FROM WEBSITE:
        {page_data}
        ### INSTRUCTION:
        The scraped text is from wikipedia site of {search}.
        Your job is to extract the places, buildings or cites which a tourist would find interesting. Return them in JSON format containing the 
        following keys: `name`, `type`, `description`, 'history'. The key "type" is a categorical key. The corresponding categories are : history, art, culture, architecture, general.
        Only return the valid JSON.
        ### VALID JSON (NO PREAMBLE):    
        """
)

chain_extract = prompt_extract | llm 
res = chain_extract.invoke(input={'page_data':page_data, 'search':"Garching"})
type(res.content)

str

In [27]:
from langchain_core.output_parsers import JsonOutputParser

json_parser = JsonOutputParser()
json_res = json_parser.parse(res.content)
json_res

[{'name': 'Garching',
  'type': 'general',
  'description': 'A city in Bavaria, near Munich, known for its research institutes and university departments',
  'history': 'Garching was a small Bavarian village until the Free State of Bavaria decided to implement a technology and urban planning policy'},
 {'name': 'Campus Garching',
  'type': 'architecture',
  'description': 'A campus located in Garching, home to several research institutes and university departments',
  'history': 'Established as part of the urban planning policy to cluster science north of Munich'},
 {'name': 'Max Planck Institute for Plasma Physics',
  'type': 'culture',
  'description': 'A research institute established in Garching in 1960',
  'history': 'Founded in 1960 as part of the Max Planck Society'},
 {'name': 'Max Planck Institute for Extraterrestrial Physics',
  'type': 'culture',
  'description': 'A research institute established in Garching in 1964',
  'history': 'Founded in 1964 as part of the Max Planck S

In [28]:
import json
import csv

# now we will open a file for writing
data_file = open('spots_Garching.csv', 'w')
 
# create the csv writer object
csv_writer = csv.writer(data_file)
 

count = 0
 
for spot in json_res:
    if count == 0:
 
        # Writing headers of CSV file
        header = spot.keys()
        csv_writer.writerow(header)
        count += 1
 
    # Writing data of CSV file
    csv_writer.writerow(spot.values())
 
data_file.close()