<a href="https://colab.research.google.com/github/orhod/Basic-MineSweeper/blob/main/HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports and Package installations

In [1]:
# pip installs
!pip install firebase
!pip install gradio
!pip install paho-mqtt

#================================= make sure all pip installs are above this line ============================================

# import to clear the installation code output
from IPython.display import clear_output
clear_output()

In [11]:
#imports
import gradio as gr
import json
import time
from firebase import firebase
import paho.mqtt.client as mqtt
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from urllib.parse import urljoin, urlparse
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from typing_extensions import Never
import os
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Base URL of the Firebase Realtime Database that holds the word index.
# It is handed to DbService, which reads and writes the 'terms' node there.
DBLink = "https://wordbank-c75f1-default-rtdb.firebaseio.com/"


# Admin panel functions

# Index

In [9]:
class DbService:
    """Reads and writes the word index kept under the ``terms`` node."""

    def __init__(self, Dblink):
        # Root URL of the Firebase database every call connects to.
        self.dbLink = Dblink

    def _open(self):
        """Build a new Firebase client for ``self.dbLink`` (second argument ``None``)."""
        return firebase.FirebaseApplication(self.dbLink, None)

    def insert_to_db(self, results):
        """Store ``results`` under ``terms`` at the database root, replacing what is there."""
        self._open().put('/', 'terms', results)

    def get_from_db(self):
        """Return the value stored under ``terms`` at the database root."""
        return self._open().get('/', 'terms')


class QueryService:
    """Downloads a single page and folds its words into an inverted index."""

    # Seconds to wait for the server; without a timeout requests.get can block forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url):
        # Address of the page this instance fetches.
        self.url = url

    def fetch_page(self):
        """Download ``self.url`` and parse it as HTML.

        Returns:
            The parsed ``BeautifulSoup`` tree when the server answers with
            HTTP 200, otherwise ``None``.

        Raises:
            Any exception raised by ``requests.get`` (connection failure,
            timeout, ...) is propagated to the caller.
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        if response.status_code != 200:
            return None
        return BeautifulSoup(response.text, 'html.parser')

    def index_words(self, soup, index=None, url=''):
        """Count every word of ``soup`` into ``index`` and return the index.

        Args:
            soup: Parsed page (anything exposing ``get_text()``), or ``None``
                for a page that could not be fetched - in that case the index
                is returned untouched.
            index: Existing index to extend in place. When omitted, a new
                empty dict is created for this call (a shared ``{}`` default
                would leak counts from one call into the next).
            url: Page address recorded in each word's ``DocIDs`` list; an
                empty string records nothing.

        Returns:
            ``{word: {"Appearences": int, "DocIDs": [url, ...]}}`` with words
            lower-cased.
        """
        if index is None:
            index = {}
        if soup is None:
            return index

        for word in re.findall(r'\w+', soup.get_text()):
            word = word.lower()
            entry = index.get(word)
            if entry is None:
                # First sighting of this word. The "Appearences" spelling is
                # part of the stored data format, so it is kept as is.
                index[word] = {"Appearences": 1, "DocIDs": [url] if url else []}
                continue
            entry["Appearences"] += 1
            # Record each page at most once per word.
            if url and url not in entry["DocIDs"]:
                entry["DocIDs"].append(url)

        return index

    def remove_stop_words(self, index):
        """Delete NLTK's English stop words from ``index`` in place and return it."""
        for stop_word in set(stopwords.words('english')):
            index.pop(stop_word, None)
        return index

class Crawler:
  """Collects the pages that live underneath a root URL."""

  # Seconds to wait for each page; without a timeout requests.get can block forever.
  REQUEST_TIMEOUT = 10

  def __init__(self, url):
    # Root URL used by get_sub_urls when no URL is passed to it.
    self.url = url

  def get_sub_urls(self, url=None):
    """Return every URL below the root that is reachable by following links.

    Args:
      url: Root URL of the crawl. Defaults to the URL given to the constructor.

    Returns:
      List of absolute URLs in discovery order. The root itself is not
      included, and every URL appears once.

    Raises:
      requests.HTTPError: if any visited page answers with an error status
        (via ``raise_for_status``); other ``requests`` errors propagate too.
    """
    root = self.url if url is None else url
    sub_urls = []
    # Set for O(1) duplicate checks; seeded with the root so it is never reported.
    seen = {root}
    stack = [root]
    while stack:
      page_url = stack.pop()
      response = requests.get(page_url, timeout=self.REQUEST_TIMEOUT)
      response.raise_for_status()  # Raise an exception for bad responses
      soup = BeautifulSoup(response.content, 'html.parser')
      for link in soup.find_all('a', href=True):
        absolute_url = urljoin(page_url, link['href'])  # Make URL absolute
        # "#section" links point at the same document; drop the fragment so
        # one page is not fetched (and later indexed) several times.
        absolute_url = urlparse(absolute_url)._replace(fragment='').geturl()

        # Scope is always checked against the crawl root, not the page being
        # read, so links between sibling pages are followed as well.
        if absolute_url in seen or not absolute_url.startswith(root):
          continue
        seen.add(absolute_url)
        sub_urls.append(absolute_url)
        stack.append(absolute_url)

    return sub_urls



def main(url="https://mqtt.org/"):
  """Crawl ``url``, index the words of every discovered page and store the index.

  Args:
    url: Root of the site to crawl. Defaults to the MQTT home page.

  The index is built from the pages returned by ``Crawler.get_sub_urls`` and
  written to the database behind ``DBLink`` through ``DbService``.
  """
  dbService = DbService(DBLink)
  crawler = Crawler(url)
  sub_urls = crawler.get_sub_urls(url)

  index = {}
  for sub_url in sub_urls:
    queryService = QueryService(sub_url)
    soup = queryService.fetch_page()
    if soup is None:
      # fetch_page returns None for non-200 answers; there is no text to index.
      continue
    index = queryService.index_words(soup, index, sub_url)

  # Stop words only need to be removed once, after all pages are counted:
  # dropping them after every page gives the same final index.
  if index:
    index = QueryService(url).remove_stop_words(index)
  dbService.insert_to_db(index)


if __name__ == '__main__':
  main()

# Search Engine UI

In [22]:

def search_word(query):
    """Look up every word of ``query`` in the stored index and format a report.

    Args:
        query: Free text typed by the user. It is lower-cased and split into
            ``\\w+`` words; a word typed more than once is searched once.

    Returns:
        A human-readable string: either a hint/error message (empty query,
        no index in the database, no usable words, nothing found) or the list
        of matching URLs, ordered by how many of the search terms each page
        contains, followed by the terms that were not found.
    """
    if not query:
        return "Please enter a search term"

    # Get the index from the database
    index = DbService(DBLink).get_from_db()
    if not index:
        return "No index found in the database. Please run the indexing process first."

    # Split the query into words. dict.fromkeys drops repeated terms while
    # keeping their first-seen order; otherwise "mqtt mqtt" would list the
    # word twice per page and inflate both the ranking and the "Found" count.
    words = list(dict.fromkeys(re.findall(r'\w+', query.lower())))
    if not words:
        return "Please enter valid search terms"

    all_results = {}        # url -> search terms found on that page
    word_appearances = {}   # term -> total occurrences recorded in the index
    words_not_found = []

    for word in words:
        if word not in index:
            words_not_found.append(word)
            continue

        entry = index[word]
        word_appearances[word] = entry["Appearences"]
        # Tolerate an entry stored without a DocIDs list instead of raising KeyError.
        for url in entry.get("DocIDs", []):
            all_results.setdefault(url, []).append(word)

    if not all_results:
        return f"No results found for any of the search terms: {', '.join(words)}"

    total_urls = len(all_results)
    total_appearances = sum(word_appearances.values())

    result = f"Found {len(words) - len(words_not_found)} of {len(words)} search terms in {total_urls} pages with {total_appearances} total appearances:\n\n"

    # Pages matching the most terms come first; ties keep discovery order.
    sorted_results = sorted(all_results.items(), key=lambda x: len(x[1]), reverse=True)

    for i, (url, found_words) in enumerate(sorted_results, 1):
        result += f"{i}. {url} \n Contains words: \t {', '.join(found_words)}\n\n"

    if words_not_found:
        result += f"\nTerms not found: {', '.join(words_not_found)}"

    return result

# Wrap search_word in a Gradio form: one free-text query box in, one text report out
search_interface = gr.Interface(
    title="Multi-Word Search Engine",
    description="Search for multiple words and find the URLs where they appear.",
    fn=search_word,
    inputs=gr.Textbox(placeholder="Enter words to search..."),
    outputs=gr.Textbox(label="Search Results", lines=10),
    allow_flagging='never',
)

# Start serving the form, rendered inline in the notebook output
search_interface.launch(inline=True)



It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://40e20892b8c42cc205.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Sensor data caching and processing

In [None]:
# Data processing
# NOTE: no separate process_data() step exists; on_message hands the parsed JSON straight to send_to_db.

# Firebase Realtime Database that receives the sensor readings
db_url = "https://project-pheonix-39eef-default-rtdb.europe-west1.firebasedatabase.app/"
# Client shared by the cells below (send_to_db writes through it)
FBconn = firebase.FirebaseApplication(db_url,None)

# Data saving in DB
def send_to_db(path, data):
  """Write ``data`` to the node ``/fake/<path>`` through the shared ``FBconn`` client."""
  node = f'/fake/{path}'
  FBconn.put('/', node, data)

# MQTT callback: runs when the broker answers the connection request
def on_connect(client, userdata, flags, rc):
  """Connection callback: subscribe to both D106 sensor topics when ``rc`` is 0.

  Any other return code is only reported; nothing is subscribed.
  """
  if rc != 0:
    print(f"Failed to connect, return code {rc}")
    return

  print("Connected to MQTT Broker!\nSubscribing to topics")

  # Subscribe to the relevant topics
  for topic in ("braude/D106/indoor", "braude/D106/outdoor"):
    client.subscribe(topic)

  print("Successfully subscribed to topics!")

def on_disconnect(client, userdata, rc):
  """Disconnect callback: try to reconnect after an unexpected disconnection.

  A return code of 0 is treated as a requested disconnect and ignored. For any
  other code up to 5 reconnection attempts are made, 5 seconds apart, and the
  retries stop as soon as one attempt succeeds.
  """
  if rc == 0:
    return

  for attempt in range(1, 6):
    print(f"Unexpected disconnection (error code: {rc}). Attempting to reconnect number {attempt} in 5 seconds...")
    time.sleep(5)
    try:
      client.reconnect()
    except Exception as e:
      print(f"Reconnection attempt failed: {e}")
    else:
      # Connected again: do not keep sleeping and reconnecting a live session.
      break

def on_message(client, userdata, msg):
  """Message callback: parse a JSON sensor reading and store it by topic.

  Readings from ``braude/D106/indoor`` / ``braude/D106/outdoor`` are written
  with ``send_to_db`` under ``indoor/<unix seconds>`` / ``outdoor/<unix seconds>``.
  Messages on any other topic are parsed and printed but not stored. Payloads
  that are not valid UTF-8 or not valid JSON are reported and dropped, so a
  malformed message cannot raise out of the callback.
  """
  # Topic -> database branch for the topics that are persisted.
  branches = {
    "braude/D106/indoor": "indoor",
    "braude/D106/outdoor": "outdoor",
  }

  topic = msg.topic
  try:
    payload = msg.payload.decode('utf-8')  # Decode the byte string to a string
  except UnicodeDecodeError as e:
    print(f"Error decoding payload on topic '{topic}': {e}")
    return

  print(f"Received JSON message on topic '{topic}': {payload}")
  try:
    sensor_data = json.loads(payload)
  except json.JSONDecodeError as e:
    print(f"Error decoding JSON: {e}")
    print(f"Problematic payload: {payload}")
    return

  print(f"Parsed JSON data: {sensor_data}")

  branch = branches.get(topic)
  if branch is not None:
    # Keyed by whole seconds since the epoch at arrival time.
    send_to_db(f"{branch}/{int(time.time())}", sensor_data)

# Connect to the MQTT broker and register the three callbacks defined in this cell
# NOTE(review): the captured cell output shows a warning pointing at the Client()
# line - presumably the paho-mqtt callback-API deprecation; confirm before upgrading.
client = mqtt.Client()
client.on_connect = on_connect
client.on_disconnect = on_disconnect
client.on_message = on_message
client.connect("broker.hivemq.com", 1883, keepalive = 600)
# Start paho's network loop so the callbacks fire while this cell keeps running
client.loop_start()

# Collect messages for 5 seconds, then stop the loop and close the connection
time.sleep(5)
client.loop_stop()
client.disconnect()

  client = mqtt.Client()


Connected to MQTT Broker!
Subscribing to topics
Successfully subscribed to topics!


<MQTTErrorCode.MQTT_ERR_SUCCESS: 0>

# Screens

In [None]:
import plotly.graph_objects as go
from datetime import datetime, timezone
# Sensor data pulling from DB
# FBconn.get may return None (for example when nothing is stored under the
# node yet); fall back to an empty dict so the cell still runs.
data = FBconn.get('/fake/indoor',None) or {}
data_keys = list(data.keys())
data_values = list(data.values())
# Keys are Unix timestamps in seconds (written by on_message); show them as UTC
# clock times. fromtimestamp(..., tz=timezone.utc) replaces the deprecated
# utcfromtimestamp and formats to the same string.
readable_times = [datetime.fromtimestamp(int(ts), tz=timezone.utc).strftime('%H:%M:%S') for ts in data_keys]

#values for each indoor sensor
data_values_Distance = [value['Distance'] for value in data_values]
data_values_Temperature = [value['Temperature'] for value in data_values]
data_values_Humidity = [value['Humidity'] for value in data_values]
data_values_Pressure = [value['Pressure'] for value in data_values]

# Data visualization
def plot_graph(name):
    """Return a Plotly line chart of the ``name`` reading against ``readable_times``.

    ``name`` is used as the key into every record of the module-level
    ``data_values`` list and as the trace name, y-axis label and title part.
    """
    series = [reading[name] for reading in data_values]
    trace = go.Scatter(x=readable_times, y=series, mode='lines+markers', name=name)

    fig = go.Figure()
    fig.add_trace(trace)
    fig.update_layout(title=f'Sensor {name} Over Time', xaxis_title='Time', yaxis_title=name)
    return fig

# Gradio interface: choose a sensor in the dropdown, get its graph from plot_graph
gr.Interface(
    fn=plot_graph,
    inputs=gr.Dropdown(['Distance','Temperature','Humidity','Pressure']),
    outputs=gr.Plot(label="Graph"),
).launch(inline=True)


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d8a5920b371ce9e002.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


