In [None]:
!pip install streamlit

In [None]:
!pip install python-terrier
!pip install nltk



In [None]:
%%writefile main.py
# Streamlit provides the web UI (`st`); PyTerrier (`pt`) provides indexing and retrieval.
import streamlit as st
import pyterrier as pt
# Initialise PyTerrier only if it has not been started already.
if not pt.started():
  pt.init()


import requests
from bs4 import BeautifulSoup
import os
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
# Raise pandas' per-cell display width to 150 characters for printed frames.
pd.set_option('display.max_colwidth', 150)
# NOTE(review): nltk and stopwords are imported a second time here (already imported above).
import nltk
from nltk.corpus import stopwords

# These downloads run at import time. `stop_words` is the English stop-word
# set used by Preprocess_my_text; 'punkt' is presumably the tokenizer data
# word_tokenize relies on — TODO confirm for the installed NLTK version.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nltk.download('punkt')
# NOTE(review): wildcard imports; PorterStemmer is already imported explicitly above.
from nltk.stem import *
from nltk.stem.porter import *
import math

from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): requests and BeautifulSoup are re-imported here (already imported above).
import requests
from bs4 import BeautifulSoup
def scrape_documents(topic, max_documents, timeout=10):
    """Scrape Wikipedia pages linked from a topic's article.

    Fetches ``https://en.wikipedia.org/wiki/<topic>``, collects the anchors
    carrying the ``mw-redirect`` class, and downloads at most
    ``max_documents`` of the pages they point to.

    Args:
        topic: Article title to start from. It is percent-encoded before
            being placed in the URL.
        max_documents: Upper bound on the number of linked pages to fetch.
            Zero or a negative value yields an empty list without any
            network access.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A list of dicts with the keys ``'title'`` (anchor text), ``'link'``
        (absolute URL) and ``'text'`` (text of the page's
        ``mw-parser-output`` div, or ``"No content found"``). The list is
        empty when the topic page cannot be retrieved.
    """
    # Function-scope import so the block does not depend on a file-level change.
    from urllib.parse import quote, urljoin

    # The original compared the loop index with `==`, so a non-positive limit
    # never triggered the break and every link was scraped.
    if max_documents <= 0:
        return []

    base_url = "https://en.wikipedia.org"
    # Encode the title so characters such as '?', '#' or '%' stay part of the
    # path instead of being read as URL syntax.
    search_url = f"{base_url}/wiki/{quote(topic.strip().replace(' ', '_'))}"

    try:
        response = requests.get(search_url, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to retrieve page:", exc)
        return []

    if response.status_code != 200:
        print("Failed to retrieve page:", response.status_code)
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    # href=True skips anchors that have no target (link['href'] would raise).
    document_links = soup.find_all('a', class_='mw-redirect', href=True)

    results = []
    for idx, link in enumerate(document_links, start=1):
        title = link.text.strip()
        href = urljoin(base_url, link['href'])

        # A single unreachable page must not abort the whole scrape.
        try:
            document_response = requests.get(href, timeout=timeout)
        except requests.RequestException:
            document_response = None

        if document_response is not None and document_response.status_code == 200:
            document_soup = BeautifulSoup(document_response.content, 'html.parser')
            document_content = document_soup.find('div', class_='mw-parser-output')
            if document_content:
                text = document_content.get_text(separator='\n')
            else:
                text = "No content found"

            results.append({
                'title': title,
                'link': href,
                'text': text,
            })

        # The limit counts attempted links, successful or not.
        if idx >= max_documents:
            break

    return results


def clean_words_and_remove_spaces(text):
    """Keep only ASCII letters and normalise the spacing.

    Every character of ``text`` that is not one of a-z / A-Z is replaced by a
    space. The result is then split on whitespace and re-joined with single
    spaces, which also removes leading and trailing blanks.
    """
    valid_characters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    letters_or_blanks = ''.join(
        symbol if symbol in valid_characters else ' '
        for symbol in text
    )

    # split() with no argument drops empty fields, collapsing runs of blanks.
    return ' '.join(letters_or_blanks.split())


def put_in_data_frame(all_results):
  """Turn the scraped records into a DataFrame with a 1-based ``docno`` column.

  Each entry of ``all_results`` becomes one row; ``docno`` numbers the rows
  1, 2, ..., len(rows) in their original order.
  """
  frame = pd.DataFrame(all_results)
  row_count = len(frame)
  return frame.assign(docno=range(1, row_count + 1))


def Preprocess_my_text(text):
    """Normalise a piece of text into space-separated, stemmed tokens.

    Steps, in order: remove URLs, remove the retweet marker ``RT`` and
    ``@mentions``, replace a fixed set of punctuation characters with
    spaces, collapse whitespace, tokenise with ``word_tokenize``, drop
    tokens found in the module-level ``stop_words`` set, and Porter-stem
    the lower-cased remainder.

    Args:
        text: The raw string to normalise.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty
        string when nothing survives).
    """
    stemmer = PorterStemmer()
    text = re.sub(r"http\S+", " ", text)
    # \b stops the marker from matching inside a longer word: the bare
    # pattern "RT " also turned "START now" into "STA now".
    text = re.sub(r"\bRT ", " ", text)
    text = re.sub(r"@\w*", " ", text)
    # Same character set as before, without the duplicated '?' and the
    # escapes that are unnecessary inside a character class.
    text = re.sub(r"[.,#_|:?/=]", " ", text)
    # \s+ already covers tabs and newlines, so one pass is sufficient.
    text = re.sub(r"\s+", " ", text)

    lowered_tokens = (token.lower() for token in word_tokenize(text))
    filtered_stemmed_tokens = [
        stemmer.stem(token) for token in lowered_tokens if token not in stop_words
    ]
    return ' '.join(filtered_stemmed_tokens)


def find_expand_and_rank(df,query):
  """Index the documents, then run the query with query expansion switched on.

  The ``cleaned_text`` and ``docno`` columns of ``df`` (both cast to str) are
  indexed into ``./DatasetIndex``, overwriting any index already there. The
  tokens in ``query`` are joined with spaces and searched with a TF_IDF
  retriever whose controls enable ``qe`` with the ``Bo1`` model.

  Returns whatever ``BatchRetrieve.search`` yields for that query string.
  """
  # Build the index from the two columns the retriever needs.
  dataset_indexer = pt.DFIndexer("./DatasetIndex", overwrite=True)
  document_texts = df["cleaned_text"].astype(str)
  document_ids = df["docno"].astype(str)
  index_ref = dataset_indexer.index(document_texts, document_ids)
  index_ref.toString()
  index = pt.IndexFactory.of(index_ref)

  # Show the query tokens, then retrieve with expansion enabled.
  print(query)
  expansion_controls = {"qemodel" : "Bo1", "qe" : "on"}
  retriever = pt.BatchRetrieve(index, wmodel="TF_IDF", controls=expansion_controls)
  query_string = ' '.join(query)
  return retriever.search(query_string)

def return_the_links(df, q_results):
    """Map retrieval results back to ``(title, link)`` pairs, in result order.

    Args:
        df: Document frame with ``docno``, ``title`` and ``link`` columns.
        q_results: Retrieval results with a ``docno`` column; its row order
            is taken as the ranking.

    Returns:
        A list of ``(title, link)`` tuples, one per result row whose
        ``docno`` also occurs in ``df``, ordered like the rows of
        ``q_results``.

    Neither input frame is modified.
    """
    # Cast the join key on copies; the original overwrote the callers' columns.
    ranked_keys = q_results[["docno"]].assign(docno=q_results["docno"].astype(str))
    documents = df[["docno", "title", "link"]].assign(docno=df["docno"].astype(str))

    # An inner merge keeps the order of the LEFT keys. With `df` on the left
    # (as before) the links came back in document order and the ranking was
    # lost, so the results frame has to be the left operand.
    merged_df = ranked_keys.merge(documents, on="docno", how="inner")
    return list(zip(merged_df["title"], merged_df["link"]))


def search(query,topic):
    """Scrape pages for ``topic``, index them, and return links matching ``query``.

    Args:
        query: Free-text query; it goes through ``Preprocess_my_text`` before
            retrieval.
        topic: Wikipedia article title to scrape from. The value ``"auto"``
            (any letter case) is replaced by ``"science"``.

    Returns:
        A list of ``(title, link)`` tuples as produced by
        ``return_the_links``. The list is empty when the query has no terms
        left after preprocessing or when no documents could be scraped.
    """
    if topic.lower() == "auto":
        topic = "science"

    # Preprocess the query before any network work: a query made only of stop
    # words leaves nothing to search for, so the scrape can be skipped.
    query_terms = Preprocess_my_text(query).split()
    if not query_terms:
        return []

    documents_per_topic = 20  # changeable
    all_results = scrape_documents(topic, documents_per_topic)
    # An empty scrape yields a frame without a 'text' column, which made
    # df['text'] below raise KeyError.
    if not all_results:
        return []

    # Strip non-letters first so the tokenizer sees the same input as before.
    for document in all_results:
        document['text'] = clean_words_and_remove_spaces(document['text'])

    df = put_in_data_frame(all_results)
    df['cleaned_text'] = df['text'].apply(Preprocess_my_text).apply(clean_words_and_remove_spaces)
    df = df.drop(columns=["text"]).dropna()
    if df.empty:
        return []

    q_results = find_expand_and_rank(df, query_terms)
    return return_the_links(df, q_results)

def main():
    """Render the search page and list the results of a submitted query."""
    st.title("Browser")
    st.text("Enter the general topic then what you are searching for :)")

    search_topic = st.text_input("Topic:")
    search_query = st.text_input(label="Query:")
    submit = st.button("Search")

    # Nothing to show until the button is pressed with both fields filled in.
    if not (submit and search_query and search_topic):
        return

    results = search(search_query,search_topic)
    st.write("Search Results:")
    for title, link in results:
        st.write(f"{title}: {link}")


if __name__ == "__main__":
    main()

In [None]:
# --yes accepts npx's install prompt for localtunnel up front; without it the
# command stops at "Ok to proceed? (y)" and the tunnel never starts.
!streamlit run main.py & npx --yes localtunnel --port 8501


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://35.197.39.220:8501[0m
[0m
[1G[0JNeed to install the following packages:
  localtunnel@2.0.2
Ok to proceed? (y) [20G