In [None]:
#install the Pyterrier framework
!pip install python-terrier
# install the nltk modules
!pip install nltk

Collecting python-terrier
  Downloading python-terrier-0.10.1.tar.gz (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wget (from python-terrier)
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyjnius>=1.4.2 (from python-terrier)
  Downloading pyjnius-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting matchpy (from python-terrier)
  Downloading matchpy-0.5.5-py3-none-any.whl (69 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecated (from python-terrier)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting chest (fro

In [None]:
import pyterrier as pt

if not pt.started():
  # In this lab, we need to specify that we start PyTerrier with PRF enabled
  pt.init(boot_packages=["com.github.terrierteam:terrier-prf:-SNAPSHOT"])

terrier-assemblies 5.9 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.8 jar not found, downloading to /root/.pyterrier...
Done
terrier-prf -SNAPSHOT jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.10.1 has loaded Terrier 5.9 (built by craigm on 2024-05-02 17:40) and terrier-helper 0.0.8



In [None]:
# Import Libraries

import re
import pandas as pd
import nltk
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub

from nltk.stem import *
from nltk.stem.porter import *
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


# **1) Data Collection**

In [None]:
# Load the Vaswani dataset through PyTerrier (no CSV file is read in this cell)

import pandas as pd

vaswani_dataset = pt.datasets.get_dataset("vaswani")

# NOTE(review): get_topics() returns a frame with 'qid' and 'query' columns, i.e.
# the dataset's topics; the rest of the notebook indexes these rows as if they
# were documents -- confirm this is intended.
df = vaswani_dataset.get_topics()

# Use the row index as the document number
df['docno'] = df.index

# Rename column 'query' to 'Text'
df = df.rename(columns={'query': 'Text'})

# Relevance judgements, with docno cast to str
qrels = vaswani_dataset.get_qrels()

qrels['docno']=qrels['docno'].astype(str)

df

Downloading vaswani qrels to /root/.pyterrier/corpora/vaswani/qrels


qrels:   0%|          | 0.00/6.63k [00:00<?, ?iB/s]

Unnamed: 0,qid,Text,docno
0,1,measurement of dielectric constant of liquids ...,0
1,2,mathematical analysis and design details of wa...,1
2,3,use of digital computers in the design of band...,2
3,4,systems of data coding for information transfer,3
4,5,use of programs in engineering testing of comp...,4
...,...,...,...
88,89,tunnel diode construction and its electrical c...,88
89,90,electronic density of states at the surface of...,89
90,91,resistivity of metallic thin films related to ...,90
91,92,the phenomenon of radiation caused by charged ...,91


In [None]:
# Download the stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download NLTK resources

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Get the list of English stopwords

stop_words = set(stopwords.words('english'))

In [None]:
# Initialize Porter stemmer

stemmer = PorterStemmer()

# **2) Preprocessing**

In [None]:
# 1) Clean

def clean(text):
   """Normalise raw text before stop-word removal and stemming.

   The input is coerced to ``str`` and the substitutions below are applied
   in order; each match is replaced by a single space. The result has its
   whitespace collapsed and its ends stripped.
   """
   substitutions = (
      r"http\S+",                         # urls
      r"RT ",                             # retweet marker
      r"@[\w]*",                          # handles
      r"[\.\,\#_\|\:\?\?\/\=]",           # special characters
      r'\t',                              # tabs
      r'\n',                              # line breaks
      r"\s+",                             # runs of white space
   )

   cleaned = str(text)
   for pattern in substitutions:
      cleaned = re.sub(pattern, " ", cleaned)

   return cleaned.strip()

df["processed_text"] = df["Text"].apply(clean)
df

Unnamed: 0,qid,Text,docno,processed_text
0,1,measurement of dielectric constant of liquids ...,0,measurement of dielectric constant of liquids ...
1,2,mathematical analysis and design details of wa...,1,mathematical analysis and design details of wa...
2,3,use of digital computers in the design of band...,2,use of digital computers in the design of band...
3,4,systems of data coding for information transfer,3,systems of data coding for information transfer
4,5,use of programs in engineering testing of comp...,4,use of programs in engineering testing of comp...
...,...,...,...,...
88,89,tunnel diode construction and its electrical c...,88,tunnel diode construction and its electrical c...
89,90,electronic density of states at the surface of...,89,electronic density of states at the surface of...
90,91,resistivity of metallic thin films related to ...,90,resistivity of metallic thin films related to ...
91,92,the phenomenon of radiation caused by charged ...,91,the phenomenon of radiation caused by charged ...


In [None]:
# 2) Remove stop words

def remove_stop(text) :
  """Drop English stop words from ``text``.

  Args:
    text: value to filter; it is coerced to ``str`` and tokenised with
      ``word_tokenize``.

  Returns:
    The surviving tokens, in their original case, joined by single spaces.
  """
  tokens = word_tokenize(str(text))

  # The NLTK stop-word list is lower-case, so compare the lower-cased token;
  # otherwise "The" or "Of" slip through and are lower-cased later by the stemmer.
  kept = [token for token in tokens if token.lower() not in stop_words]

  return ' '.join(kept)


df["processed_text"] = df["processed_text"].apply(remove_stop)
df

Unnamed: 0,qid,Text,docno,processed_text
0,1,measurement of dielectric constant of liquids ...,0,measurement dielectric constant liquids use mi...
1,2,mathematical analysis and design details of wa...,1,mathematical analysis design details waveguide...
2,3,use of digital computers in the design of band...,2,use digital computers design band pass filters...
3,4,systems of data coding for information transfer,3,systems data coding information transfer
4,5,use of programs in engineering testing of comp...,4,use programs engineering testing computers
...,...,...,...,...
88,89,tunnel diode construction and its electrical c...,88,tunnel diode construction electrical character...
89,90,electronic density of states at the surface of...,89,electronic density states surface semiconducto...
90,91,resistivity of metallic thin films related to ...,90,resistivity metallic thin films related surfac...
91,92,the phenomenon of radiation caused by charged ...,91,phenomenon radiation caused charged particles ...


In [None]:
# 3) Stemming

def steeming(text) :
  """Stem every token of ``text`` with the notebook-level ``stemmer``.

  Args:
    text: string to tokenise with ``word_tokenize``.

  Returns:
    The stemmed tokens joined by single spaces.
  """
  return ' '.join(stemmer.stem(token) for token in word_tokenize(text))

df["processed_text"] = df["processed_text"].apply(steeming)
df

Unnamed: 0,qid,Text,docno,processed_text
0,1,measurement of dielectric constant of liquids ...,0,measur dielectr constant liquid use microwav t...
1,2,mathematical analysis and design details of wa...,1,mathemat analysi design detail waveguid fed mi...
2,3,use of digital computers in the design of band...,2,use digit comput design band pass filter given...
3,4,systems of data coding for information transfer,3,system data code inform transfer
4,5,use of programs in engineering testing of comp...,4,use program engin test comput
...,...,...,...,...
88,89,tunnel diode construction and its electrical c...,88,tunnel diod construct electr characterist explain
89,90,electronic density of states at the surface of...,89,electron densiti state surfac semiconductor co...
90,91,resistivity of metallic thin films related to ...,90,resist metal thin film relat surfac rough
91,92,the phenomenon of radiation caused by charged ...,91,phenomenon radiat caus charg particl move vari...


# **3) Indexing**

In [None]:
# Convert the docno column to strings before indexing
df['docno'] = df['docno'].astype(str)

# Then build the index from the processed text and the document numbers
indexer = pt.DFIndexer("./myFirstIndex", overwrite=True)
index_ref = indexer.index(df["processed_text"], df["docno"])
index = pt.IndexFactory.of(index_ref)

In [None]:
print(index_ref.toString())
#we will first load the index
index = pt.IndexFactory.of(index_ref)
#we will call getCollectionStatistics() to check the stats
print(index.getCollectionStatistics().toString())

print(index.toString())

./myFirstIndex/data.properties
Number of documents: 93
Number of terms: 339
Number of postings: 646
Number of fields: 0
Number of tokens: 653
Field names: []
Positions:   false

/content/myFirstIndex/data.properties


In [None]:
for kv in index.getLexicon():
  print("%s -> %s " % (kv.getKey(), kv.getValue().toString()))

absorpt -> term112 Nt=1 TF=1 maxTF=1 @{0 0 0} 
abstract -> term138 Nt=2 TF=2 maxTF=1 @{0 1 2} 
activ -> term272 Nt=2 TF=2 maxTF=1 @{0 4 0} 
adder -> term207 Nt=1 TF=1 maxTF=1 @{0 6 0} 
advantag -> term284 Nt=1 TF=1 maxTF=1 @{0 7 4} 
affect -> term294 Nt=1 TF=1 maxTF=1 @{0 9 2} 
altitud -> term179 Nt=1 TF=1 maxTF=1 @{0 11 0} 
amplifi -> term72 Nt=8 TF=8 maxTF=1 @{0 12 4} 
analogu -> term167 Nt=4 TF=4 maxTF=1 @{0 19 4} 
analysi -> term11 Nt=4 TF=4 maxTF=1 @{0 23 4} 
approach -> term78 Nt=1 TF=1 maxTF=1 @{0 28 0} 
approxim -> term88 Nt=1 TF=1 maxTF=1 @{0 29 0} 
arc -> term44 Nt=1 TF=1 maxTF=1 @{0 30 2} 
arithmet -> term211 Nt=1 TF=1 maxTF=1 @{0 31 2} 
articl -> term301 Nt=2 TF=2 maxTF=1 @{0 32 6} 
atmosph -> term105 Nt=3 TF=3 maxTF=1 @{0 34 6} 
attenu -> term20 Nt=3 TF=3 maxTF=1 @{0 37 4} 
audio -> term277 Nt=1 TF=1 maxTF=1 @{0 40 4} 
aurora -> term182 Nt=2 TF=2 maxTF=1 @{0 42 2} 
avail -> term218 Nt=1 TF=1 maxTF=1 @{0 45 0} 
back -> term147 Nt=1 TF=1 maxTF=1 @{0 46 4} 
balloon -> term176

# **4) Query Processing**

In [None]:
def preprocess(sentence):
  """Apply the document preprocessing pipeline to a query string.

  The steps run in the same order used to build ``processed_text``
  (clean, then stop-word removal, then stemming) so that query terms and
  indexed terms are produced the same way. Cleaning must come first:
  tokenising before cleaning splits URLs apart so the URL pattern no longer
  matches, and stop words exposed by cleaning (e.g. "of_the") would survive.

  Args:
    sentence: raw query text.

  Returns:
    The cleaned, stop-word-free, stemmed query as a single string.
  """
  sentence = clean(sentence)
  sentence = remove_stop(sentence)
  sentence = steeming(sentence)

  return sentence

In [None]:
query="measurement"
query = preprocess(query)
query

'measur'

In [None]:
splited_query = query.split()
len(splited_query)

1

In [None]:
# Identify the documents that have the query

# The `text` argument below receives the processed_text column
def se(text , doc_ids , query):
 """Return the ids of the documents that contain ``query`` as a token.

 Each entry of ``text`` is tokenised with ``word_tokenize`` and paired with
 the entry of ``doc_ids`` at the same position.

 Args:
   text: iterable of document strings.
   doc_ids: iterable of document ids, parallel to ``text``.
   query: a single term. It is compared to each token with equality, so a
     multi-word string never matches.

 Returns:
   A list of the matching document ids, each listed once, in document order.
 """
 # Pair ids and texts by position with zip() rather than text[i] / doc_ids[i]:
 # on a pandas Series, [i] is a label lookup and fails when the index is not 0..n-1.
 docs_dictionary = dict(zip(doc_ids, (word_tokenize(doc) for doc in text)))

 # Keys are unique, so every matching document appears exactly once.
 return [doc_id for doc_id, terms in docs_dictionary.items() if query in terms]

dici = se(df["processed_text"] , df["docno"] , query)

dici = [int(x) for x in dici]

dici

[0, 7, 34]

In [None]:
# Retrieve documents that contain all the terms from the query

def Ret(docs):
  """Print the original text of every document number in ``docs``.

  The text is looked up in the "Text" column of the notebook-level ``df``.
  """
  original_texts = df["Text"]

  for doc_number in docs:
    print(f"Doc Number {doc_number} ---> {original_texts[doc_number]}")

Ret(dici)

Doc Number 0 ---> measurement of dielectric constant of liquids by the use of microwave techniques
Doc Number 7 ---> measurement of plasma temperatures in arc discharge using shock wave techniques
Doc Number 34 ---> measurements of ionospheric drifts near the equator


In [None]:
# Rank the retrieved documents with a ranking algorithm (TF-IDF)

tfidf_retr = pt.BatchRetrieve(index, controls = {"wmodel": "TF_IDF"})

In [None]:
results=tfidf_retr.search(query)
results

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,34,34,0,3.091368,measur
1,1,0,0,1,2.899859,measur
2,1,7,7,2,2.580178,measur


# **5) Query expansion**

In [None]:
import pandas as pd
import pyterrier as pt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
pd.set_option('display.max_colwidth', 150)

## **Query Expansion using RM3**

In [None]:
# Need to install additional terrier package for PRF. It will take around 1 min
!git clone https://github.com/terrierteam/terrier-prf/
!apt-get install maven   #used for Java projects to manage project dependencies and build processes
%cd /content/terrier-prf/
!mvn install
!pwd
%cd ..

Cloning into 'terrier-prf'...
remote: Enumerating objects: 196, done.[K
remote: Counting objects: 100% (196/196), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 196 (delta 52), reused 173 (delta 36), pack-reused 0[K
Receiving objects: 100% (196/196), 28.00 KiB | 3.11 MiB/s, done.
Resolving deltas: 100% (52/52), done.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libaopalliance-java libapache-pom-java libatinject-jsr330-api-java libcdi-api-java
  libcommons-cli-java libcommons-io-java libcommons-lang3-java libcommons-parent-java
  libgeronimo-annotation-1.3-spec-java libgeronimo-interceptor-3.0-spec-java libguava-java
  libguice-java libhawtjni-runtime-java libjansi-java libjansi-native-java libjsr305-java
  libmaven-parent-java libmaven-resolver-java libmaven-shared-utils-java libmaven3-core-java
  libplexus-cipher-java libplexus-classworlds-java libpl

In [None]:
# Define our retrieval model
bm25 = pt.BatchRetrieve(index, wmodel="BM25",num_results=10)

result = bm25.search(query)
result

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,34,34,0,5.318946,measur
1,1,0,0,1,4.98944,measur
2,1,7,7,2,4.439402,measur


In [None]:
# Original text of the top-5 documents of the BM25 run computed above
# (`result`), not of the earlier TF-IDF run stored in `results`.
df.loc[df['docno'].isin(result['docno'].head(5).tolist()), ['Text']]

Unnamed: 0,Text
0,measurement of dielectric constant of liquids by the use of microwave techniques
7,measurement of plasma temperatures in arc discharge using shock wave techniques
34,measurements of ionospheric drifts near the equator


In [None]:
# "rewrite" function from PyTerrier will be used to expand queries specifying RM3 as the model
# fb_docs ==> no. expansion documents
# fb_terms ==> no. expansion terms
rm3_expander = pt.rewrite.RM3(index,fb_terms=10, fb_docs=100)

#output of the BM25 will be fed into the RM3 expander for query expansion.
rm3_qe = bm25 >> rm3_expander
expanded_query = rm3_qe.search(query).iloc[0]["query"]

expanded_query

'applypipeline:off constant^0.031249994 drift^0.037499998 ionosph^0.037499998 equat^0.037499998 techniqu^0.054687496 measur^0.692187488 dielectr^0.031249994 discharg^0.023437496 microwav^0.031249994 wave^0.023437496'

In [None]:
# Just print the expanded query with term scores
for s in expanded_query.split()[1:]:
  print(s)

print("\n" + query)

constant^0.031249994
drift^0.037499998
ionosph^0.037499998
equat^0.037499998
techniqu^0.054687496
measur^0.692187488
dielectr^0.031249994
discharg^0.023437496
microwav^0.031249994
wave^0.023437496

measur


In [None]:
# After that you can search using the expanded query
expanded_query_formatted = ' '.join(expanded_query.split()[1:])

results_wqe = bm25.search(expanded_query_formatted)

print("   Before Expansion    After Expansion")
print(pd.concat([results[['docid','score']][0:5].add_suffix('_1'),
            results_wqe[['docid','score']][0:5].add_suffix('_2')], axis=1).fillna(''))

#Let's check the tweets text for the top 5 retrieved tweets
df[['Text']][df['docno'].isin(results_wqe['docno'].loc[0:5].tolist())]

   Before Expansion    After Expansion
  docid_1   score_1  docid_2   score_2
0    34.0  3.091368       34  6.293362
1     0.0  2.899859        0  6.174352
2     7.0  2.580178        7  5.054554
3                         40  0.428144
4                         24  0.395350


Unnamed: 0,Text
0,measurement of dielectric constant of liquids by the use of microwave techniques
7,measurement of plasma temperatures in arc discharge using shock wave techniques
24,equations governing the propagation of electromagnetic and hydromagnetic waves in the solar corona
34,measurements of ionospheric drifts near the equator
40,ferromagnetic techniques for computer stores
81,analysis of nonlinear systems using phase plane techniques


## **Query Expansion using pre-built mappings:**

In [None]:
# Function that collects all the terms of every document


def sk(text):
 """Tokenise every document in ``text``.

 Args:
   text: iterable of document strings.

 Returns:
   A list with one token list per document, in iteration order.
 """
 # Iterate over the values directly: text[i] on a pandas Series is a label
 # lookup and fails when the index is not 0..n-1.
 return [word_tokenize(doc) for doc in text]


coll = sk(df["processed_text"])


In [None]:
from scipy import spatial
import gensim
from gensim.models import Word2Vec

In [None]:


# Train skip-gram model
model = Word2Vec(sentences = coll,
                 sg=1,
                 vector_size=100,
                 window=2,
                 min_count=1,
                 workers=4,
                 epochs=20)

# Get word embeddings
word_embeddings = model.wv

In [None]:
# Expand the query using the (most_similar) function

qrr = "input"

qrr = preprocess(qrr)

mo = word_embeddings.most_similar(qrr)

m = mo[0][0]

print(f"Expanded Query is: {qrr} {m}")

Expanded Query is: input capacitor


In [None]:
# Expand the query using a from-scratch similarity search:
# pick the collection term whose embedding has the highest cosine
# similarity to any query term (excluding the query term itself).

c = qrr.split()

# Named `best_similarity` rather than `max` so the built-in max() is not shadowed
best_similarity = 0

expandd = ''

for doc_terms in coll:
  for candidate in doc_terms:
   for query_term in c:
    if candidate != query_term:
      term_emb = word_embeddings[candidate]
      qur_emb =  word_embeddings[query_term]
      similarity =  1 - spatial.distance.cosine(term_emb, qur_emb)

      if similarity >= best_similarity:
       expandd = candidate
       best_similarity = similarity


print(f"Expanded Query is: {qrr} {expandd}")

Expanded Query is: input capacitor


# **6) User Interface**

In [None]:
!pip install flask_ngrok

Collecting flask_ngrok
  Downloading flask_ngrok-0.0.25-py3-none-any.whl (3.1 kB)
Installing collected packages: flask_ngrok
Successfully installed flask_ngrok-0.0.25


In [None]:
df2 = df

df2 = df2.to_dict()

df2

{'qid': {0: '1',
  1: '2',
  2: '3',
  3: '4',
  4: '5',
  5: '6',
  6: '7',
  7: '8',
  8: '9',
  9: '10',
  10: '11',
  11: '12',
  12: '13',
  13: '14',
  14: '15',
  15: '16',
  16: '17',
  17: '18',
  18: '19',
  19: '20',
  20: '21',
  21: '22',
  22: '23',
  23: '24',
  24: '25',
  25: '26',
  26: '27',
  27: '28',
  28: '29',
  29: '30',
  30: '31',
  31: '32',
  32: '33',
  33: '34',
  34: '35',
  35: '36',
  36: '37',
  37: '38',
  38: '39',
  39: '40',
  40: '41',
  41: '42',
  42: '43',
  43: '44',
  44: '45',
  45: '46',
  46: '47',
  47: '48',
  48: '49',
  49: '50',
  50: '51',
  51: '52',
  52: '53',
  53: '54',
  54: '55',
  55: '56',
  56: '57',
  57: '58',
  58: '59',
  59: '60',
  60: '61',
  61: '62',
  62: '63',
  63: '64',
  64: '65',
  65: '66',
  66: '67',
  67: '68',
  68: '69',
  69: '70',
  70: '71',
  71: '72',
  72: '73',
  73: '74',
  74: '75',
  75: '76',
  76: '77',
  77: '78',
  78: '79',
  79: '80',
  80: '81',
  81: '82',
  82: '83',
  83: '84',
  84

In [None]:
q = "mohamed"
q = q.split()
q

['mohamed']

In [None]:
def sui(df2 , que):
 """Return one display line per document that matches any query term.

 The query is run through ``preprocess`` and split into terms. Documents are
 scanned once per term, so results are ordered by query term first and by
 document position second. Each document is reported at most once.

 Args:
   df2: dict of columns, each mapping row number to value. The
     'processed_text' column is searched and the 'Text' column supplies the
     displayed text.
   que: raw query string.

 Returns:
   A list of strings of the form 'Document number <i> -----> <original text>'.
   The list is empty when no term matches or 'processed_text' is missing.
 """
 query_terms = preprocess(que).split()

 # A missing column means there is nothing to search.
 processed_docs = list(df2.get('processed_text', {}).values())

 # Track matched document numbers separately: docs_id holds display strings,
 # so testing an int against it could never detect a duplicate.
 matched = set()
 docs_id = []

 for term in query_terms:
  for i, doc in enumerate(processed_docs):
   if i not in matched and term in doc.split():
    matched.add(i)
    # Read the text from the df2 argument instead of the notebook-level df
    docs_id.append(f'''Document number {i} -----> {df2["Text"][i]}''')

 return docs_id

In [None]:
query2 = "measurement of temperatures electromagnetic"

x = sui(df2 , query2)
x

['Document number 0 -----> measurement of dielectric constant of liquids by the use of microwave techniques',
 'Document number 7 -----> measurement of plasma temperatures in arc discharge using shock wave techniques',
 'Document number 34 -----> measurements of ionospheric drifts near the equator',
 'Document number 7 -----> measurement of plasma temperatures in arc discharge using shock wave techniques',
 'Document number 11 -----> temperature independent methods for tuning highly stable high frequency oscillators',
 'Document number 25 -----> estimates of the density of ionization and temperature in the solar corona',
 'Document number 86 -----> electronic specific heat of a superconductor showing a discontinuity at the superconducting critical temperature',
 'Document number 24 -----> equations governing the propagation of electromagnetic and hydromagnetic waves in the solar corona',
 'Document number 63 -----> similarities between the diffraction theory of electromagnetic waves an

In [None]:
from google.colab.output import eval_js
print (eval_js("google.colab.kernel.proxyPort(5000)"))

https://rrfzw5j3p18-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [None]:
from flask import Flask, request
from flask_ngrok import run_with_ngrok

# Assuming you've already defined the sui function and imported necessary modules

app = Flask(__name__)
run_with_ngrok(app)

@app.route("/")
def home():
    """Serve the search page.

    Returns a single HTML string containing inline CSS, a query text box with
    a Search button, and a script that POSTs the query as JSON to ``/search``
    and renders each entry of the response's ``results`` list as a paragraph.
    """
    return """
    <style>
        body {
            background-color: white;
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
        }

        .header {
            background-color: black;
            color: white;
            padding: 20px 0;
        }

        .container {
            text-align: center;
        }

        h1 {
            text-align: center;
            margin: 0;
            padding: 10px 0;
        }

        #searchInput {
            padding: 10px;
            border: 1px solid #ccc;
            border-radius: 20px; /* Increased border-radius for a rounded appearance */
            margin-bottom: 10px;
            width: 300px; /* Adjust the width as needed */
            box-sizing: border-box; /* Include padding and border in the element's total width */
            transition: border-color 0.3s; /* Smooth transition for border color change */
        }

        #searchInput:focus {
            border-color: #007bff; /* Change border color on focus */
        }

        button {
            padding: 10px 20px;
            background-color: #007bff;
            color: white;
            border: none;
            border-radius: 20px; /* Increased border-radius for a rounded appearance */
            cursor: pointer;
            transition: background-color 0.3s; /* Smooth transition for background color change */
        }

        button:hover {
            background-color: #0056b3; /* Change background color on hover */
        }
    </style>

    <div class="header">
        <h1>Welcome to Emam's Search Engine</h1>
    </div>
    <div class="container">
        <input type="text" id="searchInput" placeholder="Enter your query...">
        <button onclick="search()">Search</button>
    </div>
    <div id="searchResult"></div>

    <script>
        function search() {
            var searchTerm = document.getElementById("searchInput").value;
            fetch('/search', {
                method: 'POST',
                body: JSON.stringify({ query: searchTerm }),
                headers:{
                    'Content-Type': 'application/json'
                }
            })
            .then(response => response.json())
            .then(data => {
                console.log("Received data:", data); // Debug: Check if data is received
                var resultDiv = document.getElementById("searchResult");
                resultDiv.innerHTML = "<h2>Relevant Documents IDs:</h2>";
                if (data.results.length === 0) {
                    resultDiv.innerHTML += "<p>No documents found</p>";
                } else {
                    data.results.forEach(doc => {
                        console.log("Displaying document:", doc); // Debug: Check if document is displayed
                        resultDiv.innerHTML += "<p>" + doc + "</p>";
                    });
                }
            })
            .catch(error => {
                console.error('Error occurred during fetch:', error); // Debug: Log fetch errors
            });
        }
    </script>
    """

@app.route("/search", methods=['POST'])
def search():
    """Handle a search request posted by the home page.

    Expects a JSON object with a string ``query`` field, runs it through
    ``sui`` against the notebook-level ``df2``, and returns
    ``{'results': [...]}``.

    A body that is not a JSON object, or that lacks a string ``query``, gets
    a 400 response that still carries an empty ``results`` list, so the
    page's script can render "No documents found" instead of failing.
    """
    # silent=True returns None instead of raising when the body is not valid JSON
    payload = request.get_json(silent=True)
    query = payload.get('query') if isinstance(payload, dict) else None

    if not isinstance(query, str):
        return {'results': [], 'error': "JSON body must contain a string 'query' field"}, 400

    print("Received query:", query)  # Debug: Check if Flask receives the query
    results = sui(df2, query)
    print("Search results:", results)  # Debug: Check if sui function returns results
    return {'results': results}

app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [11/May/2024 19:30:46] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [11/May/2024 19:30:47] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
Exception in thread Thread-23:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 203, in _new_conn
    sock = connection.create_connection(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 791, in urlopen
    response = self._make_request(
  File "

Received query: measurement of temperatures electromagnetic
Search results: ['Document number 0 -----> measurement of dielectric constant of liquids by the use of microwave techniques', 'Document number 7 -----> measurement of plasma temperatures in arc discharge using shock wave techniques', 'Document number 34 -----> measurements of ionospheric drifts near the equator', 'Document number 7 -----> measurement of plasma temperatures in arc discharge using shock wave techniques', 'Document number 11 -----> temperature independent methods for tuning highly stable high frequency oscillators', 'Document number 25 -----> estimates of the density of ionization and temperature in the solar corona', 'Document number 86 -----> electronic specific heat of a superconductor showing a discontinuity at the superconducting critical temperature', 'Document number 24 -----> equations governing the propagation of electromagnetic and hydromagnetic waves in the solar corona', 'Document number 63 -----> sim

# **7) Evaluation**

In [None]:
indexref2 = vaswani_dataset.get_index()
index2 = pt.IndexFactory.of(indexref2)

print(index2.getCollectionStatistics().toString())

Downloading vaswani index to /root/.pyterrier/corpora/vaswani/index


data.direct.bf:   0%|          | 0.00/388k [00:00<?, ?iB/s]

data.document.fsarrayfile:   0%|          | 0.00/234k [00:00<?, ?iB/s]

data.inverted.bf:   0%|          | 0.00/362k [00:00<?, ?iB/s]

data.lexicon.fsomapfile:   0%|          | 0.00/682k [00:00<?, ?iB/s]

data.lexicon.fsomaphash:   0%|          | 0.00/777 [00:00<?, ?iB/s]

data.lexicon.fsomapid:   0%|          | 0.00/30.3k [00:00<?, ?iB/s]

data.meta-0.fsomapfile:   0%|          | 0.00/725k [00:00<?, ?iB/s]

data.meta.idx:   0%|          | 0.00/89.3k [00:00<?, ?iB/s]

data.meta.zdata:   0%|          | 0.00/224k [00:00<?, ?iB/s]

data.properties:   0%|          | 0.00/4.29k [00:00<?, ?iB/s]

md5sums:   0%|          | 0.00/619 [00:00<?, ?iB/s]

Number of documents: 11429
Number of terms: 7756
Number of postings: 224573
Number of fields: 1
Number of tokens: 271581
Field names: [text]
Positions:   false



In [None]:
retr = pt.BatchRetrieve(index2, controls = {"wmodel": "TF_IDF"})

res = retr.search("measurement")
res

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,3012,3013,0,3.103851,measurement
1,1,8754,8755,1,3.074043,measurement
2,1,3180,3181,2,2.975172,measurement
3,1,5417,5418,3,2.975172,measurement
4,1,7439,7440,4,2.975172,measurement
...,...,...,...,...,...,...
995,1,1779,1780,995,1.475301,measurement
996,1,2254,2255,996,1.475301,measurement
997,1,2639,2640,997,1.475301,measurement
998,1,3682,3683,998,1.475301,measurement


In [None]:
# Evaluate the TF-IDF retriever over the Vaswani topic set.
# A run produced by retr.search("measurement") carries a single ad-hoc query
# under qid 1, so scoring it against the per-topic qrels is not meaningful.
# Running the retriever on the dataset's own topics makes the qids in the run
# line up with the qids in the qrels.
topics = vaswani_dataset.get_topics()
run = retr.transform(topics)

# Named `eval_metrics` so the built-in eval() is not shadowed
eval_metrics = pt.Evaluate(run, qrels)
eval_metrics

{'map': 8.465417545503938e-05, 'ndcg': 0.002206259359105843}