In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from urllib.parse import urljoin
from collections import defaultdict
from urllib.robotparser import RobotFileParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk
import re
nltk.download('stopwords')
nltk.download('wordnet')

# Rest of your code goes here...


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Crawl the centre's publication listing page by page and collect one record
# per publication into `df`, which is also written to complete_data.csv.

# Initialize a RobotFileParser for the website's robots.txt file
robot_parser = RobotFileParser()
robot_parser.set_url('https://pureportal.coventry.ac.uk/robots.txt')
robot_parser.read()

base_url = 'https://pureportal.coventry.ac.uk/en/organisations/ihw-centre-for-intelligent-healthcare-cih/publications/'

url = base_url

# Column order of the resulting DataFrame / CSV
PUBLICATION_COLUMNS = ['Title', 'Authors', 'ResearchAreas', 'PublicationLink', 'Date', 'JournalNumber',
                       'NumberOfPages', 'PersonLinks']

# Seconds to wait for the server before giving up on a page
REQUEST_TIMEOUT = 30


def _parse_result_item(li):
    """Turn one result <li> into a record dict keyed by PUBLICATION_COLUMNS.

    Returns None when the item has no <h3 class="title">; every other field
    hangs off that heading, so such an item cannot be described.
    """
    title_tag = li.find('h3', class_='title')
    if title_tag is None:
        return None

    def span_text(css_class):
        # Stripped text of the first <span class=css_class>, or None if absent
        tag = li.find('span', class_=css_class)
        return tag.get_text(strip=True) if tag else None

    a_tag = title_tag.find('a')
    publication_link = a_tag.get('href') if a_tag else None

    # Authors are read from the first four <span>/<a> siblings after the title.
    # Linked authors (<a>) also contribute a profile URL, keyed by the author's
    # position in `names`.
    # NOTE(review): the saved output contains an Authors value ending in
    # "Sept 2023, 1 ed.", so these siblings are not always authors - confirm
    # the selector against the page markup.
    names = []
    person_links = defaultdict(list)
    for sibling in title_tag.find_next_siblings(['span', 'a'], limit=4):
        if sibling.name == 'a':
            names.append(sibling.span.text if sibling.span else sibling.text)
            if 'href' in sibling.attrs:
                person_links[len(names) - 1].append(sibling['href'])
        else:
            names.append(sibling.text)

    concepts = [tag.text for tag in li.find_all('span', class_='concept')]

    return {
        'Title': title_tag.text,
        'Authors': ', '.join(names),
        'ResearchAreas': ', '.join(concepts),
        'PublicationLink': publication_link,
        'Date': span_text('date'),
        'JournalNumber': span_text('journalnumber'),
        'NumberOfPages': span_text('numberofpages'),
        'PersonLinks': dict(person_links),
    }


# Rows are accumulated in a list and turned into a DataFrame once at the end;
# concatenating a one-row frame per publication copies the whole table each time.
records = []

while url:
    # Check if crawling the current URL is allowed by robots.txt
    if not robot_parser.can_fetch('*', url):
        print(f"Crawling of {url} is not allowed by robots.txt")
        break

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print(f"Request for {url} failed: {exc}")
        break

    # Stop on an error status: `url` would otherwise never change and the loop
    # would re-request the same failing page forever.
    if response.status_code != 200:
        print(f"Unexpected status {response.status_code} for {url}; stopping crawl")
        break

    soup = BeautifulSoup(response.text, 'html.parser')
    ul = soup.find('ul', class_='list-results')

    if ul:
        for li in ul.find_all('li', class_=lambda x: x and x.startswith('list-result-item')):
            record = _parse_result_item(li)
            if record is not None:
                records.append(record)

    # Going to the next page
    next_link_element = soup.find('a', {'class': 'nextLink'})
    if not (next_link_element and 'href' in next_link_element.attrs):
        break
    url = urljoin(base_url, next_link_element['href'])

    # Be polite to the server between page requests
    sleep(1)

df = pd.DataFrame(records, columns=PUBLICATION_COLUMNS)

# Save the DataFrame to a CSV file
df.to_csv('complete_data.csv', index=False)

# Display the first few rows of the DataFrame


In [None]:
# Preview the first ten rows of `df`.
# NOTE(review): the saved output for this cell has an 'Unnamed: 0' column that
# is not among the columns the scraping cell defines; it looks like the output
# came from a frame re-read from CSV - confirm the notebook runs top to bottom.
df.head(10)

Unnamed: 0,Title,Authors,ResearchAreas,PublicationLink,Date,JournalNumber,NumberOfPages,PersonLinks
0,Advanced Sensing Techniques for Intelligent Hu...,"Shah, S. A., Abbasi, Q. H., Ahmad, J., Imran, ...",Human Activity Recognition,https://pureportal.coventry.ac.uk/en/publicati...,Oct 2023,19.0,2 p.,{0: ['https://pureportal.coventry.ac.uk/en/per...
1,Age-Related Changes in Blood Volume Pulse Wave...,"Lin, W. H., Zheng, D., Li, G., Chen, F.","Photoelectric Plethysmography, Pulse Wave, Blo...",https://pureportal.coventry.ac.uk/en/publicati...,5 Jun 2023,,11 p.,{1: ['https://pureportal.coventry.ac.uk/en/per...
2,A mixed-methods evaluation of prospective acce...,"Bell, L., Whelan, M., Thomas, L., Wright, H.","Healthy Lifestyle, Evaluation Study, Mixed Met...",https://pureportal.coventry.ac.uk/en/publicati...,29 May 2023,,,{1: ['https://pureportal.coventry.ac.uk/en/per...
3,A phantom study of a protective trolley for ne...,"Wang, X., Xu, M., Chen, C., Bao, Z.","Radiation, Personal Protective Equipment, Orga...",https://pureportal.coventry.ac.uk/en/publicati...,Feb 2023,2.0,5 p.,{}
4,Applications of Magnetic Resonance Imaging in ...,"Wang, X., Liu, H., Sept 2023, 1 ed.","Stroke, Magnetic Resonance Imaging, Brain, Isc...",https://pureportal.coventry.ac.uk/en/publicati...,Sept 2023,,16 p.,{1: ['https://pureportal.coventry.ac.uk/en/per...
5,A Serious Game for Patients With Eating Disord...,"Guala, M., Bul, K., Skårderud, F., Nielsen, A. S.","Eating Disorders, Patient with Eating Disorder...",https://pureportal.coventry.ac.uk/en/publicati...,27 Jan 2023,,13 p.,{1: ['https://pureportal.coventry.ac.uk/en/per...
6,Association of the COVID-19 pandemic on stroke...,"Van Dusen, R. A., Abernethy, K., Chaudhary, N....","Apoplexy, COVID-19, Therapeutic Procedure, Sys...",https://pureportal.coventry.ac.uk/en/publicati...,17 Mar 2023,3.0,13 p.,{}
7,"Capability, Opportunity, and Motivation—Identi...","Kite, C., Atkinson, L., McGregor, G., Clark, C...","Physical Activity, Woman, Motivation, Polycyst...",https://pureportal.coventry.ac.uk/en/publicati...,28 Jan 2023,3.0,17 p.,{2: ['https://pureportal.coventry.ac.uk/en/per...
8,Cerebral Hemodynamics Underlying Artery-to-Art...,"Feng, X., Fang, H., Ip, B. Y. M., Chan, K. L.","Stroke, Pressure, Perfusion, Computed Tomograp...",https://pureportal.coventry.ac.uk/en/publicati...,10 Mar 2023,,,{}
9,Congenic hematopoietic stem cell transplantati...,"Sadozai, H., Rojas-Luengas, V., Farrokhi, K., ...","Allotransplantation, Autoantibody, Autologous ...",https://pureportal.coventry.ac.uk/en/publicati...,5 Jul 2023,1.0,17 p.,{}


# Inverted Index

In [None]:


# NLTK resources shared by every preprocess_text call, filled on first use
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


# Function to preprocess
def preprocess_text(text):
    """Normalise free text into a space-separated string of lemmatized tokens.

    Steps, in order: delete every character that is neither a word character
    nor whitespace, lowercase, split on whitespace, drop English stop words,
    lemmatize each remaining token.

    Parameters
    ----------
    text : str
        Raw text (a document field or a search query).

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' if none survive).
    """
    # Punctuation is deleted, not replaced by a space, so the two halves of a
    # hyphenated word are fused ("Age-Related" -> "agerelated").
    text = re.sub(r'[^\w\s]', '', text).lower()

    # The stop-word list and lemmatizer are identical for every call; reuse
    # them instead of reloading the corpus once per document.
    stop_words, lemmatizer = _text_resources()

    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

# Function to create an inverted index from specified DataFrame columns
def create_inverted_index(df, columns):
    """Build a word -> set-of-row-labels index over selected text columns.

    For each row, the values of `columns` are joined with spaces, passed
    through `preprocess_text`, and every resulting token is mapped to the
    row's index label.

    Parameters
    ----------
    df : pandas.DataFrame
        Source documents, one per row.
    columns : iterable of str
        Column names whose text is indexed.

    Returns
    -------
    collections.defaultdict(set)
        Maps each token to the set of `df` index labels containing it.
    """
    inverted_index = defaultdict(set)

    for index, row in df.iterrows():
        parts = []
        for col in columns:
            value = row[col]
            # Skip missing cells: str(None) / str(nan) would otherwise be
            # indexed as the searchable words "none" / "nan".
            if pd.api.types.is_scalar(value) and pd.isna(value):
                continue
            parts.append(str(value))

        processed_text = preprocess_text(' '.join(parts))
        for word in processed_text.split():
            inverted_index[word].add(index)

    return inverted_index

# Text columns that feed the inverted index
columns = ['Title', 'Authors', 'ResearchAreas']

# Build the index over the publications table
inverted_index = create_inverted_index(df, columns)

# Peek at the first ten terms and the sorted rows each one occurs in
for word in list(inverted_index)[:10]:
    print(f"{word}: {sorted(inverted_index[word])}")

advanced: [0, 337]
sensing: [0, 11, 12, 14, 26, 31, 35, 52, 117, 128, 144, 151, 200, 205, 212, 219, 249, 313, 318, 369, 377, 383, 398, 432, 463, 473, 480, 484, 495, 496, 550, 565]
technique: [0, 30, 244, 379, 411, 430]
intelligent: [0, 151, 195, 564]
human: [0, 10, 17, 31, 60, 69, 195, 205, 233, 241, 242, 249, 315, 318, 391, 452, 463, 564, 586, 593, 596, 653, 694, 725]
activity: [0, 7, 10, 17, 20, 37, 56, 64, 69, 109, 124, 126, 146, 148, 166, 177, 195, 203, 214, 217, 219, 226, 235, 239, 241, 247, 257, 258, 312, 317, 318, 320, 321, 322, 323, 332, 339, 345, 347, 362, 363, 391, 422, 430, 435, 443, 457, 458, 463, 484, 498, 500, 502, 510, 516, 538, 555, 564, 596, 601, 627, 654, 655, 656, 662, 670, 690]
recognition: [0, 10, 17, 69, 128, 147, 195, 219, 241, 318, 352, 391, 463, 564]
using: [0, 10, 18, 24, 26, 27, 31, 51, 63, 114, 117, 127, 146, 151, 176, 182, 205, 214, 219, 232, 245, 255, 317, 318, 321, 322, 325, 364, 366, 372, 379, 383, 399, 427, 451, 461, 466, 480, 484, 489, 581, 592, 606, 6

## Search function

#{'document': 'This is the first document.', 'relevance_score': 2}
#{'document': 'And this is the third document.', 'relevance_score': 2}
#{'document': 'The second document contains some more text.', 'relevance_score': 1}


In [None]:
def search_inverted_index(query, inverted_index, df):
    """Rank documents by how many query terms they contain.

    The query is normalised with `preprocess_text`; each resulting term that
    is present in `inverted_index` adds one point to every document in its
    posting set (a term repeated in the query counts each time).

    Parameters
    ----------
    query : str
        Free-text search string.
    inverted_index : mapping of str -> iterable of int
        Term postings. The postings are used as positions into `df`
        (via `iloc`).
        NOTE(review): `create_inverted_index` stores index labels, which only
        coincide with positions on a default 0..n-1 index - confirm.
    df : pandas.DataFrame
        The documents the postings refer to.

    Returns
    -------
    list of dict
        One dict per matching row (the row's columns plus 'relevance_score'),
        highest score first; equal scores keep first-seen order.
    """
    hit_counts = {}
    for term in preprocess_text(query).split():
        if term not in inverted_index:
            continue
        for position in inverted_index[term]:
            hit_counts[position] = hit_counts.get(position, 0) + 1

    # Negated key on a stable sort == descending score, ties in insertion order
    ranked = sorted(hit_counts.items(), key=lambda pair: -pair[1])

    return [
        {**df.iloc[position].to_dict(), 'relevance_score': hits}
        for position, hits in ranked
    ]


# Test the search engine

In [None]:
# Ask for a query. The prompt ends in "? " so the typed text is not glued to
# the prompt in the cell output.
sample_query = input('What is your search? ')
search_results = search_inverted_index(sample_query, inverted_index, df)

# Display the top search results (adjust the number as needed)
for result in search_results[:5]:
    print(result['Title'])
    print(result['Authors'])
    print(result['ResearchAreas'])
    print('')


what is your seachAge-Related Changes in Blood Volume Pulse Wave
Age-Related Changes in Blood Volume Pulse Wave at Fingers and Ears
Lin, W. H., Zheng, D., Li, G., Chen, F.
Photoelectric Plethysmography, Pulse Wave, Blood Volume, Arterial Stiffness, Biological Marker

Age-related changes in pulse risetime measured by multi-site photoplethysmography
Allen, J., O'Sullivan, J., Stansby, G., Murray, A.
Pulse Rate, Site, Pulse, Age, Female

Filtering-induced changes of pulse transmit time across different ages: a neglected concern in photoplethysmography-based cuffless blood pressure measurement
Liao, S., Liu, H., Lin, W-H., Zheng, D.
Blood Pressure Measurement, Time, Filtration, Pulse Rate, Photoelectric Plethysmography

Changes of oscillogram envelope maximum with blood pressure and aging: a quantitative observation
Pan, F., He, P., Qian, Y., Gao, H.
Blood Pressure, Cuff, Age Groups, Surface Pressure, Pressure Measurement

Peripheral arterial volume distensibility changes with applied exte