# Информационный поиск

**Выполнила:** Медведева Ольга

**Задания:** 1 - 2

## Задание №1
Набор данных хранится в папке `data`, собран из энциклопедии [Britannica](https://www.britannica.com/). Данные представляют собой html-страницы и разделены на 4 группы:
- Geography
- History
- Sport
- Technology

## Задание №2
Разработать систему поиска по запросу, сформулированному на естественном языке к набору данных из `Задания №1`. Метрика релевантности рассчитать на основе tf-idf и словаря.

In [1]:
# Импортирую библиотеки

import re
import os
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from bs4 import BeautifulSoup
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Создаю DataFrame, в котором будут содержаться путь к файлу, название файла, класс, текс файла и обработанный текст

df = pd.DataFrame(columns=['filepath', 'title', 'label', 'text', 'clear_text'])
df

Unnamed: 0,filepath,title,label,text,clear_text


In [3]:
os.listdir('data')

['Geography', 'History', 'Sport', 'Technology']

In [4]:
# Заполняю столбцы filepath, title, label

path = 'data'
df_index = 0

for label, folder in enumerate(os.listdir(path)):
    for file in os.listdir(os.path.join(path, folder)):
        df.loc[df_index, 'filepath'] = os.path.join(path, folder, file)
        df.loc[df_index, 'title'] = file
        df.loc[df_index, 'label'] = label
        df_index += 1

In [5]:
df

Unnamed: 0,filepath,title,label,text,clear_text
0,"data\Geography\Amazon River Facts, History, Lo...","Amazon River Facts, History, Location, Length,...",0,,
1,"data\Geography\Antarctica History, Map, Climat...","Antarctica History, Map, Climate, & Facts Brit...",0,,
2,"data\Geography\Arctic Ocean Definition, Locati...","Arctic Ocean Definition, Location, Map, Climat...",0,,
3,"data\Geography\atmosphere Definition, Layers, ...","atmosphere Definition, Layers, & Facts Britann...",0,,
4,"data\Geography\Brazil History, Map, Culture, P...","Brazil History, Map, Culture, Population, & Fa...",0,,
...,...,...,...,...,...
98,"data\Technology\semiconductor Definition, Exam...","semiconductor Definition, Examples, Types, Use...",3,,
99,data\Technology\smartphone Britannica.htm,smartphone Britannica.htm,3,,
100,"data\Technology\supercomputer Definition, Char...","supercomputer Definition, Characteristics, Exa...",3,,
101,data\Technology\tablet computer Definition & F...,tablet computer Definition & Facts Britannica.htm,3,,


In [6]:
# Функция для получения текста из файла

def get_text(filepath):
    with open(filepath, 'r', encoding='utf-8') as fp:
        text = BeautifulSoup(fp, 'html.parser').text
    return text

In [7]:
# Применяю функцию get_text к столбцу text

df.text = df.filepath.apply(get_text)
df

Unnamed: 0,filepath,title,label,text,clear_text
0,"data\Geography\Amazon River Facts, History, Lo...","Amazon River Facts, History, Location, Length,...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
1,"data\Geography\Antarctica History, Map, Climat...","Antarctica History, Map, Climate, & Facts Brit...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
2,"data\Geography\Arctic Ocean Definition, Locati...","Arctic Ocean Definition, Location, Map, Climat...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
3,"data\Geography\atmosphere Definition, Layers, ...","atmosphere Definition, Layers, & Facts Britann...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
4,"data\Geography\Brazil History, Map, Culture, P...","Brazil History, Map, Culture, Population, & Fa...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
...,...,...,...,...,...
98,"data\Technology\semiconductor Definition, Exam...","semiconductor Definition, Examples, Types, Use...",3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
99,data\Technology\smartphone Britannica.htm,smartphone Britannica.htm,3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
100,"data\Technology\supercomputer Definition, Char...","supercomputer Definition, Characteristics, Exa...",3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,
101,data\Technology\tablet computer Definition & F...,tablet computer Definition & Facts Britannica.htm,3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,


In [8]:
# Функция для предобработки текста

def clear_text(text):
    
    # Удаление мусора при помощи регулярных выражений
    text = re.sub('[^a-zA-Z ]+', ' ', text)
    text = re.sub('[ ]+', ' ', text)
    
    # Приведение к строчному регистру
    text = text.lower()
    
    # Токенизация слов
    tokens = word_tokenize(text)
    
    # Удаление стоп-слов
    stoplist = set(stopwords.words('english'))
    words = [token for token in tokens if token not in stoplist and len(token) > 1]
    
    # Лемматизация слов
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in words if len(lemmatizer.lemmatize(word)) > 1]
    return ' '.join(lemmas)

In [9]:
# Применение функции clear_text для записи предобработанного текста в колнку clear_text

df.clear_text = df.text.apply(clear_text)
df

Unnamed: 0,filepath,title,label,text,clear_text
0,"data\Geography\Amazon River Facts, History, Lo...","Amazon River Facts, History, Location, Length,...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,amazon river fact history location length anim...
1,"data\Geography\Antarctica History, Map, Climat...","Antarctica History, Map, Climate, & Facts Brit...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,antarctica history map climate fact britannica...
2,"data\Geography\Arctic Ocean Definition, Locati...","Arctic Ocean Definition, Location, Map, Climat...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,arctic ocean definition location map climate f...
3,"data\Geography\atmosphere Definition, Layers, ...","atmosphere Definition, Layers, & Facts Britann...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,atmosphere definition layer fact britannica br...
4,"data\Geography\Brazil History, Map, Culture, P...","Brazil History, Map, Culture, Population, & Fa...",0,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,brazil history map culture population fact bri...
...,...,...,...,...,...
98,"data\Technology\semiconductor Definition, Exam...","semiconductor Definition, Examples, Types, Use...",3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,semiconductor definition example type us mater...
99,data\Technology\smartphone Britannica.htm,smartphone Britannica.htm,3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,smartphone britannica browse search dictionary...
100,"data\Technology\supercomputer Definition, Char...","supercomputer Definition, Characteristics, Exa...",3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,supercomputer definition characteristic exampl...
101,data\Technology\tablet computer Definition & F...,tablet computer Definition & Facts Britannica.htm,3,\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,tablet computer definition fact britannica bro...


In [10]:
# Функция для извлечения и оценки релевантности текстовых объектов с помощью TfidfVectorizer

def feature_extraction(text):
    vectorizer = TfidfVectorizer()
    vectorizer.fit(text)
    
    data = vectorizer.transform(text)
    TfIdf = pd.DataFrame(data.toarray(), columns=vectorizer.get_feature_names())
    return TfIdf

In [11]:
# Применение функции feature_extraction

features = feature_extraction(df['clear_text'])
features.head(10)

Unnamed: 0,aaa,aac,aaron,aau,abandoned,abandonment,abbot,abbreviated,abbreviation,abc,...,zone,zonehydrologysurface,zonesseamountsocean,zonesubantarctic,zonesubtropical,zonethe,zonetrade,zoologist,zuse,zwass
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.053856,0.0,0.0,0.0,...,0.033496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Функция для извлечения ссылок из html-страниц, третья ссылка на текущую страницу в Интернете

def get_html(filepath):
    with open (filepath, 'r', encoding='utf-8') as fp:
        html = fp.read()
        pattern = re.compile(r'href="(https://.*?|http://.*?)"')
        urls = re.findall(pattern, html)
    return urls[2]

In [13]:
# Функция для поиска совпадений слов из запроса и из features

def search_request(request):
    clear_request = clear_text(request).split()
    diction = features.columns
    inter = list(set(clear_request).intersection(set(diction)))
    idx_list = features[inter].sum(axis=1).sort_values(ascending=False)
    for idx, df_idx in enumerate(idx_list.index[:10]):
        print('{}. {}'.format(idx+1, get_html(df.filepath[df_idx])))
    return idx_list

In [14]:
# Применение функции search_request к запросу

request = search_request('Volleyball is climate zone?')

1. https://www.britannica.com/sports/volleyball
2. https://www.britannica.com/science/climate-meteorology
3. https://www.britannica.com/science/weather
4. https://www.britannica.com/place/Russia
5. https://www.britannica.com/topic/Southeast-Indian
6. https://www.britannica.com/science/sea-level
7. https://www.britannica.com/place/Indian-Ocean
8. https://www.britannica.com/place/Arctic-Ocean
9. https://www.britannica.com/place/Antarctica
10. https://www.britannica.com/technology/supercomputer
