In [51]:
pip install requests beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [103]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer


In [105]:
import requests
def load_documents(file_url, timeout=30):
    """Download a text file over HTTP and return its decoded contents.

    Parameters
    ----------
    file_url : str
        URL of the text file to fetch.
    timeout : float or tuple, optional
        Passed to ``requests.get`` so that a stalled connection cannot block
        the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    str
        The response body as decoded by ``requests`` (``response.text``).

    Raises
    ------
    Exception
        If the server answers with any status code other than 200.
    requests.exceptions.RequestException
        If the request itself fails (connection error, timeout, ...).
    """
    # Fetch the text content from the URL
    response = requests.get(file_url, timeout=timeout)
    # Fail fast on anything that is not a plain success.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch the file. Status code: {response.status_code}")
    return response.text

In [107]:
# Raw GitHub URL of the corpus text file used throughout this notebook.
file_url = "https://raw.githubusercontent.com/martinabgn/HelsinkiLib/week2/enwiki-20181001-corpus.1000-articles.txt"
# Download the whole file into memory as a single string.
text = load_documents(file_url)
# Print only the first 200 characters as a quick check of what was downloaded.
print(text[:200])

<article name="Anarchism">
Anarchism is a political philosophy that advocates self-governed societies based on voluntary institutions. These are often described as stateless societies, although severa


In [109]:
from bs4 import BeautifulSoup
def split_article(text):
    """Split the corpus markup into a mapping of article name to article text.

    Parameters
    ----------
    text : str
        Markup in which every article is wrapped in an
        ``<article name="...">`` element.

    Returns
    -------
    dict
        ``{name: content}`` where ``content`` is the element's text with
        leading and trailing whitespace removed. Articles without a ``name``
        attribute (or with an empty one) are skipped. If two articles share a
        name, the later one overwrites the earlier one.
    """
    # Split the text based on the <article> tag using BeautifulSoup
    soup = BeautifulSoup(text, 'html.parser')
    article_dict = {}
    for article in soup.find_all('article'):
        name = article.get("name")
        if not name:
            # Nothing to key the article on, so it cannot be stored.
            continue
        article_dict[name] = article.text.strip()  # Strip excess whitespace
    return article_dict
            
    
    

In [111]:
# Parse the downloaded corpus into a {article name: article text} mapping.
article_dict = split_article(text)
# Example: print each article name followed by the first 100 characters of its text.
for name, content in article_dict.items():
    print(f"Article Name: {name}")
    print(f"Content Preview: {content[:100]}...\n")

Article Name: Anarchism
Content Preview: Anarchism is a political philosophy that advocates self-governed societies based on voluntary instit...

Article Name: Autism
Content Preview: Autism is a developmental disorder characterized by troubles with social interaction and communicati...

Article Name: Albedo
Content Preview: Albedo ( ) ( , meaning "whiteness") is the measure of the diffuse reflection of solar radiation out ...

Article Name: A
Content Preview: A (named , plural As, A's, as, a's or aes ) is the first letter and the first vowel of the ISO basic...

Article Name: Alabama
Content Preview: Alabama is a state in the southeastern region of the United States. It is bordered by Tennessee to t...

Article Name: Achilles
Content Preview: In Greek mythology, Achilles or Achilleus ( ; , Achilleus ) was a Greek hero of the Trojan War and t...

Article Name: Abraham Lincoln
Content Preview: Abraham Lincoln (February 12, 1809 – April 15, 1865) was an American statesman and lawyer who 

In [189]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
# Keep article titles and article bodies in two parallel lists: position i in
# `names` and position i in `documents` belong to the same article.
names = list(article_dict.keys())
documents = list(article_dict.values())

# Count (lower-cased) word occurrences per article.
cv = CountVectorizer(lowercase=True)
doc_term_counts = cv.fit_transform(documents)
# Transpose so that rows are terms and columns are articles, then densify.
dense_matrix = doc_term_counts.T.todense()

print("Term-document matrix:\n")
print(dense_matrix)

Term-document matrix:

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [161]:
# Vocabulary mapping of the fitted vectorizer; the later cells use it to find
# the row of dense_matrix that belongs to the query term.
t2i = cv.vocabulary_  # shorter notation: t2i = term-to-index

In [175]:
# Read the search term. It is lower-cased because the vectorizer was built with
# lowercase=True, and stripped so that accidental leading or trailing spaces do
# not make the vocabulary lookup fail.
query = input("enter a query").strip().lower()

enter a query Lincoln


In [177]:
# Show the raw count row for the query term (one entry per article).
# A term that never occurs in the corpus has no vocabulary entry, so report
# that instead of failing with a KeyError.
if query in t2i:
    print(dense_matrix[t2i[query]])
else:
    print(f"'{query}' was not found in the corpus vocabulary.")

[[  0   0   0   0   0   0 172   0   0   1   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    2   1   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0  68
    0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   4   0   0
    1   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0 

In [191]:
# Per-article occurrence counts for the query term as a flat 1-D array.
# A term missing from the vocabulary yields all zeros (no hits) rather than a
# KeyError, so the reporting loop simply prints nothing for it.
term_index = t2i.get(query)
if term_index is None:
    hits_list = np.zeros(dense_matrix.shape[1], dtype=dense_matrix.dtype)
else:
    # Indexing the dense matrix gives a 1 x n_articles row; [0] flattens it.
    hits_list = np.array(dense_matrix[term_index])[0]

In [203]:
# Report every article that contains the query term at least once, together
# with the hit count and a short preview of the article text.
# \033[1m = bold, \033[94m = bright blue, \033[0m = reset (ANSI escapes).
for i, nhits in enumerate(hits_list):
    if nhits < 1:
        continue  # the term does not occur in this article
    title = names[i]
    preview = documents[i][:100]
    print(f"\033[1m\033[94m{query.upper()}\033[0m occurs in article with name: \033[1m{title}\033[0m for {nhits} times")
    print(f"\033[1mFirst 100 characters of the article: {preview}\033[0m")


[1m[94mLINCOLN[0m occurs in article with name: [1mAbraham Lincoln[0m for 172 times
First 100 characters of the article: [1mAbraham Lincoln (February 12, 1809 – April 15, 1865) was an American statesman and lawyer who served[0m
[1m[94mLINCOLN[0m occurs in article with name: [1mAcademy Award for Best Production Design[0m for 1 times
First 100 characters of the article: [1mThe Academy Award for Best Production Design recognizes achievement for art direction in film. The c[0m
[1m[94mLINCOLN[0m occurs in article with name: [1mAlaska[0m for 1 times
First 100 characters of the article: [1mAlaska ( ) ( ; ; ) is a U.S. state located in the northwest extremity of North America. The Canadian[0m
[1m[94mLINCOLN[0m occurs in article with name: [1mAlbert Einstein[0m for 2 times
First 100 characters of the article: [1mAlbert Einstein ( ; ; 14 March 1879 – 18 April 1955) was a German-born theoretical physicist a pivot[0m
[1m[94mLINCOLN[0m occurs in article with name: [1m

In [None]:
# NOTE(review): this cell held only the tail of an unfinished print(...) call:
#     nhits, "time(s) in document:", documents[i])
# The unmatched ")" is a SyntaxError and stops "Restart & Run All". The loop
# that reports the hits for the query already produces this information, so
# the fragment is disabled here. Delete this cell once confirmed unnecessary.

In [143]:
# Display the per-article hit counts for the query term (one entry per article).
hits_list

array([  0,   0,   0,   0,   0,   0, 172,   0,   0,   1,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   1,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   2,
         1,   0,   0,   0,   0,   1,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        68,   0,   0,   1,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   