In [1]:
from bs4 import BeautifulSoup
import requests
from bs4 import BeautifulSoup
import urllib.request
import re
import os
import numpy as np
import pickle
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.text import ConcordanceIndex
import string

In [38]:
# Define stop words
stop_words = set(stopwords.words('english'))

In [39]:
starter_url = "https://en.wikipedia.org/wiki/Taylor_Swift"

r = requests.get(starter_url)
data = r.text
soup = BeautifulSoup(data, 'html.parser')  # Specify the parser explicitly

counter = 0
# write URLs to a file
with open('urls.txt', 'w') as f:
    for link in soup.find_all('a', href=True):  # Find all 'a' tags with 'href' attribute
        url = link.get('href')
        if url.startswith('/wiki/') and ':' not in url:  # Filter out non-article links
            full_url = "https://en.wikipedia.org" + url
            f.write(full_url + '\n')  # Write full URL to file
            counter += 1
            if counter >= 25:
                break

# End of program
print("End of crawler")


End of crawler


In [40]:
def scrape_text_from_url(url):
    try:
        html = urllib.request.urlopen(url)
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.findAll(text=True)
        result = filter(visible, data)
        temp_list = list(result)
        temp_str = ' '.join(temp_list)
        return temp_str
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None

In [41]:
# Function to scrape text from each page and write to files
def scrape_text_and_write_to_files(urls_file, output_dir):
    with open(urls_file, 'r') as f:
        urls = f.readlines()

    for i, url in enumerate(urls):
        url = url.strip()  # Remove leading/trailing whitespaces and newlines
        text = scrape_text_from_url(url)
        if text:
            with open(f"{output_dir}/page_{i}.txt", 'w', encoding='utf-8') as f:
                f.write(text)

In [43]:
# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Join the tokens back into a string
    cleaned_text = ' '.join(filtered_tokens)
    return cleaned_text

In [44]:
# Function to clean up text files
def clean_text_files(input_dir, output_dir):
    os.makedirs(output_dir, exist_ok=True)  # Create output directory if it doesn't exist
    for file_name in os.listdir(input_dir):
        if file_name.endswith(".txt"):
            with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
                text = f.read()
                cleaned_text = preprocess_text(text)
                with open(os.path.join(output_dir, file_name), 'w', encoding='utf-8') as outfile:
                    outfile.write(cleaned_text)

In [45]:
# Load scraped text data from files and concatenate
def load_scraped_text(input_dir):
    scraped_text = ""
    for file_name in os.listdir(input_dir):
        if file_name.endswith(".txt"):
            with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
                scraped_text += f.read()
    return scraped_text

In [46]:
# Function to determine if an element is visible
def visible(element):
    if element.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(element.encode('utf-8'))):
        return False
    return True

In [47]:
# Function to extract important terms using TF-IDF
def extract_important_terms(input_dir, num_terms=40, stop_words=None):
    # Load cleaned text files
    documents = []
    for file_name in os.listdir(input_dir):
        if file_name.endswith(".txt"):
            with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
                documents.append(f.read())

    # Preprocess the documents
    preprocessed_documents = [preprocess_text(doc) for doc in documents]

    # Calculate TF-IDF
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(preprocessed_documents)

    # Get feature names (terms)
    feature_names = tfidf.get_feature_names_out()

    # Get top important terms
    important_terms_indices = tfidf_matrix.sum(axis=0).argsort()[0, -num_terms:][::-1]
    important_terms = [feature_names[i] for i in important_terms_indices]

    return important_terms

In [48]:
# Main code
urls_file = "urls.txt"
output_dir = "scraped_text"
cleaned_output_dir = "cleaned_text"
os.makedirs(output_dir, exist_ok=True)  # Create output directory for scraped text if it doesn't exist

# Scrape text from the URLs listed in 'urls.txt'
scrape_text_and_write_to_files(urls_file, output_dir)

# Clean up the text files
clean_text_files(output_dir, cleaned_output_dir)

cleaned_text_dir = "cleaned_text"

In [49]:
# Extract important terms using TF-IDF
important_terms_tfidf = extract_important_terms(cleaned_text_dir, 40, stop_words)
print("Important terms extracted using TF-IDF:", important_terms_tfidf)

Important terms extracted using TF-IDF: [array([['american', 'video', 'time', '2024', 'folk', 'awards', 'tour',
        'january', 'edit', 'nashville', 'new', '2017', 'july', '2015',
        'march', 'album', 'pop', 'may', 'april', 'february', 'august',
        'records', 'billboard', 'rock', 'swifts', 'december', '2019',
        '2022', 'country', '2020', 'october', 'november', '2021', '2023',
        'archived', 'original', 'music', 'retrieved', 'taylor', 'swift']],
      dtype=object)]


In [50]:
# Flatten the array of terms extracted using TF-IDF
flattened_tfidf_terms = [term for sublist in important_terms_tfidf for term in sublist[0]]

In [51]:
# Manually determine additional important terms based on domain knowledge
additional_terms = ["Reputation", "Lover", "Red (album)", "Grammy Awards", "Country Music", "Pop Music", "Eras Tour", "Billboard charts", "1989 (album)", "Taylor Swift Foundation", "Travis Kelce", "Personal Information", "Height"]

In [52]:
# Combine important terms
important_terms = flattened_tfidf_terms + additional_terms

In [53]:
# Convert the combined terms into a list of strings
important_terms_strings = [str(term) for term in important_terms]

In [54]:
print("Important terms:", important_terms_strings)

Important terms: ['american', 'video', 'time', '2024', 'folk', 'awards', 'tour', 'january', 'edit', 'nashville', 'new', '2017', 'july', '2015', 'march', 'album', 'pop', 'may', 'april', 'february', 'august', 'records', 'billboard', 'rock', 'swifts', 'december', '2019', '2022', 'country', '2020', 'october', 'november', '2021', '2023', 'archived', 'original', 'music', 'retrieved', 'taylor', 'swift', 'Reputation', 'Lover', 'Red (album)', 'Grammy Awards', 'Country Music', 'Pop Music', 'Eras Tour', 'Billboard charts', '1989 (album)', 'Taylor Swift Foundation', 'Travis Kelce', 'Personal Information', 'Height']


In [72]:
# Function to load text from files and tokenize into sentences
def tokenize_sentences_from_files(input_dir):
    sentences = []
    for file_name in os.listdir(input_dir):
        if file_name.endswith(".txt"):
            with open(os.path.join(input_dir, file_name), 'r', encoding='utf-8') as f:
                text = f.read()
                # Preprocess the text to remove non-alphanumeric characters
                text = re.sub(r'\n{2,}', '\n', text)  # Collapse consecutive newlines into a single newline
                #text = re.sub(r'[^\w\s,.]', '', text)
                #text = ''.join(char for char in text if char.isalnum() or char in string.whitespace)
                sentences.extend(sent_tokenize(text))
    return sentences

In [73]:
# Function to build knowledge base
def build_knowledge_base(sentences, important_terms):
    knowledge_base = {term: [] for term in important_terms}
    for sentence in sentences:
        for term in important_terms:
            if term.lower() in sentence.lower():  # Case insensitive search
                knowledge_base[term].append(sentence)
    return knowledge_base

In [74]:
# Tokenize sentences from text files
initial_sentences = tokenize_sentences_from_files(output_dir)

# Build knowledge base
knowledge_base = build_knowledge_base(initial_sentences, important_terms_strings)

In [75]:
# Save the knowledge base to a file using pickle
with open('knowledge_base.pkl', 'wb') as f:
    pickle.dump(knowledge_base, f)

In [76]:
# Load the knowledge base from the pickle file
with open('knowledge_base.pkl', 'rb') as f:
    knowledge_base = pickle.load(f)

# Print a limited number of sentences for each term

for term, sentences_info in knowledge_base.items():
    print("Term:", term)
    for i, sentence_info in enumerate(sentences_info[:3]):  # Print only the first three sentences for each term
        print(f"Sentence {i+1}:", sentence_info)
    print()  # Add an empty line between terms


Term: american
Sentence 1: Swift outside the  Late Show with David Letterman  studio in 2012 
 The American singer-songwriter  Taylor Swift  is a subject of extensive  mass media  interest and press coverage, eliciting a range of  public opinions  and perceptions of her life and career.
Sentence 2: American  public relations  executive  Tree Paine  has been Swift's  publicist  since 2014.
Sentence 3: She began dating American football player  Travis Kelce  in 2023, which has had a significant cultural impact, including a contribution of $331.5 million in  brand value  for the  National Football League  (NFL).

Term: video
Sentence 1: They have been featured or referenced in her  videos  and other works.
Sentence 2: Benton characterized Swift as an unintentional but reliable "attention-grabber" whose internet media coverage has a " memetic " quality, inviting "fans and haters alike to debate and dissect her", elevating the engagements with an article or a video, which in turn encourages

In [77]:
# Load the knowledge base from the pickle file
with open('knowledge_base.pkl', 'rb') as f:
    knowledge_base = pickle.load(f)

# Print all keys
print("Keys in the knowledge base:")
for key in knowledge_base.keys():
    print(key)


Keys in the knowledge base:
american
video
time
2024
folk
awards
tour
january
edit
nashville
new
2017
july
2015
march
album
pop
may
april
february
august
records
billboard
rock
swifts
december
2019
2022
country
2020
october
november
2021
2023
archived
original
music
retrieved
taylor
swift
Reputation
Lover
Red (album)
Grammy Awards
Country Music
Pop Music
Eras Tour
Billboard charts
1989 (album)
Taylor Swift Foundation
Travis Kelce
Personal Information
Height
