In [33]:
import pandas as pd
import hashlib
from datetime import datetime

In [34]:
# Function to generate a hash from text
def generate_hash(text):
    return hashlib.sha256(text.encode()).hexdigest()


t = "Hello World"
a = generate_hash(t)
print(t)
print(a)
print(datetime.now())

Hello World
a591a6d40bf420404a011733cfb7b190d62c65bf0bcda32b57b277d9ad9f146e
2024-12-08 20:08:10.586861


In [35]:
DF = pd.DataFrame({
    'text': [t],
    'hash': [a],
    'date': [datetime.now()]
})

In [36]:
mask = DF['hash'] == a
df = DF[mask]

df_not = DF[~mask]

In [37]:
DF.to_pickle('/app/data.pkl')

In [1]:
import os
import re
import time
import requests
from urllib.parse import urljoin, urlparse
from collections import deque
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import hashlib
from datetime import datetime



class Scraper:
    def __init__(self, URLs, output_dir='/app/BFS', max_depth=4,frame=None):
        self.URLs = URLs
        self.max_depth = max_depth
        self.visited_urls = set()
        self.frame = frame
        self.output_dir = output_dir
        self.check_dataframe()

    # Function to generate a hash from text
    def generate_hash(self,text):
        return hashlib.sha256(text.encode()).hexdigest()

    def check_dataframe(self):
        if self.frame is None:
            self.frame = pd.DataFrame(columns=['url', 'text_hash','date_accessed'])
        else:
            self.frame = pd.read_pickle(self.frame)

    def save_dataframe(self):
        self.frame.to_pickle(os.path.join(self.output_dir,'dataframe.pkl'))

    def url_to_filename(self,url):
        filename = re.sub(r'^(http|https)://', '', url)
        filename = filename.replace('/', '_')
        filename = re.sub(r'[^a-zA-Z0-9\-_]', '_', filename)
        max_length = 255
        return filename[:max_length]


    def reset_visited_urls(self):
        self.visited_urls = set()

    def scrape_text_bfs(self,start_url, base_url, max_depth=4):
        self.reset_visited_urls()
        
        queue = deque([(start_url, 0)])  # Queue stores tuples of (url, current_depth)

        while queue and len(self.visited_urls) < 5:
            url, depth = queue.popleft()
            print(f"Scraping {url} at depth {depth}")
            
            # Check if the URL has already been visited or if it exceeds max depth
            if url in self.visited_urls or depth > max_depth:
                continue
            
            mask = self.frame['url'] == url
            entry = self.frame[mask]
            
            try:
                date_accessed = entry['date_accessed'].values[0]
            except:
                date_accessed = None

            # Mark the URL as visited
            self.visited_urls.add(url)

            try:
                response = requests.get(url)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Failed to retrieve {url}: {e}")
                continue

            soup = BeautifulSoup(response.text, 'html.parser')
            if (mask.sum() == 1 and pd.Timestamp.now()-date_accessed >= pd.Timedelta(weeks=1)) or mask.sum() == 0:
                # Extract and save text
                text = soup.get_text(separator=' ', strip=True)
                text = f"URL:{url}\n{text}"

                try:
                    text_hash = entry['text_hash'].values[0]
                except:
                    text_hash = None

                file_path = os.path.join(self.output_dir, self.url_to_filename(url) + '.txt')

                #See if the stored hash is the same as the hash for current text
                if text_hash == self.generate_hash(text):
                    pass
                elif mask.sum() == 0:
                    
                    with open(file_path, 'w') as f:
                        f.write(text)

                    # Add the URL to the dataframe
                    self.frame = pd.concat([self.frame , pd.DataFrame({'url': [url], 'text_hash': [self.generate_hash(text)],'date_accessed':[datetime.now()],'file_path': [file_path]})], ignore_index=True)
                
                elif mask.sum() == 1:
                    #Delete the corresponding file
                    os.remove(entry['file_path'].values[0])
                    
                    #Delete the entry in the dataframe
                    self.frame = self.frame[~mask]

                    
                    with open(file_path, 'w') as f:
                        f.write(text)

                    # Add the URL to the dataframe
                    self.frame = pd.concat([self.frame , pd.DataFrame({'url': [url], 'text_hash': [self.generate_hash(text)],'date_accessed':[datetime.now()],'file_path': [file_path]})], ignore_index=True)
            
            
            # If the current depth is less than max_depth, find and add links to the queue
            if depth < max_depth:
                for link in soup.find_all('a', href=True):
                    href = link['href']
                    next_url = urljoin(base_url, href)
                    if urlparse(next_url).netloc == urlparse(base_url).netloc and next_url not in self.visited_urls:
                        queue.append((next_url, depth + 1))
                        time.sleep(0.5)  # Sleep for 500ms to avoid hammering the server
        self.save_dataframe()

    def breadth_scrape(self):
        for start_url in tqdm(self.URLs):
            self.scrape_text_bfs(start_url, start_url, max_depth=self.max_depth)

In [2]:
scrape = Scraper(['https://health.uoregon.edu/'],'/app/obj_test/',frame='/app/obj_test/dataframe.pkl')

In [3]:
scrape.breadth_scrape()

  0%|          | 0/1 [00:00<?, ?it/s]

Scraping https://health.uoregon.edu/ at depth 0
Scraping https://health.uoregon.edu/#main-content at depth 1
Scraping https://health.uoregon.edu/search at depth 1
Scraping https://health.uoregon.edu/search at depth 1
Scraping https://health.uoregon.edu/medical-care at depth 1
Scraping https://health.uoregon.edu/primary-care at depth 1


100%|██████████| 1/1 [05:12<00:00, 312.64s/it]


In [4]:
new_frame = pd.read_pickle('/app/obj_test/dataframe.pkl')

In [6]:
new_frame

Unnamed: 0,url,text_hash,date_accessed,file_path
0,https://health.uoregon.edu/#main-content,2e5ad0ed1337bbbe488e2a238243c141d11cd03b787f35...,2024-11-08 19:27:49.164375,/app/obj_test/health_uoregon_edu__main-content...
1,https://health.uoregon.edu/search,bb589f5c128a6362d5f43f6b4501b57398e8072934d15c...,2024-12-08 19:29:02.032485,/app/obj_test/health_uoregon_edu_search.txt
2,https://health.uoregon.edu/medical-care,af53886487c41e2cfc1c751bee63cf030ca0c2c776b91a...,2024-12-08 19:42:45.458370,/app/obj_test/health_uoregon_edu_medical-care.txt
3,https://health.uoregon.edu/primary-care,d43497317cd5608d4969eb47b4ac9fb621dca19763b475...,2024-12-08 19:43:41.080676,/app/obj_test/health_uoregon_edu_primary-care.txt
4,https://health.uoregon.edu/,2d1b790621defe1821be3164d970ca9953812b532f23fb...,2024-12-08 20:19:02.791407,/app/obj_test/health_uoregon_edu_.txt


In [None]:
new_frame

Unnamed: 0,url,text_hash,date_accessed,file_path
0,https://health.uoregon.edu/,2d1b790621defe1821be3164d970ca9953812b532f23fb...,2024-12-08 19:26:35.925884,/app/obj_test/health_uoregon_edu_.txt
1,https://health.uoregon.edu/#main-content,2e5ad0ed1337bbbe488e2a238243c141d11cd03b787f35...,2024-12-08 19:27:49.164375,/app/obj_test/health_uoregon_edu__main-content...
2,https://health.uoregon.edu/search,bb589f5c128a6362d5f43f6b4501b57398e8072934d15c...,2024-12-08 19:29:02.032485,/app/obj_test/health_uoregon_edu_search.txt


In [None]:
new_frame

Unnamed: 0,url,text_hash,date_accessed,file_path
0,https://health.uoregon.edu/,2d1b790621defe1821be3164d970ca9953812b532f23fb...,2024-12-08 19:26:35.925884,/app/obj_test/health_uoregon_edu_.txt
1,https://health.uoregon.edu/#main-content,2e5ad0ed1337bbbe488e2a238243c141d11cd03b787f35...,2024-12-08 19:27:49.164375,/app/obj_test/health_uoregon_edu__main-content...
2,https://health.uoregon.edu/search,bb589f5c128a6362d5f43f6b4501b57398e8072934d15c...,2024-12-08 19:29:02.032485,/app/obj_test/health_uoregon_edu_search.txt


In [None]:
new_frame

Unnamed: 0,url,text_hash,date_accessed,file_path
0,https://health.uoregon.edu/,2d1b790621defe1821be3164d970ca9953812b532f23fb...,2024-12-08 19:26:35.925884,/app/obj_test/health_uoregon_edu_.txt
1,https://health.uoregon.edu/#main-content,2e5ad0ed1337bbbe488e2a238243c141d11cd03b787f35...,2024-12-08 19:27:49.164375,/app/obj_test/health_uoregon_edu__main-content...
2,https://health.uoregon.edu/search,bb589f5c128a6362d5f43f6b4501b57398e8072934d15c...,2024-12-08 19:29:02.032485,/app/obj_test/health_uoregon_edu_search.txt
3,https://health.uoregon.edu/medical-care,af53886487c41e2cfc1c751bee63cf030ca0c2c776b91a...,2024-12-08 19:42:45.458370,/app/obj_test/health_uoregon_edu_medical-care.txt
4,https://health.uoregon.edu/primary-care,d43497317cd5608d4969eb47b4ac9fb621dca19763b475...,2024-12-08 19:43:41.080676,/app/obj_test/health_uoregon_edu_primary-care.txt


In [5]:
new_frame['date_accessed'][0] = pd.Timestamp('2024-11-08 19:27:49.164375')

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  new_frame['date_accessed'][0] = pd.Timestamp('2024-11-08 19:27:49.164375')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-vers

In [6]:
new_frame

Unnamed: 0,url,text_hash,date_accessed,file_path
0,https://health.uoregon.edu/,2d1b790621defe1821be316d970ca9953812b532f23fb7...,2024-11-08 19:27:49.164375,/app/obj_test/health_uoregon_edu_.txt
1,https://health.uoregon.edu/#main-content,2e5ad0ed1337bbbe488e2a238243c141d11cd03b787f35...,2024-11-08 19:27:49.164375,/app/obj_test/health_uoregon_edu__main-content...
2,https://health.uoregon.edu/search,bb589f5c128a6362d5f43f6b4501b57398e8072934d15c...,2024-12-08 19:29:02.032485,/app/obj_test/health_uoregon_edu_search.txt
3,https://health.uoregon.edu/medical-care,af53886487c41e2cfc1c751bee63cf030ca0c2c776b91a...,2024-12-08 19:42:45.458370,/app/obj_test/health_uoregon_edu_medical-care.txt
4,https://health.uoregon.edu/primary-care,d43497317cd5608d4969eb47b4ac9fb621dca19763b475...,2024-12-08 19:43:41.080676,/app/obj_test/health_uoregon_edu_primary-care.txt


In [7]:
new_frame.to_pickle('/app/obj_test/dataframe.pkl')

In [None]:
new_frame

Unnamed: 0,url,text_hash,date_accessed,file_path
0,https://health.uoregon.edu/,2d1b790621defe1821be316d970ca9953812b532f23fb7...,2024-12-08 19:26:35.925884,/app/obj_test/health_uoregon_edu_.txt
1,https://health.uoregon.edu/#main-content,2e5ad0ed1337bbbe488e2a238243c141d11cd03b787f35...,2024-11-08 19:27:49.164375,/app/obj_test/health_uoregon_edu__main-content...
2,https://health.uoregon.edu/search,bb589f5c128a6362d5f43f6b4501b57398e8072934d15c...,2024-12-08 19:29:02.032485,/app/obj_test/health_uoregon_edu_search.txt
3,https://health.uoregon.edu/medical-care,af53886487c41e2cfc1c751bee63cf030ca0c2c776b91a...,2024-12-08 19:42:45.458370,/app/obj_test/health_uoregon_edu_medical-care.txt
4,https://health.uoregon.edu/primary-care,d43497317cd5608d4969eb47b4ac9fb621dca19763b475...,2024-12-08 19:43:41.080676,/app/obj_test/health_uoregon_edu_primary-care.txt
