In [4]:
import asyncio
import logging
import re

import pandas as pd
from crawl4ai import AsyncWebCrawler

In [None]:
# Send INFO-and-above records to scraper_log.log, each prefixed with a
# timestamp and level. force=True replaces any handlers already attached to
# the root logger, so re-running this cell reconfigures logging.
logging.basicConfig(
    force=True,
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='scraper_log.log',
)


async def main(input_path='blm_output_vgg_gpt.csv', output_path='updated_dataset.csv'):
    """Crawl every URL listed in a CSV and save the scraped markdown with it.

    Reads ``input_path`` (which must contain a ``url`` column), fetches each
    URL with ``AsyncWebCrawler``, stores the page markdown in a new
    ``content`` column, and writes the resulting table to ``output_path``.
    When a fetch fails, the ``content`` cell holds an ``"Error: ..."``
    placeholder instead. Per-URL outcomes and a final summary are written
    through ``logging``.

    Args:
        input_path: CSV file to read the URLs from.
        output_path: CSV file to write the augmented dataset to.
    """
    df = pd.read_csv(input_path)

    crawler = AsyncWebCrawler(verbose=True)

    # Update User-Agent to mimic legitimate browser requests
    crawler.crawler_strategy.update_user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")

    # Hook for cookies / authentication headers; currently none are sent
    crawler.crawler_strategy.set_custom_headers({})

    contents = []
    scraped_count = 0

    async with crawler as c:
        for index, url in enumerate(df['url'], start=1):
            try:
                # A distinct session id per URL, as in the original session-based crawl
                result = await c.arun(url=url, session_id=f"session_{index}")
            except Exception as e:
                # Best effort: record the failure and keep going with the next URL
                contents.append(f"Error: {str(e)}")
                logging.error(f"Error scraping URL {index}: {url} - {str(e)}")
                continue

            # The crawler reports failed fetches through the result's success
            # flag instead of raising, so a returned result is not necessarily
            # a successful scrape. getattr keeps the previous behaviour if the
            # installed crawl4ai version lacks these fields.
            if not getattr(result, "success", True):
                error_message = getattr(result, "error_message", None) or "crawl reported failure"
                contents.append(f"Error: {error_message}")
                logging.error(f"Error scraping URL {index}: {url} - {error_message}")
                continue

            contents.append(result.markdown)
            scraped_count += 1
            logging.info(f"Successfully scraped URL {index}: {url}")

    # Exactly one entry was appended per URL, so lengths match the frame
    df['content'] = contents

    df.to_csv(output_path, index=False)
    logging.info(f"Scraping completed. Total URLs scraped successfully: {scraped_count}/{len(df)}")

if not asyncio.get_event_loop().is_running():
    asyncio.run(main())
else:
    await main()

[LOG] 🌤️  Warming up the AsyncWebCrawler
[LOG] 🌞 AsyncWebCrawler is ready to crawl
[LOG] 🚀 Content extracted for https://www.cnn.com/2021/02/25/us/black-lives-matter-2020-donation-report-trnd/index.html, success: True, time taken: 0.16 seconds
[LOG] 🚀 Extraction done for https://www.cnn.com/2021/02/25/us/black-lives-matter-2020-donation-report-trnd/index.html, time taken: 0.16 seconds.
[LOG] 🚀 Content extracted for https://www.cnn.com/2020/06/23/politics/black-lives-matter-support-impact/index.html, success: True, time taken: 0.18 seconds
[LOG] 🚀 Extraction done for https://www.cnn.com/2020/06/23/politics/black-lives-matter-support-impact/index.html, time taken: 0.18 seconds.
[LOG] 🚀 Content extracted for https://www.cnn.com/2020/06/01/world/george-floyd-global-protests-intl/index.html, success: True, time taken: 0.28 seconds
[LOG] 🚀 Extraction done for https://www.cnn.com/2020/06/01/world/george-floyd-global-protests-intl/index.html, time taken: 0.28 seconds.
[LOG] 🚀 Content extracted

In [None]:
# Long-form dates such as "February 25, 2021".
_DATE_PATTERN = re.compile(
    r'(January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}'
)
# Phrases treated as the start of footer-like boilerplate. This is a heuristic
# and is known not to work well on every page.
_FOOTER_PATTERN = re.compile(r'values your feedback|Your browser does not support the')


def extract_date_and_content(text):
    """Extract the first long-form date and the main body from scraped text.

    Args:
        text: Scraped page text. Non-string values (e.g. ``None`` or a NaN
            from a missing CSV cell) are treated as empty text instead of
            raising ``TypeError``.

    Returns:
        A ``(date_extracted, content_extracted)`` tuple. ``date_extracted`` is
        the first match of "<Month name> <day>, <year>" or ``"Unknown"`` when
        none is found. ``content_extracted`` is the text before the first
        footer marker, stripped of surrounding whitespace.
    """
    if not isinstance(text, str):
        text = ''

    date_match = _DATE_PATTERN.search(text)
    date_extracted = date_match.group(0) if date_match else "Unknown"

    # Keep only what precedes the first footer marker (whole text if none)
    content_extracted = _FOOTER_PATTERN.split(text, maxsplit=1)[0].strip()

    return date_extracted, content_extracted

# The DataFrame built inside main() is local to that function, so reload the
# CSV it saved; this lets the cell run on a fresh kernel.
df = pd.read_csv('updated_dataset.csv')

# Fill NaN values in the 'content' column with empty strings to prevent errors
df['content'] = df['content'].fillna('')

# Run the extraction once per row and fan the (date, content) pairs out into
# two columns. Building plain lists avoids creating a Series per row and also
# works when the frame is empty.
extracted_pairs = [extract_date_and_content(text) for text in df['content']]
df['date_extracted'] = [date for date, _ in extracted_pairs]
df['content_extracted'] = [content for _, content in extracted_pairs]

# The saved dataset omits 'content_extracted'; it stays available on `df`.
df_without_content_extracted = df.drop(columns=['content_extracted'])

# NOTE(review): placeholder path — point this at a real, writable location
# before running, otherwise to_csv will fail.
output_file_path = '/path/to/output_dataset.csv'
df_without_content_extracted.to_csv(output_file_path, index=False)

print(f"Dataset processed and saved to {output_file_path}")