From 60c1d86e6b7e35eeae8bd0250aab54bfd5184b7e Mon Sep 17 00:00:00 2001 From: Oleksandr Pavlov Date: Thu, 6 Apr 2023 23:59:59 +0300 Subject: [PATCH] cleanup for visited links --- .../WebReaper.ConsoleApplication/Program.cs | 5 ++-- README.md | 10 ++++++- WebReaper/Builders/ScraperEngineBuilder.cs | 13 ++++++--- .../Abstract/IVisitedLinkTracker.cs | 4 +++ .../Concrete/FileVisitedLinkedTracker.cs | 29 ++++++++++++++----- .../Concrete/InMemoryVisitedLinkTracker.cs | 6 +++- .../Concrete/RedisVisitedLinkTracker.cs | 19 +++++++++++- WebReaper/Sinks/Concrete/CsvFileSink.cs | 2 ++ 8 files changed, 72 insertions(+), 16 deletions(-) diff --git a/Examples/WebReaper.ConsoleApplication/Program.cs b/Examples/WebReaper.ConsoleApplication/Program.cs index 618d978..91e8cd9 100644 --- a/Examples/WebReaper.ConsoleApplication/Program.cs +++ b/Examples/WebReaper.ConsoleApplication/Program.cs @@ -7,16 +7,17 @@ .ScrollToEnd() .RepeatWithDelay(10, 2000) .Build()) - .FollowWithBrowser("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE") + .Follow("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE") .Parse(new() { new("title", "._eYtD2XCVieq6emjKBH3m"), new("text", "._3xX726aBn29LDbsDtzr_6E._1Ap4F5maDtT1E1YuCiaO0r.D3IL3FD0RFy_mkKLPwL4") }) .WriteToJsonFile("output.json", dataCleanupOnStart: true) + .TrackVisitedLinksInFile("visited.txt", dataCleanupOnStart: true) .LogToConsole() .PageCrawlLimit(10) - .HeadlessMode(false) + .HeadlessMode(true) .BuildAsync(); await engine.RunAsync(); diff --git a/README.md b/README.md index 450c23b..31b36fc 100644 --- a/README.md +++ b/README.md @@ -158,7 +158,7 @@ var engine = await new ScraperEngineBuilder() ... ``` -### How to clean prevously scraped data during the next web scrapping run +### How to clean scraped data from the previous web scrapping run You may want to clean the data recived during the previous scraping to start you web scraping from scratch. 
In this case use dataCleanupOnStart when adding a new sink: @@ -172,6 +172,14 @@ var engine = await new ScraperEngineBuilder() This dataCleanupOnStart parameter is present for all sinks, e.g. MongoDbSink, RedisSink, CosmosSink, etc. +### How to clean visited links from the previous web scraping run + +```C# +var engine = await new ScraperEngineBuilder() + .Get("https://www.reddit.com/r/dotnet/") + .TrackVisitedLinksInFile("visited.txt", dataCleanupOnStart: true) +``` + ### Distributed web scraping with Serverless approach In the Examples folder you can find the project called WebReaper.AzureFuncs. It demonstrates the use of WebReaper with diff --git a/WebReaper/Builders/ScraperEngineBuilder.cs b/WebReaper/Builders/ScraperEngineBuilder.cs index db3e615..a3ea21c 100644 --- a/WebReaper/Builders/ScraperEngineBuilder.cs +++ b/WebReaper/Builders/ScraperEngineBuilder.cs @@ -27,6 +27,7 @@ namespace WebReaper.Builders; /// public class ScraperEngineBuilder { + private IVisitedLinkTracker _visitedLinksTracker = new InMemoryVisitedLinkTracker(); private ConfigBuilder ConfigBuilder { get; } = new(); private SpiderBuilder SpiderBuilder { get; } = new(); @@ -73,15 +74,17 @@ public ScraperEngineBuilder WithLinkTracker(IVisitedLinkTracker linkTracker) return this; } - public ScraperEngineBuilder TrackVisitedLinksInFile(string fileName) + public ScraperEngineBuilder TrackVisitedLinksInFile(string fileName, bool dataCleanupOnStart = false) { - SpiderBuilder.WithLinkTracker(new FileVisitedLinkedTracker(fileName)); + _visitedLinksTracker = new FileVisitedLinkedTracker(fileName, dataCleanupOnStart); + SpiderBuilder.WithLinkTracker(_visitedLinksTracker); return this; } - public ScraperEngineBuilder TrackVisitedLinksInRedis(string connectionString, string redisKey) + public ScraperEngineBuilder TrackVisitedLinksInRedis(string connectionString, string redisKey, bool dataCleanupOnStart = false) { - SpiderBuilder.WithLinkTracker(new RedisVisitedLinkTracker(connectionString, redisKey)); + 
_visitedLinksTracker = new RedisVisitedLinkTracker(connectionString, redisKey, dataCleanupOnStart); + SpiderBuilder.WithLinkTracker(_visitedLinksTracker); return this; } @@ -300,6 +303,8 @@ public ScraperEngineBuilder PostProcess(Func action) public async Task BuildAsync() { + await _visitedLinksTracker.Initialization; + SpiderBuilder.WithConfigStorage(ConfigStorage); var config = ConfigBuilder.Build(); diff --git a/WebReaper/Core/LinkTracker/Abstract/IVisitedLinkTracker.cs b/WebReaper/Core/LinkTracker/Abstract/IVisitedLinkTracker.cs index d74623d..0793da1 100644 --- a/WebReaper/Core/LinkTracker/Abstract/IVisitedLinkTracker.cs +++ b/WebReaper/Core/LinkTracker/Abstract/IVisitedLinkTracker.cs @@ -2,8 +2,12 @@ namespace WebReaper.Core.LinkTracker.Abstract; public interface IVisitedLinkTracker { + public bool DataCleanupOnStart { get; set; } + Task AddVisitedLinkAsync(string visitedLink); Task> GetVisitedLinksAsync(); Task> GetNotVisitedLinks(IEnumerable links); Task GetVisitedLinksCount(); + + Task Initialization { get; } } \ No newline at end of file diff --git a/WebReaper/Core/LinkTracker/Concrete/FileVisitedLinkedTracker.cs b/WebReaper/Core/LinkTracker/Concrete/FileVisitedLinkedTracker.cs index 8d7c63e..1218b67 100644 --- a/WebReaper/Core/LinkTracker/Concrete/FileVisitedLinkedTracker.cs +++ b/WebReaper/Core/LinkTracker/Concrete/FileVisitedLinkedTracker.cs @@ -1,28 +1,42 @@ using System.Collections.Concurrent; using WebReaper.Core.LinkTracker.Abstract; namespace WebReaper.Core.LinkTracker.Concrete; public class FileVisitedLinkedTracker : IVisitedLinkTracker { + public bool DataCleanupOnStart { get; set; } + public Task Initialization { get; } + private readonly string _fileName; private readonly SemaphoreSlim _semaphore = new(1, 1); - private readonly ConcurrentBag _visitedLinks; - - public FileVisitedLinkedTracker(string fileName) + private ConcurrentBag _visitedLinks; + + public FileVisitedLinkedTracker(string fileName, bool 
dataCleanupOnStart = false) { _fileName = fileName; - - if (!File.Exists(fileName)) + DataCleanupOnStart = dataCleanupOnStart; + + Initialization = InitializeAsync(); + } + + private async Task InitializeAsync() + { + if (DataCleanupOnStart) + { + File.Delete(_fileName); + } + + if (!File.Exists(_fileName)) { _visitedLinks = new ConcurrentBag(); - var file = File.Create(fileName); + var file = File.Create(_fileName); file.Close(); return; } - var allLinks = File.ReadLines(fileName); + var allLinks = File.ReadLines(_fileName); _visitedLinks = new ConcurrentBag(allLinks); } diff --git a/WebReaper/Core/LinkTracker/Concrete/InMemoryVisitedLinkTracker.cs b/WebReaper/Core/LinkTracker/Concrete/InMemoryVisitedLinkTracker.cs index e531198..a4708e6 100644 --- a/WebReaper/Core/LinkTracker/Concrete/InMemoryVisitedLinkTracker.cs +++ b/WebReaper/Core/LinkTracker/Concrete/InMemoryVisitedLinkTracker.cs @@ -5,6 +5,8 @@ namespace WebReaper.Core.LinkTracker.Concrete; public class InMemoryVisitedLinkTracker : IVisitedLinkTracker { + public bool DataCleanupOnStart { get; set; } + private ImmutableHashSet visitedUrls = ImmutableHashSet.Create(); public Task AddVisitedLinkAsync(string visitedLink) @@ -13,7 +15,7 @@ public Task AddVisitedLinkAsync(string visitedLink) return Task.CompletedTask; } - + public Task> GetVisitedLinksAsync() { return Task.FromResult(visitedUrls.ToList()); @@ -28,4 +30,6 @@ public Task GetVisitedLinksCount() { return Task.FromResult((long)visitedUrls.Count); } + + public Task Initialization => Task.CompletedTask; } \ No newline at end of file diff --git a/WebReaper/Core/LinkTracker/Concrete/RedisVisitedLinkTracker.cs b/WebReaper/Core/LinkTracker/Concrete/RedisVisitedLinkTracker.cs index f82560b..1a85608 100644 --- a/WebReaper/Core/LinkTracker/Concrete/RedisVisitedLinkTracker.cs +++ b/WebReaper/Core/LinkTracker/Concrete/RedisVisitedLinkTracker.cs @@ -6,12 +6,29 @@ namespace WebReaper.Core.LinkTracker.Concrete; public class RedisVisitedLinkTracker : RedisBase, 
IVisitedLinkTracker { private readonly string _redisKey; + + public bool DataCleanupOnStart { get; set; } - public RedisVisitedLinkTracker(string connectionString, string redisKey) : base(connectionString) + public RedisVisitedLinkTracker(string connectionString, string redisKey, bool dataCleanupOnStart = false) + : base(connectionString) { _redisKey = redisKey; + DataCleanupOnStart = dataCleanupOnStart; + Initialization = InitializeAsync(); } + public Task Initialization { get; } + + private async Task InitializeAsync() + { + if (!DataCleanupOnStart) + return; + + var db = Redis.GetDatabase(); + + await db.KeyDeleteAsync(_redisKey); + } + public async Task AddVisitedLinkAsync(string visitedLink) { var db = Redis!.GetDatabase(); diff --git a/WebReaper/Sinks/Concrete/CsvFileSink.cs b/WebReaper/Sinks/Concrete/CsvFileSink.cs index 7cbec6a..5ce8681 100644 --- a/WebReaper/Sinks/Concrete/CsvFileSink.cs +++ b/WebReaper/Sinks/Concrete/CsvFileSink.cs @@ -38,10 +38,12 @@ private async Task Init(JObject scrapedData, CancellationToken cancellationToken return; if (DataCleanupOnStart) + { lock (_lock) { File.Delete(filePath); } + } var flattened = scrapedData .Descendants()