Skip to content

Commit

Permalink
cleanup for visited links
Browse files Browse the repository at this point in the history
  • Loading branch information
Oleksandr Pavlov committed Apr 6, 2023
1 parent 14a3cb6 commit 60c1d86
Show file tree
Hide file tree
Showing 8 changed files with 72 additions and 16 deletions.
5 changes: 3 additions & 2 deletions Examples/WebReaper.ConsoleApplication/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@
.ScrollToEnd()
.RepeatWithDelay(10, 2000)
.Build())
.FollowWithBrowser("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE")
.Follow("a.SQnoC3ObvgnGjWt90zD9Z._2INHSNB8V5eaWp4P0rY_mE")
.Parse(new()
{
new("title", "._eYtD2XCVieq6emjKBH3m"),
new("text", "._3xX726aBn29LDbsDtzr_6E._1Ap4F5maDtT1E1YuCiaO0r.D3IL3FD0RFy_mkKLPwL4")
})
.WriteToJsonFile("output.json", dataCleanupOnStart: true)
.TrackVisitedLinksInFile("visited.txt", dataCleanupOnStart: true)
.LogToConsole()
.PageCrawlLimit(10)
.HeadlessMode(false)
.HeadlessMode(true)
.BuildAsync();

await engine.RunAsync();
Expand Down
10 changes: 9 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ var engine = await new ScraperEngineBuilder()
...
```

### How to clean prevously scraped data during the next web scrapping run
### How to clean scraped data from the previous web scraping run

You may want to clean the data received during the previous scraping to start your web scraping from scratch. In this case
use dataCleanupOnStart when adding a new sink:
Expand All @@ -172,6 +172,14 @@ var engine = await new ScraperEngineBuilder()

This dataCleanupOnStart parameter is present for all sinks, e.g. MongoDbSink, RedisSink, CosmosSink, etc.

### How to clean visited links from the previous web scraping run

```C#
var engine = await new ScraperEngineBuilder()
.Get("https://www.reddit.com/r/dotnet/")
.TrackVisitedLinksInFile("visited.txt", dataCleanupOnStart: true)
```

### Distributed web scraping with Serverless approach

In the Examples folder you can find the project called WebReaper.AzureFuncs. It demonstrates the use of WebReaper with
Expand Down
13 changes: 9 additions & 4 deletions WebReaper/Builders/ScraperEngineBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ namespace WebReaper.Builders;
/// </summary>
public class ScraperEngineBuilder
{
private IVisitedLinkTracker _visitedLinksTracker = new InMemoryVisitedLinkTracker();
private ConfigBuilder ConfigBuilder { get; } = new();
private SpiderBuilder SpiderBuilder { get; } = new();

Expand Down Expand Up @@ -73,15 +74,17 @@ public ScraperEngineBuilder WithLinkTracker(IVisitedLinkTracker linkTracker)
return this;
}

public ScraperEngineBuilder TrackVisitedLinksInFile(string fileName)
public ScraperEngineBuilder TrackVisitedLinksInFile(string fileName, bool dataCleanupOnStart = false)
{
SpiderBuilder.WithLinkTracker(new FileVisitedLinkedTracker(fileName));
_visitedLinksTracker = new FileVisitedLinkedTracker(fileName, dataCleanupOnStart);
SpiderBuilder.WithLinkTracker(_visitedLinksTracker);
return this;
}

public ScraperEngineBuilder TrackVisitedLinksInRedis(string connectionString, string redisKey)
public ScraperEngineBuilder TrackVisitedLinksInRedis(string connectionString, string redisKey, bool dataCleanupOnStart = false)
{
SpiderBuilder.WithLinkTracker(new RedisVisitedLinkTracker(connectionString, redisKey));
_visitedLinksTracker = new RedisVisitedLinkTracker(connectionString, redisKey, dataCleanupOnStart);
SpiderBuilder.WithLinkTracker(_visitedLinksTracker);
return this;
}

Expand Down Expand Up @@ -300,6 +303,8 @@ public ScraperEngineBuilder PostProcess(Func<Metadata, JObject, Task> action)

public async Task<ScraperEngine> BuildAsync()
{
await _visitedLinksTracker.Initialization;

SpiderBuilder.WithConfigStorage(ConfigStorage);

var config = ConfigBuilder.Build();
Expand Down
4 changes: 4 additions & 0 deletions WebReaper/Core/LinkTracker/Abstract/IVisitedLinkTracker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@ namespace WebReaper.Core.LinkTracker.Abstract;

/// <summary>
/// Tracks which links have already been visited during a crawl, so the same
/// page is not scraped twice. Implementations may persist the set (file, Redis)
/// or keep it in memory.
/// </summary>
public interface IVisitedLinkTracker
{
/// <summary>
/// When true, the implementation is expected to discard any previously
/// persisted visited-link data during initialization, so the next run
/// starts from scratch.
/// </summary>
public bool DataCleanupOnStart { get; set; }

/// <summary>Records <paramref name="visitedLink"/> as visited.</summary>
Task AddVisitedLinkAsync(string visitedLink);
/// <summary>Returns all links recorded as visited so far.</summary>
Task<List<string>> GetVisitedLinksAsync();
/// <summary>Filters <paramref name="links"/> down to those not yet visited.</summary>
Task<List<string>> GetNotVisitedLinks(IEnumerable<string> links);
/// <summary>Returns the number of links recorded as visited.</summary>
Task<long> GetVisitedLinksCount();

/// <summary>
/// Completes when asynchronous initialization (e.g. loading or cleaning the
/// persisted link store) has finished. Callers should await this before use
/// (asynchronous-initialization pattern). NOTE(review): implementations must
/// assign this in their constructor; consider whether a failed initialization
/// task should be surfaced here or rethrown — confirm with implementations.
/// </summary>
Task Initialization { get; }
}
29 changes: 22 additions & 7 deletions WebReaper/Core/LinkTracker/Concrete/FileVisitedLinkedTracker.cs
Original file line number Diff line number Diff line change
@@ -1,28 +1,43 @@
using System.Collections.Concurrent;
using Microsoft.Azure.Amqp.Framing;
using WebReaper.Core.LinkTracker.Abstract;

namespace WebReaper.Core.LinkTracker.Concrete;

public class FileVisitedLinkedTracker : IVisitedLinkTracker
{
public bool DataCleanupOnStart { get; set; }
public Task Initialization { get; }

private readonly string _fileName;

private readonly SemaphoreSlim _semaphore = new(1, 1);
private readonly ConcurrentBag<string> _visitedLinks;

public FileVisitedLinkedTracker(string fileName)
private ConcurrentBag<string> _visitedLinks;
public FileVisitedLinkedTracker(string fileName, bool dataCleanupOnStart = false)
{
_fileName = fileName;

if (!File.Exists(fileName))
DataCleanupOnStart = dataCleanupOnStart;

Initialization = InitializeAsync();
}

private async Task InitializeAsync()
{
if (DataCleanupOnStart)
{
File.Delete(_fileName);
}

if (!File.Exists(_fileName))
{
_visitedLinks = new ConcurrentBag<string>();
var file = File.Create(fileName);
var file = File.Create(_fileName);
file.Close();
return;
}

var allLinks = File.ReadLines(fileName);
var allLinks = File.ReadLines(_fileName);
_visitedLinks = new ConcurrentBag<string>(allLinks);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ namespace WebReaper.Core.LinkTracker.Concrete;

public class InMemoryVisitedLinkTracker : IVisitedLinkTracker
{
public bool DataCleanupOnStart { get; set; }

private ImmutableHashSet<string> visitedUrls = ImmutableHashSet.Create<string>();

public Task AddVisitedLinkAsync(string visitedLink)
Expand All @@ -13,7 +15,7 @@ public Task AddVisitedLinkAsync(string visitedLink)

return Task.CompletedTask;
}

public Task<List<string>> GetVisitedLinksAsync()
{
return Task.FromResult(visitedUrls.ToList());
Expand All @@ -28,4 +30,6 @@ public Task<long> GetVisitedLinksCount()
{
return Task.FromResult((long)visitedUrls.Count);
}

public Task Initialization => Task.CompletedTask;
}
19 changes: 18 additions & 1 deletion WebReaper/Core/LinkTracker/Concrete/RedisVisitedLinkTracker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,29 @@ namespace WebReaper.Core.LinkTracker.Concrete;
public class RedisVisitedLinkTracker : RedisBase, IVisitedLinkTracker
{
private readonly string _redisKey;

public bool DataCleanupOnStart { get; set; }

public RedisVisitedLinkTracker(string connectionString, string redisKey) : base(connectionString)
public RedisVisitedLinkTracker(string connectionString, string redisKey, bool dataCleanupOnStart = false)
: base(connectionString)
{
_redisKey = redisKey;
Initialization = InitializeAsync();
DataCleanupOnStart = dataCleanupOnStart;
}

public Task Initialization { get; }

private async Task InitializeAsync()
{
if (!DataCleanupOnStart)
return;

var db = Redis.GetDatabase();

await db.KeyDeleteAsync(_redisKey);
}

public async Task AddVisitedLinkAsync(string visitedLink)
{
var db = Redis!.GetDatabase();
Expand Down
2 changes: 2 additions & 0 deletions WebReaper/Sinks/Concrete/CsvFileSink.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,12 @@ private async Task Init(JObject scrapedData, CancellationToken cancellationToken
return;

if (DataCleanupOnStart)
{
lock (_lock)
{
File.Delete(filePath);
}
}

var flattened = scrapedData
.Descendants()
Expand Down

0 comments on commit 60c1d86

Please sign in to comment.