Skip to content

Commit

Permalink
up
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexander Pavlov committed Apr 18, 2023
1 parent e4b2f10 commit 3357c67
Show file tree
Hide file tree
Showing 5 changed files with 7 additions and 6 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@

namespace WebReaper.AzureFuncs
{
- public class ExoScraperSpider
+ public class WebReaperSpider
{
public IVisitedLinkTracker LinkTracker { get; }
public CosmosSink CosmosSink { get; }

- public ExoScraperSpider(IVisitedLinkTracker linkTracker, CosmosSink cosmosSink)
+ public WebReaperSpider(IVisitedLinkTracker linkTracker, CosmosSink cosmosSink)
{
LinkTracker = linkTracker;
CosmosSink = cosmosSink;
Expand Down
3 changes: 0 additions & 3 deletions WebReaper/Builders/ScraperEngineBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -328,9 +328,6 @@ public ScraperEngineBuilder WithParallelismDegree(int parallelismDegree)

public async Task<ScraperEngine> BuildAsync()
{
- await _visitedLinksTracker.Initialization;
- await Scheduler.Initialization;

SpiderBuilder.WithConfigStorage(ConfigStorage);

var config = ConfigBuilder.Build();
Expand Down
2 changes: 2 additions & 0 deletions WebReaper/Core/ScraperEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ public class ScraperEngine

public async Task RunAsync(CancellationToken cancellationToken = default)
{
+ await Scheduler.Initialization;

Logger.LogInformation("Start {class}.{method}", nameof(ScraperEngine), nameof(RunAsync));

var config = await ConfigStorage.GetConfigAsync();
Expand Down
2 changes: 2 additions & 0 deletions WebReaper/Core/Spider/Concrete/Spider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public class Spider : ISpider

public async Task<List<Job>> CrawlAsync(Job job, CancellationToken cancellationToken = default)
{
+ await LinkTracker.Initialization;

var config = await ScraperConfigStorage.GetConfigAsync();

if (config.UrlBlackList.Contains(job.Url)) return Enumerable.Empty<Job>().ToList();
Expand Down
2 changes: 1 addition & 1 deletion WebReaper/todo.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
- [X] Add flag for headless/not headless chrome
- [X] Clean up on start in sink, link tracker, scheduler
- [ ] Remove nested builders
- - [ ] ~~Use factory for async object creation~~
+ - [ ] Use factory for async object creation?
- [ ] Write tests using specflow
- [ ] Separate nuget packages for MongoDb, Cosmos Db, Redis, etc.
- [ ] Add support for xpath and regex
Expand Down

0 comments on commit 3357c67

Please sign in to comment.