Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ard: use experimental topic search #959

Merged
merged 3 commits into from Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/main/java/mServer/crawler/sender/ard/ArdConstants.java
Expand Up @@ -7,11 +7,13 @@ public class ArdConstants {

public static final String ITEM_URL = API_URL + "/page-gateway/pages/ard/item/";

public static final String TOPICS_URL = API_URL + "/page-gateway/pages/%s/shows/";
public static final String TOPICS_URL = API_URL + "/page-gateway/pages/%s/editorial/experiment-a-z?embedded=false";
public static final String TOPICS_COMPILATION_URL = API_URL + "/page-gateway/widgets/%s/editorials/%s?pageNumber=0&pageSize=%s";
public static final String TOPIC_URL = API_URL + "/page-gateway/widgets/ard/asset/%s?pageSize=%d";
public static final String DAY_PAGE_URL = API_URL + "/page-gateway/compilations/%s/pastbroadcasts?startDateTime=%sT00:00:00.000Z&endDateTime=%sT23:59:59.000Z&pageNumber=0&pageSize=%d";

public static final int DAY_PAGE_SIZE = 100;
public static final int TOPICS_COMPILATION_PAGE_SIZE = 200;
public static final int TOPIC_PAGE_SIZE = 50;

public static final String DEFAULT_CLIENT = "ard";
Expand Down
31 changes: 16 additions & 15 deletions src/main/java/mServer/crawler/sender/ard/ArdCrawler.java
Expand Up @@ -6,10 +6,7 @@
import mServer.crawler.CrawlerTool;
import mServer.crawler.FilmeSuchen;
import mServer.crawler.sender.MediathekCrawler;
import mServer.crawler.sender.ard.tasks.ArdDayPageTask;
import mServer.crawler.sender.ard.tasks.ArdFilmDetailTask;
import mServer.crawler.sender.ard.tasks.ArdTopicPageTask;
import mServer.crawler.sender.ard.tasks.ArdTopicsOverviewTask;
import mServer.crawler.sender.ard.tasks.*;
import mServer.crawler.sender.base.CrawlerUrlDTO;

import java.time.LocalDateTime;
Expand All @@ -22,13 +19,12 @@

public class ArdCrawler extends MediathekCrawler {

public static final String SENDERNAME = Const.ARD;
private static final int MAX_DAYS_PAST = 2;
private static final int MAX_DAYS_PAST_AVAILABLE = 6;
private static final DateTimeFormatter DAY_PAGE_DATE_FORMATTER
= DateTimeFormatter.ofPattern("yyyy-MM-dd");

public static final String SENDERNAME = Const.ARD;

public ArdCrawler(FilmeSuchen ssearch, int startPrio) {
super(ssearch, SENDERNAME, 0, 1, startPrio);
}
Expand Down Expand Up @@ -73,13 +69,13 @@ private void addDayUrls(ConcurrentLinkedQueue<CrawlerUrlDTO> dayUrlsToCrawl, Loc
}

private void addSpecialDays(
ConcurrentLinkedQueue<CrawlerUrlDTO> dayUrlsToCrawl) {
final LocalDateTime[] specialDates = new LocalDateTime[] {
ConcurrentLinkedQueue<CrawlerUrlDTO> dayUrlsToCrawl) {
final LocalDateTime[] specialDates = new LocalDateTime[]{
};

final LocalDateTime minDayOnline = LocalDateTime.now().minusDays(MAX_DAYS_PAST_AVAILABLE);

for(LocalDateTime specialDate : specialDates) {
for (LocalDateTime specialDate : specialDates) {
if (specialDate.isAfter(minDayOnline)) {
addDayUrls(dayUrlsToCrawl, specialDate);
}
Expand All @@ -95,7 +91,7 @@ protected RecursiveTask<Set<DatenFilm>> createCrawlerTask() {
if (CrawlerTool.loadLongMax()) {
shows.addAll(getTopicsEntries());
}

Log.sysLog("ARD Anzahl topics: " + shows.size());
getDaysEntries().forEach(show -> {
if (!shows.contains(show)) {
shows.add(show);
Expand Down Expand Up @@ -125,20 +121,25 @@ private Set<ArdFilmInfoDto> getTopicsEntries() throws ExecutionException, Interr
topics.addAll(getTopicEntriesBySender(client));
}

Log.sysLog("ard mediathek topics: " + topics.size());
ConcurrentLinkedQueue<CrawlerUrlDTO> topicUrls = new ConcurrentLinkedQueue<>(topics);

final ArdTopicPageTask topicTask = new ArdTopicPageTask(this, topicUrls);
final Set<ArdFilmInfoDto> filmInfos = forkJoinPool.submit(topicTask).get();
Log.sysLog("ard shows by topics: " + filmInfos.size());
return filmInfos;
}

private ConcurrentLinkedQueue<CrawlerUrlDTO> getTopicEntriesBySender(final String sender) throws ExecutionException, InterruptedException {
ArdTopicsOverviewTask topicsTask
= new ArdTopicsOverviewTask(this, createTopicsOverviewUrl(sender));
private Set<CrawlerUrlDTO> getTopicEntriesBySender(final String sender) throws ExecutionException, InterruptedException {
ArdTopicsTask topicsTask
= new ArdTopicsTask(this, sender, createTopicsOverviewUrl(sender));

ConcurrentLinkedQueue<CrawlerUrlDTO> queue = new ConcurrentLinkedQueue<>(forkJoinPool.submit(topicsTask).get());
Log.sysLog(sender + " topic entries: " + queue.size());
return queue;
Log.sysLog(sender + " topics task entries: " + queue.size());

final Set<CrawlerUrlDTO> topicUrls = forkJoinPool.submit(new ArdTopicsLetterTask(this, sender, queue)).get();
Log.sysLog(sender + " topics: " + topicUrls.size());
return topicUrls;
}

private ConcurrentLinkedQueue<CrawlerUrlDTO> createTopicsOverviewUrl(final String client) {
Expand Down
40 changes: 40 additions & 0 deletions src/main/java/mServer/crawler/sender/ard/PaginationUrlDto.java
@@ -0,0 +1,40 @@
package mServer.crawler.sender.ard;

import mServer.crawler.sender.base.CrawlerUrlDTO;

import java.util.HashSet;
import java.util.Set;

/**
 * Transfer object for a paginated crawl result: the URLs collected from one
 * response page plus the pagination state (current page index and the total
 * number of pages) needed to decide whether further pages must be requested.
 */
public class PaginationUrlDto {
  // URLs gathered so far; HashSet deduplicates repeated entries.
  private final Set<CrawlerUrlDTO> collectedUrls = new HashSet<>();
  // Zero-based index of the page this DTO was parsed from.
  private int actualPage;
  // Total number of pages reported (or derived) for the whole result set.
  private int maxPages;

  /** Adds a single URL to the collected set. */
  public void addUrl(CrawlerUrlDTO url) {
    collectedUrls.add(url);
  }

  /** Adds every URL of the given set to the collected set. */
  public void addAll(Set<CrawlerUrlDTO> urls) {
    collectedUrls.addAll(urls);
  }

  /** Returns the live (mutable) set of collected URLs. */
  public Set<CrawlerUrlDTO> getUrls() {
    return collectedUrls;
  }

  /** Returns the zero-based index of the current page. */
  public int getActualPage() {
    return actualPage;
  }

  /** Returns the total page count. */
  public int getMaxPages() {
    return maxPages;
  }

  /** Sets the zero-based index of the current page. */
  public void setActualPage(int actualPage) {
    this.actualPage = actualPage;
  }

  /** Sets the total page count. */
  public void setMaxPages(int maxPages) {
    this.maxPages = maxPages;
  }
}
@@ -0,0 +1,62 @@
package mServer.crawler.sender.ard.json;


import com.google.gson.JsonArray;
import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import mServer.crawler.sender.ard.ArdConstants;
import mServer.crawler.sender.base.CrawlerUrlDTO;
import mServer.crawler.sender.base.JsonUtils;

import java.lang.reflect.Type;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;

/**
 * Gson deserializer for the ARD topics ("experiment A-Z") overview page.
 * Each widget in the response carries a self-link whose id identifies an
 * editorial compilation; for every such id a {@link CrawlerUrlDTO} pointing
 * at {@link ArdConstants#TOPICS_COMPILATION_URL} is produced.
 */
public class ArdTopicsDeserializer implements JsonDeserializer<Set<CrawlerUrlDTO>> {
  private static final String ELEMENT_WIDGETS = "widgets";
  private static final String ELEMENT_LINKS = "links";
  private static final String ELEMENT_SELF = "self";

  private static final String ATTRIBUTE_ID = "id";

  // Partner/client name ("ard", "swr", ...) substituted into the compilation URL.
  private final String sender;

  public ArdTopicsDeserializer(String sender) {
    this.sender = sender;
  }

  /**
   * Parses the widget array of the overview page into compilation URLs.
   *
   * @return one URL per widget that exposes a self-link id; empty set when the
   *     response has no {@code widgets} array.
   */
  @Override
  public Set<CrawlerUrlDTO> deserialize(
      JsonElement jsonElement, Type type, JsonDeserializationContext jsonDeserializationContext) {
    final Set<CrawlerUrlDTO> result = new HashSet<>();

    if (!JsonUtils.hasElements(jsonElement, ELEMENT_WIDGETS)) {
      return result;
    }

    final JsonArray widgets = jsonElement.getAsJsonObject().getAsJsonArray(ELEMENT_WIDGETS);
    for (final JsonElement widget : widgets) {
      final Optional<CrawlerUrlDTO> url = parseWidget(widget.getAsJsonObject());
      url.ifPresent(result::add);
    }

    return result;
  }

  /**
   * Extracts the compilation id from a widget's {@code links.self} element and
   * builds the compilation URL from it.
   */
  private Optional<CrawlerUrlDTO> parseWidget(final JsonElement compilation) {
    if (!JsonUtils.hasElements(compilation, ELEMENT_LINKS)) {
      return Optional.empty();
    }

    final JsonElement selfLink =
        compilation.getAsJsonObject().get(ELEMENT_LINKS).getAsJsonObject().get(ELEMENT_SELF);

    return JsonUtils.getAttributeAsString(selfLink.getAsJsonObject(), ATTRIBUTE_ID)
        .map(
            id ->
                new CrawlerUrlDTO(
                    String.format(
                        ArdConstants.TOPICS_COMPILATION_URL,
                        sender,
                        id,
                        ArdConstants.TOPICS_COMPILATION_PAGE_SIZE)));
  }
}
@@ -0,0 +1,91 @@
package mServer.crawler.sender.ard.json;

import com.google.gson.JsonDeserializationContext;
import com.google.gson.JsonDeserializer;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import mServer.crawler.sender.ard.ArdConstants;
import mServer.crawler.sender.ard.PaginationUrlDto;
import mServer.crawler.sender.base.CrawlerUrlDTO;
import mServer.crawler.sender.base.JsonUtils;

import java.lang.reflect.Type;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;

/**
 * Gson deserializer for one "letter" page of the ARD topics A-Z listing.
 * Collects a topic URL for every teaser and fills in pagination info
 * (current page index and computed total page count) so the caller can
 * request the remaining pages.
 */
public class ArdTopicsLetterDeserializer implements JsonDeserializer<PaginationUrlDto> {

  private static final String ELEMENT_TEASERS = "teasers";
  private static final String ELEMENT_LINKS = "links";
  private static final String ELEMENT_TARGET = "target";
  private static final String ELEMENT_PAGE_NUMBER = "pageNumber";
  private static final String ELEMENT_TOTAL_ELEMENTS = "totalElements";
  private static final String ELEMENT_PAGE_SIZE = "pageSize";
  private static final String ELEMENT_PAGINATION = "pagination";

  private static final String ATTRIBUTE_ID = "id";

  /**
   * Parses teasers and pagination of a letter page.
   *
   * @return a {@link PaginationUrlDto} with one topic URL per teaser id; empty
   *     (page numbers 0) when the page carries no teasers.
   */
  @Override
  public PaginationUrlDto deserialize(
      final JsonElement jsonElement, final Type type, final JsonDeserializationContext context) {
    final PaginationUrlDto results = new PaginationUrlDto();

    if (!jsonElement.getAsJsonObject().has(ELEMENT_TEASERS)
        || !jsonElement.getAsJsonObject().get(ELEMENT_TEASERS).isJsonArray()
        || jsonElement.getAsJsonObject().getAsJsonArray(ELEMENT_TEASERS).isEmpty()) {
      return results;
    }

    jsonElement.getAsJsonObject().getAsJsonArray(ELEMENT_TEASERS).forEach(teaser -> results.addAll(parseTeaser(teaser.getAsJsonObject())));

    final JsonElement paginationElement = jsonElement.getAsJsonObject().get(ELEMENT_PAGINATION);
    results.setActualPage(getChildElementAsIntOrZero(paginationElement, ELEMENT_PAGE_NUMBER));
    final int totalElements = getChildElementAsIntOrZero(paginationElement, ELEMENT_TOTAL_ELEMENTS);
    final int pageSize = getChildElementAsIntOrZero(paginationElement, ELEMENT_PAGE_SIZE);
    // Ceiling division: totalElements / pageSize rounded up; 0 when pageSize is
    // missing or 0 (also avoids division by zero).
    final int maxPages = pageSize == 0 ? 0 : (totalElements + pageSize - 1) / pageSize;
    results.setMaxPages(maxPages);

    return results;
  }

  /**
   * Reads {@code parentElement.childElementName} as int, falling back to 0 when
   * the parent is absent/null or the child does not exist.
   */
  private int getChildElementAsIntOrZero(
      final JsonElement parentElement, final String childElementName) {
    if (parentElement == null || parentElement.isJsonNull()) {
      return 0;
    }
    return getJsonElementAsIntOrZero(parentElement.getAsJsonObject().get(childElementName));
  }

  /** Returns the element's int value, or 0 when the element is missing or JSON null. */
  private int getJsonElementAsIntOrZero(final JsonElement element) {
    // BUGFIX: JsonObject.get(...) returns Java null (not JsonNull) for a missing
    // key, so the null check is required to avoid an NPE on element.isJsonNull().
    if (element == null || element.isJsonNull()) {
      return 0;
    }
    return element.getAsInt();
  }

  /**
   * Extracts the topic id from one teaser and maps it to a topic URL.
   * Prefers the id of {@code links.target}; falls back to the teaser's own id.
   */
  private Set<CrawlerUrlDTO> parseTeaser(final JsonObject teaserObject) {
    final Set<CrawlerUrlDTO> results = new HashSet<>();

    final Optional<String> id;

    if (JsonUtils.checkTreePath(teaserObject, ELEMENT_LINKS, ELEMENT_TARGET)) {
      final JsonObject targetObject =
          teaserObject.get(ELEMENT_LINKS).getAsJsonObject().get(ELEMENT_TARGET).getAsJsonObject();
      id = JsonUtils.getAttributeAsString(targetObject, ATTRIBUTE_ID);
    } else {
      id = JsonUtils.getAttributeAsString(teaserObject, ATTRIBUTE_ID);
    }

    id.ifPresent(
        nonNullId ->
            results.add(
                new CrawlerUrlDTO(
                    String.format(
                        ArdConstants.TOPIC_URL, nonNullId, ArdConstants.TOPIC_PAGE_SIZE))));

    return results;
  }
}

This file was deleted.