diff --git a/crawler/src/Db/Post/ThreadPost.cs b/crawler/src/Db/Post/ThreadPost.cs index c2ba0d0d..587601e1 100644 --- a/crawler/src/Db/Post/ThreadPost.cs +++ b/crawler/src/Db/Post/ThreadPost.cs @@ -1,6 +1,4 @@ // ReSharper disable UnusedAutoPropertyAccessor.Global - -using Google.Protobuf.Collections; using TbClient.Post.Common; namespace tbm.Crawler.Db.Post diff --git a/crawler/src/GlobalUsings.cs b/crawler/src/GlobalUsings.cs index d25394a4..0875b3db 100644 --- a/crawler/src/GlobalUsings.cs +++ b/crawler/src/GlobalUsings.cs @@ -2,6 +2,7 @@ global using Autofac.Extensions.DependencyInjection; global using Autofac.Features.Indexed; global using Google.Protobuf; +global using Google.Protobuf.Collections; global using LinqKit; global using NLog; global using NLog.Extensions.Logging; diff --git a/crawler/src/Helper.cs b/crawler/src/Helper.cs index 751c22af..f7f3f0ef 100644 --- a/crawler/src/Helper.cs +++ b/crawler/src/Helper.cs @@ -1,6 +1,5 @@ using System.Text.Encodings.Web; using System.Text.Unicode; -using Google.Protobuf.Collections; namespace tbm.Crawler { diff --git a/crawler/src/Tieba/ClientRequester.cs b/crawler/src/Tieba/ClientRequester.cs index 32ca5384..4c0a8b92 100644 --- a/crawler/src/Tieba/ClientRequester.cs +++ b/crawler/src/Tieba/ClientRequester.cs @@ -24,10 +24,11 @@ public class ClientRequester return doc.RootElement.Clone(); }); - public Task RequestProtoBuf - (string url, string clientVersion, PropertyInfo paramDataProp, PropertyInfo paramCommonProp, Func responseFactory, TRequest param) + public Task RequestProtoBuf + (string url, string clientVersion, TRequest requestParam, + Action setCommonParamOnRequest, Func responseFactory) where TRequest : IMessage where TResponse : IMessage => - Request(() => PostProtoBuf(url, clientVersion, param, paramDataProp, paramCommonProp), stream => + Request(() => PostProtoBuf(url, clientVersion, requestParam, setCommonParamOnRequest), stream => { try { @@ -63,14 +64,14 @@ private static async Task Request(Func> requeste } } - private Task PostJson(string url, Dictionary data, string clientVersion) + private Task PostJson(string url, Dictionary param, string clientVersion) { var postData = new Dictionary { {"_client_id", $"wappc_{Rand.NextLong(1000000000000, 9999999999999)}_{Rand.Next(100, 999)}"}, {"_client_type", "2"}, {"_client_version", clientVersion} - }.Concat(data).ToList(); + }.Concat(param).ToList(); var sign = postData.Aggregate("", (acc, i) => { acc += i.Key + '=' + i.Value; @@ -80,15 +81,18 @@ private Task PostJson(string url, Dictionary _http.PostAsync(url, new FormUrlEncodedContent(postData)), - () => _logger.LogTrace("POST {} {}", url, data)); + () => _logger.LogTrace("POST {} {}", url, param)); } - private Task PostProtoBuf(string url, string clientVersion, IMessage paramProtoBuf, PropertyInfo dataProp, PropertyInfo commonProp) + private Task PostProtoBuf + (string url, string clientVersion, TRequest requestParam, + Action setCommonParamOnRequest) + where TRequest : IMessage { - commonProp.SetValue(dataProp.GetValue(paramProtoBuf), new Common {ClientVersion = clientVersion}); + setCommonParamOnRequest(requestParam, new() {ClientVersion = clientVersion}); // https://github.com/dotnet/runtime/issues/22996, http://test.greenbytes.de/tech/tc2231 - var protoBufFile = new ByteArrayContent(paramProtoBuf.ToByteArray()); + var protoBufFile = new ByteArrayContent(requestParam.ToByteArray()); protoBufFile.Headers.Add("Content-Disposition", "form-data; name=\"data\"; filename=\"file\""); var content = new MultipartFormDataContent {protoBufFile}; // https://stackoverflow.com/questions/30926645/httpcontent-boundary-double-quotes @@ -102,7 +106,7 @@ private Task PostProtoBuf(string url, string clientVersion, request.Headers.Connection.Add("keep-alive"); return Post(() => _http.SendAsync(request), - () => _logger.LogTrace("POST {} {}", url, paramProtoBuf)); + () => _logger.LogTrace("POST {} {}", url, requestParam)); } private Task Post(Func> responseTaskFactory, Action logTraceCallback) diff --git a/crawler/src/Tieba/Crawl/Crawler/BaseCrawler.cs b/crawler/src/Tieba/Crawl/Crawler/BaseCrawler.cs index c2e64d17..cb8a8ac6 100644 --- a/crawler/src/Tieba/Crawl/Crawler/BaseCrawler.cs +++ b/crawler/src/Tieba/Crawl/Crawler/BaseCrawler.cs @@ -5,35 +5,32 @@ public abstract class BaseCrawler { public record Response(TResponse Result, CrawlRequestFlag Flag = CrawlRequestFlag.None); protected record Request(Task Response, Page Page, CrawlRequestFlag Flag = CrawlRequestFlag.None); + protected ClientRequester Requester { get; } - protected abstract PropertyInfo ParamDataProp { get; } - protected abstract PropertyInfo ParamCommonProp { get; } - protected abstract PropertyInfo ResponseDataProp { get; } - protected abstract PropertyInfo ResponsePostListProp { get; } - protected abstract PropertyInfo ResponsePageProp { get; } - protected abstract PropertyInfo ResponseErrorProp { get; } + + protected BaseCrawler(ClientRequester requester) => Requester = requester; public abstract Exception FillExceptionData(Exception e); + protected abstract RepeatedField GetResponsePostList(TResponse response); + protected abstract int GetResponseErrorCode(TResponse response); + public abstract TbClient.Page GetResponsePage(TResponse response); protected abstract Task> RequestsFactory(Page page); public abstract IList GetValidPosts(TResponse response, CrawlRequestFlag flag); - protected BaseCrawler(ClientRequester requester) => Requester = requester; - - public TbClient.Page? GetPageFromResponse(TResponse res) => - (TbClient.Page?)ResponsePageProp.GetValue(ResponseDataProp.GetValue(res) as IMessage); - public async Task CrawlSinglePage(Page page) => await Task.WhenAll((await RequestsFactory(page)) .Select(async i => new Response(await i.Response, i.Flag))); protected void ValidateOtherErrorCode(TResponse response) { - if ((ResponseErrorProp.GetValue(response) as Error)?.Errorno != 0) + if (GetResponseErrorCode(response) != 0) throw new TiebaException("Error from tieba client.") {Data = {{"raw", response}}}; } - protected IList EnsureNonEmptyPostList(TResponse response, string exceptionMessage) => - ResponsePostListProp.GetValue(ResponseDataProp.GetValue(response)) is IList posts - && posts.Any() ? posts : throw new EmptyPostListException(exceptionMessage); + protected IList EnsureNonEmptyPostList(TResponse response, string exceptionMessage) + { + var posts = GetResponsePostList(response); + return posts.Any() ? posts : throw new EmptyPostListException(exceptionMessage); + } } } diff --git a/crawler/src/Tieba/Crawl/Crawler/ReplyCrawler.cs b/crawler/src/Tieba/Crawl/Crawler/ReplyCrawler.cs index 287b3476..295484b8 100644 --- a/crawler/src/Tieba/Crawl/Crawler/ReplyCrawler.cs +++ b/crawler/src/Tieba/Crawl/Crawler/ReplyCrawler.cs @@ -2,13 +2,6 @@ namespace tbm.Crawler.Tieba.Crawl.Crawler { public class ReplyCrawler : BaseCrawler { - protected override PropertyInfo ParamDataProp => typeof(ReplyRequest).GetProperty(nameof(ReplyRequest.Data))!; - protected override PropertyInfo ParamCommonProp => ParamDataProp.PropertyType.GetProperty(nameof(ReplyRequest.Data.Common))!; - protected override PropertyInfo ResponseDataProp => typeof(ReplyResponse).GetProperty(nameof(ReplyResponse.Data))!; - protected override PropertyInfo ResponsePostListProp => ResponseDataProp.PropertyType.GetProperty(nameof(ReplyResponse.Data.PostList))!; - protected override PropertyInfo ResponsePageProp => ResponseDataProp.PropertyType.GetProperty(nameof(ReplyResponse.Data.Page))!; - protected override PropertyInfo ResponseErrorProp => typeof(ReplyResponse).GetProperty(nameof(ReplyResponse.Error))!; - private readonly Fid _fid; private readonly Tid _tid; @@ -26,6 +19,10 @@ public override Exception FillExceptionData(Exception e) return e; } + protected override RepeatedField GetResponsePostList(ReplyResponse response) => response.Data.PostList; + protected override int GetResponseErrorCode(ReplyResponse response) => response.Error.Errorno; + public override TbClient.Page GetResponsePage(ReplyResponse response) => response.Data.Page; + protected override async Task> RequestsFactory(Page page) { const string url = "c/f/pb/page?cmd=302001"; @@ -37,8 +34,10 @@ protected override async Task> RequestsFactory(Page page) Rn = 30, QType = 2 }; - var response = await Requester.RequestProtoBuf(url, clientVersion, ParamDataProp, ParamCommonProp, - () => new ReplyResponse(), new ReplyRequest {Data = data}); + var response = await Requester.RequestProtoBuf(url, clientVersion, + new ReplyRequest {Data = data}, + (req, common) => req.Data.Common = common, + () => new ReplyResponse()); var ret = new List(2) {new(Task.FromResult(response), page)}; // as of client version 12.12.1.0 (not including), folded replies won't be include in response: // https://github.com/n0099/TiebaMonitor/commit/b8e7d2645e456271f52457f56500aaedaf28a010#diff-cf67f7f9e82d44aa5be8f85cd24946e5bb7829ca7940c9d056bb1e3849b8f981R32 @@ -47,8 +46,12 @@ protected override async Task> RequestsFactory(Page page) { var dataShowOnlyFolded = data.Clone(); dataShowOnlyFolded.IsFoldCommentReq = 1; - ret.Add(new(Requester.RequestProtoBuf(url, clientVersion, ParamDataProp, ParamCommonProp, - () => new ReplyResponse(), new ReplyRequest {Data = dataShowOnlyFolded}), page, CrawlRequestFlag.ReplyShowOnlyFolded)); + ret.Add(new( + Requester.RequestProtoBuf(url, clientVersion, + new ReplyRequest {Data = dataShowOnlyFolded}, + (req, common) => req.Data.Common = common, + () => new ReplyResponse()), + page, CrawlRequestFlag.ReplyShowOnlyFolded)); } return ret; } diff --git a/crawler/src/Tieba/Crawl/Crawler/SubReplyCrawler.cs b/crawler/src/Tieba/Crawl/Crawler/SubReplyCrawler.cs index 49919235..67fdac16 100644 --- a/crawler/src/Tieba/Crawl/Crawler/SubReplyCrawler.cs +++ b/crawler/src/Tieba/Crawl/Crawler/SubReplyCrawler.cs @@ -2,13 +2,6 @@ namespace tbm.Crawler.Tieba.Crawl.Crawler { public class SubReplyCrawler : BaseCrawler { - protected override PropertyInfo ParamDataProp => typeof(SubReplyRequest).GetProperty(nameof(SubReplyRequest.Data))!; - protected override PropertyInfo ParamCommonProp => ParamDataProp.PropertyType.GetProperty(nameof(SubReplyRequest.Data.Common))!; - protected override PropertyInfo ResponseDataProp => typeof(SubReplyResponse).GetProperty(nameof(SubReplyResponse.Data))!; - protected override PropertyInfo ResponsePostListProp => ResponseDataProp.PropertyType.GetProperty(nameof(SubReplyResponse.Data.SubpostList))!; - protected override PropertyInfo ResponsePageProp => ResponseDataProp.PropertyType.GetProperty(nameof(SubReplyResponse.Data.Page))!; - protected override PropertyInfo ResponseErrorProp => typeof(SubReplyResponse).GetProperty(nameof(SubReplyResponse.Error))!; - private readonly Tid _tid; private readonly Pid _pid; @@ -27,11 +20,15 @@ public override Exception FillExceptionData(Exception e) return e; } + protected override RepeatedField GetResponsePostList(SubReplyResponse response) => response.Data.SubpostList; + protected override int GetResponseErrorCode(SubReplyResponse response) => response.Error.Errorno; + public override TbClient.Page GetResponsePage(SubReplyResponse response) => response.Data.Page; + protected override Task> RequestsFactory(Page page) => Task.FromResult(new[] { new Request(Requester.RequestProtoBuf("c/f/pb/floor?cmd=302002", "12.26.1.0", - ParamDataProp, ParamCommonProp, () => new SubReplyResponse(), new SubReplyRequest + new SubReplyRequest { Data = new() { @@ -39,7 +36,9 @@ public override Exception FillExceptionData(Exception e) Pid = (long)_pid, Pn = (int)page } - }), page) + }, + (req, common) => req.Data.Common = common, + () => new SubReplyResponse()), page) }.AsEnumerable()); public override IList GetValidPosts(SubReplyResponse response, CrawlRequestFlag flag) diff --git a/crawler/src/Tieba/Crawl/Crawler/ThreadArchiveCrawler.cs b/crawler/src/Tieba/Crawl/Crawler/ThreadArchiveCrawler.cs index f9bebd28..27d5184b 100644 --- a/crawler/src/Tieba/Crawl/Crawler/ThreadArchiveCrawler.cs +++ b/crawler/src/Tieba/Crawl/Crawler/ThreadArchiveCrawler.cs @@ -8,8 +8,10 @@ public class ThreadArchiveCrawler : ThreadCrawler protected override async Task> RequestsFactory(Page page) { - var response = await Requester.RequestProtoBuf(EndPointUrl, "6.0.2", ParamDataProp, ParamCommonProp, - () => new ThreadResponse(), new ThreadRequest {Data = GetRequestDataForClientVersion602(page)}); + var response = await Requester.RequestProtoBuf(EndPointUrl, "6.0.2", + new ThreadRequest {Data = GetRequestDataForClientVersion602(page)}, + (req, common) => req.Data.Common = common, + () => new ThreadResponse()); return new[] { // passing CrawlRequestFlag.ThreadClientVersion602 in the second one in order to invokes ThreadParser.ShouldSkipParse() new Request(Task.FromResult(response), page), diff --git a/crawler/src/Tieba/Crawl/Crawler/ThreadCrawler.cs b/crawler/src/Tieba/Crawl/Crawler/ThreadCrawler.cs index 9864827d..10601f5b 100644 --- a/crawler/src/Tieba/Crawl/Crawler/ThreadCrawler.cs +++ b/crawler/src/Tieba/Crawl/Crawler/ThreadCrawler.cs @@ -2,13 +2,6 @@ namespace tbm.Crawler.Tieba.Crawl.Crawler { public class ThreadCrawler : BaseCrawler { - protected override PropertyInfo ParamDataProp => typeof(ThreadRequest).GetProperty(nameof(ThreadRequest.Data))!; - protected override PropertyInfo ParamCommonProp => ParamDataProp.PropertyType.GetProperty(nameof(ThreadRequest.Data.Common))!; - protected override PropertyInfo ResponseDataProp => typeof(ThreadResponse).GetProperty(nameof(ThreadResponse.Data))!; - protected override PropertyInfo ResponsePostListProp => ResponseDataProp.PropertyType.GetProperty(nameof(ThreadResponse.Data.ThreadList))!; - protected override PropertyInfo ResponsePageProp => ResponseDataProp.PropertyType.GetProperty(nameof(ThreadResponse.Data.Page))!; - protected override PropertyInfo ResponseErrorProp => typeof(ThreadResponse).GetProperty(nameof(ThreadResponse.Error))!; - private readonly string _forumName; public delegate ThreadCrawler New(string forumName); @@ -21,6 +14,10 @@ public override Exception FillExceptionData(Exception e) return e; } + protected override RepeatedField GetResponsePostList(ThreadResponse response) => response.Data.ThreadList; + protected override int GetResponseErrorCode(ThreadResponse response) => response.Error.Errorno; + public override TbClient.Page GetResponsePage(ThreadResponse response) => response.Data.Page; + protected const string EndPointUrl = "c/f/frs/page?cmd=301001"; protected ThreadRequest.Types.Data GetRequestDataForClientVersion602(Page page) => @@ -45,10 +42,14 @@ protected override Task> RequestsFactory(Page page) }; return Task.FromResult(new[] { - new Request(Requester.RequestProtoBuf(EndPointUrl, "12.26.1.0", ParamDataProp, ParamCommonProp, () => new ThreadResponse(), - new ThreadRequest {Data = data}), page), - new Request(Requester.RequestProtoBuf(EndPointUrl, "6.0.2", ParamDataProp, ParamCommonProp, () => new ThreadResponse(), - new ThreadRequest {Data = data602}), page, CrawlRequestFlag.ThreadClientVersion602), + new Request(Requester.RequestProtoBuf(EndPointUrl, "12.26.1.0", + new ThreadRequest {Data = data}, + (req, common) => req.Data.Common = common, + () => new ThreadResponse()), page), + new Request(Requester.RequestProtoBuf(EndPointUrl, "6.0.2", + new ThreadRequest {Data = data602}, + (req, common) => req.Data.Common = common, + () => new ThreadResponse()), page, CrawlRequestFlag.ThreadClientVersion602), new Request(RequestJsonForFirstPid(page), page, CrawlRequestFlag.ThreadClientVersion8888) }.AsEnumerable()); } diff --git a/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs b/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs index 1c10a634..f08ea2a8 100644 --- a/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs +++ b/crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs @@ -91,7 +91,7 @@ public void Dispose() startPageResponse.ForEach(ValidateThenParse); var maxPage = startPageResponse - .Select(i => _crawler.GetPageFromResponse(i.Result)) + .Select(i => _crawler.GetResponsePage(i.Result)) .Max(i => (Page?)i?.TotalPage); endPage = Math.Min(endPage, maxPage ?? Page.MaxValue); }, startPage, 0);