Skip to content

Commit

Permalink
- abstract prop (Param|Response)[A-Za-z]Prop to prevent using slow …
Browse files Browse the repository at this point in the history
…reflection to get values from certain props across three derived classes

+ abstract methods `GetResponse(PostList|ErrorCode|Page)`, implemented in derived classes to select values out of protoBuf fields
- public method `GetPageFromResponse` in favor of `GetResponsePage()`, which does not use reflection
@ `BaseCrawler` and its derived classes

- param `param(Data|Common)Prop`
* rename param `param` to `requestParam`
+ param `setCommonParamOnRequest()`
@ `ClientRequester.(Request|Post)ProtoBuf()`

+ `global using Google.Protobuf.Collections` @ GlobalUsings.cs
@ crawler
  • Loading branch information
n0099 committed Jan 1, 2023
1 parent ca9a359 commit 34f9c32
Show file tree
Hide file tree
Showing 10 changed files with 66 additions and 62 deletions.
2 changes: 0 additions & 2 deletions crawler/src/Db/Post/ThreadPost.cs
@@ -1,6 +1,4 @@
// ReSharper disable UnusedAutoPropertyAccessor.Global

using Google.Protobuf.Collections;
using TbClient.Post.Common;

namespace tbm.Crawler.Db.Post
Expand Down
1 change: 1 addition & 0 deletions crawler/src/GlobalUsings.cs
Expand Up @@ -2,6 +2,7 @@
global using Autofac.Extensions.DependencyInjection;
global using Autofac.Features.Indexed;
global using Google.Protobuf;
global using Google.Protobuf.Collections;
global using LinqKit;
global using NLog;
global using NLog.Extensions.Logging;
Expand Down
1 change: 0 additions & 1 deletion crawler/src/Helper.cs
@@ -1,6 +1,5 @@
using System.Text.Encodings.Web;
using System.Text.Unicode;
using Google.Protobuf.Collections;

namespace tbm.Crawler
{
Expand Down
24 changes: 14 additions & 10 deletions crawler/src/Tieba/ClientRequester.cs
Expand Up @@ -24,10 +24,11 @@ public class ClientRequester
return doc.RootElement.Clone();
});

public Task<TResponse> RequestProtoBuf<TRequest, TResponse>
(string url, string clientVersion, PropertyInfo paramDataProp, PropertyInfo paramCommonProp, Func<TResponse> responseFactory, TRequest param)
public Task<TResponse> RequestProtoBuf
<TRequest, TResponse>(string url, string clientVersion, TRequest requestParam,
Action<TRequest, Common> setCommonParamOnRequest, Func<TResponse> responseFactory)
where TRequest : IMessage<TRequest> where TResponse : IMessage<TResponse> =>
Request(() => PostProtoBuf(url, clientVersion, param, paramDataProp, paramCommonProp), stream =>
Request(() => PostProtoBuf(url, clientVersion, requestParam, setCommonParamOnRequest), stream =>
{
try
{
Expand Down Expand Up @@ -63,14 +64,14 @@ private static async Task<T> Request<T>(Func<Task<HttpResponseMessage>> requeste
}
}

private Task<HttpResponseMessage> PostJson(string url, Dictionary<string, string> data, string clientVersion)
private Task<HttpResponseMessage> PostJson(string url, Dictionary<string, string> param, string clientVersion)
{
var postData = new Dictionary<string, string>
{
{"_client_id", $"wappc_{Rand.NextLong(1000000000000, 9999999999999)}_{Rand.Next(100, 999)}"},
{"_client_type", "2"},
{"_client_version", clientVersion}
}.Concat(data).ToList();
}.Concat(param).ToList();
var sign = postData.Aggregate("", (acc, i) =>
{
acc += i.Key + '=' + i.Value;
Expand All @@ -80,15 +81,18 @@ private Task<HttpResponseMessage> PostJson(string url, Dictionary<string, string
postData.Add(KeyValuePair.Create("sign", signMd5));

return Post(() => _http.PostAsync(url, new FormUrlEncodedContent(postData)),
() => _logger.LogTrace("POST {} {}", url, data));
() => _logger.LogTrace("POST {} {}", url, param));
}

private Task<HttpResponseMessage> PostProtoBuf(string url, string clientVersion, IMessage paramProtoBuf, PropertyInfo dataProp, PropertyInfo commonProp)
private Task<HttpResponseMessage> PostProtoBuf
<TRequest>(string url, string clientVersion, TRequest requestParam,
Action<TRequest, Common> setCommonParamOnRequest)
where TRequest : IMessage<TRequest>
{
commonProp.SetValue(dataProp.GetValue(paramProtoBuf), new Common {ClientVersion = clientVersion});
setCommonParamOnRequest(requestParam, new() {ClientVersion = clientVersion});

// https://github.com/dotnet/runtime/issues/22996, http://test.greenbytes.de/tech/tc2231
var protoBufFile = new ByteArrayContent(paramProtoBuf.ToByteArray());
var protoBufFile = new ByteArrayContent(requestParam.ToByteArray());
protoBufFile.Headers.Add("Content-Disposition", "form-data; name=\"data\"; filename=\"file\"");
var content = new MultipartFormDataContent {protoBufFile};
// https://stackoverflow.com/questions/30926645/httpcontent-boundary-double-quotes
Expand All @@ -102,7 +106,7 @@ private Task<HttpResponseMessage> PostProtoBuf(string url, string clientVersion,
request.Headers.Connection.Add("keep-alive");

return Post(() => _http.SendAsync(request),
() => _logger.LogTrace("POST {} {}", url, paramProtoBuf));
() => _logger.LogTrace("POST {} {}", url, requestParam));
}

private Task<HttpResponseMessage> Post(Func<Task<HttpResponseMessage>> responseTaskFactory, Action logTraceCallback)
Expand Down
27 changes: 12 additions & 15 deletions crawler/src/Tieba/Crawl/Crawler/BaseCrawler.cs
Expand Up @@ -5,35 +5,32 @@ public abstract class BaseCrawler<TResponse, TPostProtoBuf>
{
public record Response(TResponse Result, CrawlRequestFlag Flag = CrawlRequestFlag.None);
protected record Request(Task<TResponse> Response, Page Page, CrawlRequestFlag Flag = CrawlRequestFlag.None);

protected ClientRequester Requester { get; }
protected abstract PropertyInfo ParamDataProp { get; }
protected abstract PropertyInfo ParamCommonProp { get; }
protected abstract PropertyInfo ResponseDataProp { get; }
protected abstract PropertyInfo ResponsePostListProp { get; }
protected abstract PropertyInfo ResponsePageProp { get; }
protected abstract PropertyInfo ResponseErrorProp { get; }

protected BaseCrawler(ClientRequester requester) => Requester = requester;

public abstract Exception FillExceptionData(Exception e);
protected abstract RepeatedField<TPostProtoBuf> GetResponsePostList(TResponse response);
protected abstract int GetResponseErrorCode(TResponse response);
public abstract TbClient.Page GetResponsePage(TResponse response);
protected abstract Task<IEnumerable<Request>> RequestsFactory(Page page);
public abstract IList<TPostProtoBuf> GetValidPosts(TResponse response, CrawlRequestFlag flag);

protected BaseCrawler(ClientRequester requester) => Requester = requester;

public TbClient.Page? GetPageFromResponse(TResponse res) =>
(TbClient.Page?)ResponsePageProp.GetValue(ResponseDataProp.GetValue(res) as IMessage);

public async Task<Response[]> CrawlSinglePage(Page page) =>
await Task.WhenAll((await RequestsFactory(page))
.Select(async i => new Response(await i.Response, i.Flag)));

protected void ValidateOtherErrorCode(TResponse response)
{
if ((ResponseErrorProp.GetValue(response) as Error)?.Errorno != 0)
if (GetResponseErrorCode(response) != 0)
throw new TiebaException("Error from tieba client.") {Data = {{"raw", response}}};
}

protected IList<TPostProtoBuf> EnsureNonEmptyPostList(TResponse response, string exceptionMessage) =>
ResponsePostListProp.GetValue(ResponseDataProp.GetValue(response)) is IList<TPostProtoBuf> posts
&& posts.Any() ? posts : throw new EmptyPostListException(exceptionMessage);
protected IList<TPostProtoBuf> EnsureNonEmptyPostList(TResponse response, string exceptionMessage)
{
var posts = GetResponsePostList(response);
return posts.Any() ? posts : throw new EmptyPostListException(exceptionMessage);
}
}
}
25 changes: 14 additions & 11 deletions crawler/src/Tieba/Crawl/Crawler/ReplyCrawler.cs
Expand Up @@ -2,13 +2,6 @@ namespace tbm.Crawler.Tieba.Crawl.Crawler
{
public class ReplyCrawler : BaseCrawler<ReplyResponse, Reply>
{
protected override PropertyInfo ParamDataProp => typeof(ReplyRequest).GetProperty(nameof(ReplyRequest.Data))!;
protected override PropertyInfo ParamCommonProp => ParamDataProp.PropertyType.GetProperty(nameof(ReplyRequest.Data.Common))!;
protected override PropertyInfo ResponseDataProp => typeof(ReplyResponse).GetProperty(nameof(ReplyResponse.Data))!;
protected override PropertyInfo ResponsePostListProp => ResponseDataProp.PropertyType.GetProperty(nameof(ReplyResponse.Data.PostList))!;
protected override PropertyInfo ResponsePageProp => ResponseDataProp.PropertyType.GetProperty(nameof(ReplyResponse.Data.Page))!;
protected override PropertyInfo ResponseErrorProp => typeof(ReplyResponse).GetProperty(nameof(ReplyResponse.Error))!;

private readonly Fid _fid;
private readonly Tid _tid;

Expand All @@ -26,6 +19,10 @@ public override Exception FillExceptionData(Exception e)
return e;
}

protected override RepeatedField<Reply> GetResponsePostList(ReplyResponse response) => response.Data.PostList;
protected override int GetResponseErrorCode(ReplyResponse response) => response.Error.Errorno;
public override TbClient.Page GetResponsePage(ReplyResponse response) => response.Data.Page;

protected override async Task<IEnumerable<Request>> RequestsFactory(Page page)
{
const string url = "c/f/pb/page?cmd=302001";
Expand All @@ -37,8 +34,10 @@ protected override async Task<IEnumerable<Request>> RequestsFactory(Page page)
Rn = 30,
QType = 2
};
var response = await Requester.RequestProtoBuf(url, clientVersion, ParamDataProp, ParamCommonProp,
() => new ReplyResponse(), new ReplyRequest {Data = data});
var response = await Requester.RequestProtoBuf(url, clientVersion,
new ReplyRequest {Data = data},
(req, common) => req.Data.Common = common,
() => new ReplyResponse());
var ret = new List<Request>(2) {new(Task.FromResult(response), page)};
// as of client version 12.12.1.0 (not including), folded replies won't be included in the response:
// https://github.com/n0099/TiebaMonitor/commit/b8e7d2645e456271f52457f56500aaedaf28a010#diff-cf67f7f9e82d44aa5be8f85cd24946e5bb7829ca7940c9d056bb1e3849b8f981R32
Expand All @@ -47,8 +46,12 @@ protected override async Task<IEnumerable<Request>> RequestsFactory(Page page)
{
var dataShowOnlyFolded = data.Clone();
dataShowOnlyFolded.IsFoldCommentReq = 1;
ret.Add(new(Requester.RequestProtoBuf(url, clientVersion, ParamDataProp, ParamCommonProp,
() => new ReplyResponse(), new ReplyRequest {Data = dataShowOnlyFolded}), page, CrawlRequestFlag.ReplyShowOnlyFolded));
ret.Add(new(
Requester.RequestProtoBuf(url, clientVersion,
new ReplyRequest {Data = dataShowOnlyFolded},
(req, common) => req.Data.Common = common,
() => new ReplyResponse()),
page, CrawlRequestFlag.ReplyShowOnlyFolded));
}
return ret;
}
Expand Down
17 changes: 8 additions & 9 deletions crawler/src/Tieba/Crawl/Crawler/SubReplyCrawler.cs
Expand Up @@ -2,13 +2,6 @@ namespace tbm.Crawler.Tieba.Crawl.Crawler
{
public class SubReplyCrawler : BaseCrawler<SubReplyResponse, SubReply>
{
protected override PropertyInfo ParamDataProp => typeof(SubReplyRequest).GetProperty(nameof(SubReplyRequest.Data))!;
protected override PropertyInfo ParamCommonProp => ParamDataProp.PropertyType.GetProperty(nameof(SubReplyRequest.Data.Common))!;
protected override PropertyInfo ResponseDataProp => typeof(SubReplyResponse).GetProperty(nameof(SubReplyResponse.Data))!;
protected override PropertyInfo ResponsePostListProp => ResponseDataProp.PropertyType.GetProperty(nameof(SubReplyResponse.Data.SubpostList))!;
protected override PropertyInfo ResponsePageProp => ResponseDataProp.PropertyType.GetProperty(nameof(SubReplyResponse.Data.Page))!;
protected override PropertyInfo ResponseErrorProp => typeof(SubReplyResponse).GetProperty(nameof(SubReplyResponse.Error))!;

private readonly Tid _tid;
private readonly Pid _pid;

Expand All @@ -27,19 +20,25 @@ public override Exception FillExceptionData(Exception e)
return e;
}

protected override RepeatedField<SubReply> GetResponsePostList(SubReplyResponse response) => response.Data.SubpostList;
protected override int GetResponseErrorCode(SubReplyResponse response) => response.Error.Errorno;
public override TbClient.Page GetResponsePage(SubReplyResponse response) => response.Data.Page;

protected override Task<IEnumerable<Request>> RequestsFactory(Page page) =>
Task.FromResult(new[]
{
new Request(Requester.RequestProtoBuf("c/f/pb/floor?cmd=302002", "12.26.1.0",
ParamDataProp, ParamCommonProp, () => new SubReplyResponse(), new SubReplyRequest
new SubReplyRequest
{
Data = new()
{
Kz = (long)_tid,
Pid = (long)_pid,
Pn = (int)page
}
}), page)
},
(req, common) => req.Data.Common = common,
() => new SubReplyResponse()), page)
}.AsEnumerable());

public override IList<SubReply> GetValidPosts(SubReplyResponse response, CrawlRequestFlag flag)
Expand Down
6 changes: 4 additions & 2 deletions crawler/src/Tieba/Crawl/Crawler/ThreadArchiveCrawler.cs
Expand Up @@ -8,8 +8,10 @@ public class ThreadArchiveCrawler : ThreadCrawler

protected override async Task<IEnumerable<Request>> RequestsFactory(Page page)
{
var response = await Requester.RequestProtoBuf(EndPointUrl, "6.0.2", ParamDataProp, ParamCommonProp,
() => new ThreadResponse(), new ThreadRequest {Data = GetRequestDataForClientVersion602(page)});
var response = await Requester.RequestProtoBuf(EndPointUrl, "6.0.2",
new ThreadRequest {Data = GetRequestDataForClientVersion602(page)},
(req, common) => req.Data.Common = common,
() => new ThreadResponse());
return new[]
{ // passing CrawlRequestFlag.ThreadClientVersion602 in the second one in order to invoke ThreadParser.ShouldSkipParse()
new Request(Task.FromResult(response), page),
Expand Down
23 changes: 12 additions & 11 deletions crawler/src/Tieba/Crawl/Crawler/ThreadCrawler.cs
Expand Up @@ -2,13 +2,6 @@ namespace tbm.Crawler.Tieba.Crawl.Crawler
{
public class ThreadCrawler : BaseCrawler<ThreadResponse, Thread>
{
protected override PropertyInfo ParamDataProp => typeof(ThreadRequest).GetProperty(nameof(ThreadRequest.Data))!;
protected override PropertyInfo ParamCommonProp => ParamDataProp.PropertyType.GetProperty(nameof(ThreadRequest.Data.Common))!;
protected override PropertyInfo ResponseDataProp => typeof(ThreadResponse).GetProperty(nameof(ThreadResponse.Data))!;
protected override PropertyInfo ResponsePostListProp => ResponseDataProp.PropertyType.GetProperty(nameof(ThreadResponse.Data.ThreadList))!;
protected override PropertyInfo ResponsePageProp => ResponseDataProp.PropertyType.GetProperty(nameof(ThreadResponse.Data.Page))!;
protected override PropertyInfo ResponseErrorProp => typeof(ThreadResponse).GetProperty(nameof(ThreadResponse.Error))!;

private readonly string _forumName;

public delegate ThreadCrawler New(string forumName);
Expand All @@ -21,6 +14,10 @@ public override Exception FillExceptionData(Exception e)
return e;
}

protected override RepeatedField<Thread> GetResponsePostList(ThreadResponse response) => response.Data.ThreadList;
protected override int GetResponseErrorCode(ThreadResponse response) => response.Error.Errorno;
public override TbClient.Page GetResponsePage(ThreadResponse response) => response.Data.Page;

protected const string EndPointUrl = "c/f/frs/page?cmd=301001";

protected ThreadRequest.Types.Data GetRequestDataForClientVersion602(Page page) =>
Expand All @@ -45,10 +42,14 @@ protected override Task<IEnumerable<Request>> RequestsFactory(Page page)
};
return Task.FromResult(new[]
{
new Request(Requester.RequestProtoBuf(EndPointUrl, "12.26.1.0", ParamDataProp, ParamCommonProp, () => new ThreadResponse(),
new ThreadRequest {Data = data}), page),
new Request(Requester.RequestProtoBuf(EndPointUrl, "6.0.2", ParamDataProp, ParamCommonProp, () => new ThreadResponse(),
new ThreadRequest {Data = data602}), page, CrawlRequestFlag.ThreadClientVersion602),
new Request(Requester.RequestProtoBuf(EndPointUrl, "12.26.1.0",
new ThreadRequest {Data = data},
(req, common) => req.Data.Common = common,
() => new ThreadResponse()), page),
new Request(Requester.RequestProtoBuf(EndPointUrl, "6.0.2",
new ThreadRequest {Data = data602},
(req, common) => req.Data.Common = common,
() => new ThreadResponse()), page, CrawlRequestFlag.ThreadClientVersion602),
new Request(RequestJsonForFirstPid(page), page, CrawlRequestFlag.ThreadClientVersion8888)
}.AsEnumerable());
}
Expand Down
2 changes: 1 addition & 1 deletion crawler/src/Tieba/Crawl/Facade/BaseCrawlFacade.cs
Expand Up @@ -91,7 +91,7 @@ public void Dispose()
startPageResponse.ForEach(ValidateThenParse);
var maxPage = startPageResponse
.Select(i => _crawler.GetPageFromResponse(i.Result))
.Select(i => _crawler.GetResponsePage(i.Result))
.Max(i => (Page?)i?.TotalPage);
endPage = Math.Min(endPage, maxPage ?? Page.MaxValue);
}, startPage, 0);
Expand Down

0 comments on commit 34f9c32

Please sign in to comment.