Skip to content

Commit

Permalink
feat(conditions): Introduce that allows filter results expected based…
Browse files Browse the repository at this point in the history
… in some operations. Close #346
  • Loading branch information
obetomuniz committed Mar 24, 2023
1 parent 2063478 commit d833200
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 29 deletions.
29 changes: 25 additions & 4 deletions lib/types.ts
@@ -1,25 +1,46 @@
import { AxiosRequestConfig } from "axios"
import { LaunchOptions } from "puppeteer"

export type EngineTypes = "html" | "spa" | "json" | "xml"

/**
 * Enum form of the supported engine identifiers. Each member's value is the
 * same lowercase string that appears in the `EngineTypes` literal union.
 */
export enum EngineType {
Html = "html",
Spa = "spa",
Json = "json",
Xml = "xml",
}

export type EngineTypes = "html" | "spa" | "json" | "xml"

/**
 * Enum form of the condition operation names. Each value matches a key of the
 * operator table defined in lib/utils/tools/condition.ts.
 */
export enum OperationType {
// Passes when the extracted value is not equal to the condition value.
Difference = "difference",
// Passes when the extracted value is equal to the condition value.
Equal = "equal",
// Passes when the extracted value includes the condition value as a substring.
Contains = "contains",
// Passes when the condition value, compiled as a RegExp, matches the extracted value.
Regex = "regex",
}

/** String-literal form of the operation names; same values as the `OperationType` enum. */
export type OperationTypes = "difference" | "equal" | "contains" | "regex"

/**
 * Signature of one condition operator. `applyConditions` calls it with the
 * extracted value as `a`, the condition's `value` as `b`, and the condition's
 * `sensitive` flag (false when omitted); it returns whether the condition passes.
 */
export type OperatorFn = (a: string, b: string, sensitive: boolean) => boolean

/** Lookup table from an operation name to the function that implements it. */
export type Operators = {
[key: string]: OperatorFn
}

/** Extraction result: one entry per selector key; the value shape depends on the extractor. */
export type TScrapedData = Record<string, any>

/** Promise that resolves to a `TScrapedData` result. */
export type TScrapedDataPromise = Promise<TScrapedData>

export interface ISelectorWithAttribute {
/**
 * A single filter rule evaluated against an extracted value.
 *
 * The `operation` field accepts either an `OperationType` enum member or the
 * equivalent string literal. (It was previously typed with `EngineType`, which
 * rejected `OperationType.Equal` and accepted unrelated engine names.)
 */
export interface Condition {
  /** Operand the extracted value is compared with; used as the pattern source for "regex". */
  value: string
  /** Name of the operator to apply. */
  operation: OperationType | OperationTypes
  /** When true the comparison is case-sensitive; treated as false when omitted. */
  sensitive?: boolean
}
/** Describes what to extract for one output key and, optionally, how to filter it. */
export interface ISelector {
// Query handed to the extractor: passed to `document.querySelectorAll` by the
// HTML extractor and used as a lodash `get` path by the JSON extractor.
selector: string
// HTML only: when set, the named attribute is read instead of the element's text content.
attribute?: string
// Optional filter rules; an extracted value is kept when `applyConditions` returns true for it.
conditions?: Condition[]
}

export type TSelectors = Record<string, ISelectorWithAttribute>
export type TSelectors = Record<string, ISelector>

export interface IScrapeHtmlOptions {
selectors: TSelectors
Expand Down
36 changes: 21 additions & 15 deletions lib/utils/extract/html.ts
@@ -1,29 +1,35 @@
import { TScrapedData, TSelectors } from "../../types"
import { applyConditions } from "../tools/condition"

/**
 * Reads the value described by `selector` from a single DOM element.
 *
 * @param element  Element to read from.
 * @param selector Selector definition; `attribute` picks an attribute instead
 *                 of the trimmed text content, `conditions` filters the result.
 * @returns The extracted string ("" when the attribute/text is missing or
 *          empty), or `null` when conditions are present and none of them pass.
 */
const getSelectorValue = (element: Element, selector: any): string | null => {
  let extracted: string
  if (selector.attribute) {
    extracted = element.getAttribute(selector.attribute) || ""
  } else {
    extracted = element.textContent?.trim() || ""
  }

  // Conditions are optional; only a present-and-failing set rejects the value.
  const rejected =
    selector.conditions && !applyConditions(extracted, selector.conditions)

  return rejected ? null : extracted
}

const extractData = (
document: Document,
selectors: TSelectors
): TScrapedData => {
const data: TScrapedData = {}

for (const [key, value] of Object.entries(selectors)) {
const elements = document.querySelectorAll(value.selector)
for (const [key, selector] of Object.entries(selectors)) {
const elements = document.querySelectorAll(selector.selector)

if (!elements.length) {
data[key] = ""
} else if (elements.length === 1) {
if (value?.attribute) {
data[key] = elements[0].getAttribute(value?.attribute) || ""
} else {
data[key] = elements[0].textContent?.trim() || ""
}
} else if (elements.length > 1) {
const values = Array.from(elements, (element) => {
if (value?.attribute) {
return element.getAttribute(value?.attribute) || ""
}
return element.textContent?.trim() || ""
})
data[key] = values
data[key] = getSelectorValue(elements[0], selector) || ""
} else {
data[key] = Array.from(elements)
.map((element) => getSelectorValue(element, selector))
.filter((value) => value !== null)
}
}

Expand Down
35 changes: 26 additions & 9 deletions lib/utils/extract/json.ts
@@ -1,23 +1,40 @@
import get from "lodash.get"
import { TScrapedData, TSelectors } from "../../types"
import { applyConditions } from "../tools/condition"

/**
 * Normalises a value looked up from JSON and applies the selector's conditions.
 *
 * Non-string values are serialised with `JSON.stringify` before anything else,
 * so the returned value is the serialised form even when no conditions are set.
 *
 * @param value    Raw value found at the selector path.
 * @param selector Selector definition; only `conditions` is read here.
 * @returns The string (or serialised) value, or `null` when conditions are
 *          present and none of them pass.
 */
const getSelectorValue = (value: any, selector: any): any => {
  const text = typeof value === "string" ? value : JSON.stringify(value)

  if (!selector.conditions) {
    return text
  }

  return applyConditions(text, selector.conditions) ? text : null
}

const extractData = (j: any, selectors: TSelectors): TScrapedData => {
const data: TScrapedData = {}

if (Array.isArray(j)) {
return j.map((obj) => {
const data: TScrapedData = {}
return j
.map((obj) => {
const data: TScrapedData = {}

for (const [key, value] of Object.entries(selectors)) {
data[key] = get(obj, value.selector, "")
}
for (const [key, selector] of Object.entries(selectors)) {
const value = get(obj, selector.selector, "")
data[key] = getSelectorValue(value, selector)
}

return data
})
return data
})
.filter((item) => Object.values(item).some((value) => value !== null))
}

for (const [key, value] of Object.entries(selectors)) {
data[key] = get(j, value.selector, "")
for (const [key, selector] of Object.entries(selectors)) {
const value = get(j, selector.selector, "")
data[key] = getSelectorValue(value, selector)
}

return data
Expand Down
14 changes: 13 additions & 1 deletion lib/utils/extract/xml.ts
@@ -1,5 +1,16 @@
import xpath, { XPathResult } from "xpath-ts"
import { TScrapedData, TSelectors } from "../../types"
import { applyConditions } from "../tools/condition"

/**
 * Filters the collected node values by the selector's conditions and collapses
 * a single surviving value to a bare string.
 *
 * @param values   Text values collected for one selector.
 * @param selector Selector definition; only `conditions` is read here.
 * @returns The lone value when exactly one remains, otherwise the array of
 *          remaining values (which may be empty).
 */
const getSelectorValue = (values: string[], selector: any): any => {
  const kept = selector.conditions
    ? values.filter((value) => applyConditions(value, selector.conditions))
    : values

  if (kept.length === 1) {
    return kept[0]
  }
  return kept
}

const extractData = (
document: Document,
Expand All @@ -21,7 +32,8 @@ const extractData = (
nodeValues.push(node.textContent || "")
node = nodes.iterateNext()
}
data[key] = nodeValues.length === 1 ? nodeValues[0] : nodeValues

data[key] = getSelectorValue(nodeValues, value)
}

return data
Expand Down
21 changes: 21 additions & 0 deletions lib/utils/tools/condition.ts
@@ -0,0 +1,21 @@
import { OperatorFn, Operators } from "../../types"

/**
 * Table of condition operators keyed by operation name.
 *
 * Every operator receives the extracted value as `a`, the condition operand as
 * `b`, and a `sensitive` flag. When `sensitive` is false, string comparisons
 * lower-case both sides and the regex is compiled with the "i" flag.
 */
const operators: Operators = {
  difference: (a: string, b: string, sensitive: boolean) => {
    if (sensitive) {
      return a !== b
    }
    return a.toLowerCase() !== b.toLowerCase()
  },
  equal: (a: string, b: string, sensitive: boolean) => {
    if (sensitive) {
      return a === b
    }
    return a.toLowerCase() === b.toLowerCase()
  },
  contains: (a: string, b: string, sensitive: boolean) => {
    if (sensitive) {
      return a.includes(b)
    }
    return a.toLowerCase().includes(b.toLowerCase())
  },
  regex: (a: string, b: string, sensitive: boolean) => {
    // `b` is the pattern source; an invalid pattern makes the RegExp constructor throw.
    const flags = sensitive ? "" : "i"
    return new RegExp(b, flags).test(a)
  },
}

/**
 * Checks whether `value` satisfies at least one of the given conditions.
 *
 * @param value      Extracted value to test.
 * @param conditions Condition objects (`operation`, `value`, optional
 *                   `sensitive`); an empty list yields false.
 * @returns true as soon as one condition's operator passes. Conditions whose
 *          `operation` is not a key of the operator table count as failing.
 */
export const applyConditions = (value: string, conditions: any[]): boolean => {
  return conditions.some((condition) => {
    // Own-property check: a plain index lookup would also resolve inherited
    // members such as "toString" or "constructor" and treat them as operators.
    const operation: OperatorFn | undefined =
      Object.prototype.hasOwnProperty.call(operators, condition.operation)
        ? operators[condition.operation]
        : undefined

    return operation
      ? operation(value, condition.value, condition.sensitive || false)
      : false
  })
}

0 comments on commit d833200

Please sign in to comment.