Skip to content

Commit f379837

Browse files
authored
Add metascraper lang detector package (#114)
* Fix tests * Add metascraper-audio * Add metascraper-lang-detector * Update * Simplify implementation
1 parent face7e1 commit f379837

File tree

7 files changed

+142
-1
lines changed

7 files changed

+142
-1
lines changed

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,9 @@ The output will be something like:
9595

9696
Here is an example of the metadata that **metascraper** can collect:
9797

98+
- `audio` — eg. *https://cf-media.sndcdn.com/U78RIfDPV6ok.128.mp3*<br/>
99+
An audio URL that best represents the article.
100+
98101
- `author` — eg. *Noah Kulwin*<br/>
99102
A human-readable representation of the author's name.
100103

@@ -103,7 +106,7 @@ Here is an example of the metadata that **metascraper** can collect:
103106

104107
- `description` — eg. *Venture capitalists are raising money at the fastest rate...*<br/>
105108
The publisher's chosen description of the article.
106-
109+
107110
- `audio` — eg. *https://cf-media.sndcdn.com/U78RIfDPV6ok.128.mp3*<br/>
108111
An audio URL that best represents the article.
109112

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
unsafe-perm=true
2+
save-prefix=~
3+
shrinkwrap=false
4+
save=false
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# metascraper-lang-detector
2+
3+
[![npm](https://img.shields.io/npm/v/metascraper-lang-detector.svg?style=flat-square)](https://www.npmjs.com/package/metascraper-lang-detector)
4+
[![Dependency Status](https://david-dm.org/microlinkhq/metascraper.svg?path=packages/metascraper-lang-detector&style=flat-square)](https://david-dm.org/microlinkhq/metascraper?path=packages/metascraper-lang-detector)
5+
6+
> Get lang property from HTML markup based on natural language processor.
7+
8+
## Install
9+
10+
```bash
11+
$ npm install metascraper-lang-detector --save
12+
```
13+
14+
## License
15+
16+
**metascraper-lang-detector** © [microlink.io](https://microlink.io), Released under the [MIT](https://github.com/microlinkhq/metascraper-lang-detector/blob/master/LICENSE.md) License.<br>
17+
Authored and maintained by microlink.io with help from [contributors](https://github.com/microlinkhq/metascraper-lang-detector/contributors).
18+
19+
> [microlink.io](https://microlink.io) · GitHub [@microlink.io](https://github.com/microlinkhq) · Twitter [@microlinkhq](https://twitter.com/microlinkhq)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
'use strict'
2+
3+
const { lang } = require('@metascraper/helpers')
4+
const { reduce, get } = require('lodash')
5+
const langs = require('iso-639-3')
6+
const franc = require('franc')
7+
8+
/**
 * Lookup table keyed by ISO 639-3 code whose values are the matching
 * ISO 639-1 codes, built once at module load from the `iso-639-3` dataset.
 * Entries that carry no `iso6391` value are left out of the table.
 */
const toIso6391 = reduce(
  langs,
  (table, entry) => {
    if (entry.iso6391) table[entry.iso6393] = entry.iso6391
    return table
  },
  {}
)
16+
17+
/**
 * Detect the language of the text stored at `field` inside `collection`.
 *
 * The value is read with lodash `get`, handed to `franc`, and the code
 * `franc` returns is looked up in `toIso6391` before being passed to the
 * `lang` helper, whose result is returned as-is.
 *
 * @param {Object} collection - Object holding the text to inspect.
 * @param {string} field - Path of the text inside `collection`.
 * @returns {*} Whatever the `lang` helper returns for the mapped code.
 */
const detectLang = (collection, field) =>
  lang(toIso6391[franc(get(collection, field))])
22+
23+
module.exports = ({ fields = ['description'] }) =>
24+
reduce(
25+
fields,
26+
(acc, prop) => {
27+
const fn = ({ meta }) => detectLang(meta, prop)
28+
return acc.concat(fn)
29+
},
30+
{ lang: [] }
31+
)
32+
33+
module.exports.detectLang = detectLang
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
{
2+
"name": "metascraper-lang-detector",
3+
"description": "Get lang property from HTML markup based on natural language processor",
4+
"homepage": "https://metascraper.js.org",
5+
"version": "4.2.0",
6+
"main": "index.js",
7+
"author": {
8+
"email": "ian@ianstormtaylor.com",
9+
"name": "Ian Storm Taylor"
10+
},
11+
"repository": {
12+
"type": "git",
13+
"url": "https://github.com/microlinkhq/metascraper/tree/master/packages/metascraper-lang-detector"
14+
},
15+
"bugs": {
16+
"url": "https://github.com/microlinkhq/metascraper/issues"
17+
},
18+
"dependencies": {
19+
"@metascraper/helpers": "^4.2.0",
20+
"franc": "~4.0.0",
21+
"iso-639-3": "~1.1.0"
22+
},
23+
"devDependencies": {
24+
"mocha": "latest",
25+
"nyc": "latest",
26+
"should": "latest",
27+
"standard": "11"
28+
},
29+
"engines": {
30+
"node": ">= 8"
31+
},
32+
"files": [
33+
"index.js"
34+
],
35+
"scripts": {
36+
"test": "NODE_PATH=.. TZ=UTC NODE_ENV=test nyc mocha test"
37+
},
38+
"license": "MIT",
39+
"peerDependencies": {
40+
"metascraper": "^4"
41+
},
42+
"standard": {
43+
"env": [
44+
"mocha"
45+
]
46+
}
47+
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
'use strict'
2+
3+
const should = require('should')
4+
5+
const { detectLang } = require('..')
6+
7+
describe('metascraper-lang', () => {
  it('.detectLang', () => {
    // Each pair is [description text, expected result of detectLang].
    const cases = [
      [
        'A library to easily scrape metadata from an article on the web using Open Graph metadata, regular HTML metadata, and series of fallbacks.',
        'en'
      ],
      [
        'Una libreria para obtener fácilmente metadatos de cualquier artículo de internet usando Open Graph, HTML y una serie de fallbacks.',
        'es'
      ],
      // A missing description is expected to yield `false`.
      [null, false]
    ]

    cases.forEach(([description, expected]) => {
      should(detectLang({ description }, 'description')).be.equal(expected)
    })
  })
})
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
--require should
2+
--reporter spec
3+
--timeout 120000
4+
--slow 300
5+
--bail
6+
--recursive

0 commit comments

Comments
 (0)