In [1]:
texts = [
    "ру́сский язы́к", # Russian
    "中文",           # Chinese
    "にほんご",      # Japanese
    "العَرَبِيَّة"  # Arabic
]

## Testing packages

### Langdetect library

In [2]:
from langdetect import detect

In [3]:
for text in texts:
    print(detect(text))

ru
zh-cn
ja
ar


### Pycld2 library

In [4]:
import pycld2 as cld2

In [5]:
for text in texts:
    isReliable, textBytesFound, details = cld2.detect(text)
    print(details)

(('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
(('ChineseT', 'zh-Hant', 87, 1755.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
(('Japanese', 'ja', 92, 3859.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))
(('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0), ('Unknown', 'un', 0, 0.0))


In [6]:
def lang_detected(text):
    isReliable, textBytesFound, details = cld2.detect(text)
    return(details[0])

In [7]:
print(lang_detected("にほんご"))

('Japanese', 'ja', 92, 3859.0)


## Testing NER

### Part 1

[Xlm roberta ner Japanese](https://huggingface.co/tsmatz/xlm-roberta-ner-japanese)

In [8]:
from transformers import pipeline

In [9]:
from pprint import pprint

In [10]:
text = "鈴井は4月の陽気の良い日に、鈴をつけて北海道のトムラウシへと登った"

In [11]:
model_name = "tsmatz/xlm-roberta-ner-japanese"
classifier = pipeline("token-classification", model=model_name)
result = classifier(text)



In [12]:
pprint(result)

[{'end': 1,
  'entity': 'PER',
  'index': 1,
  'score': 0.9992749,
  'start': 0,
  'word': '▁'},
 {'end': 1,
  'entity': 'PER',
  'index': 2,
  'score': 0.9993655,
  'start': 0,
  'word': '鈴'},
 {'end': 2,
  'entity': 'PER',
  'index': 3,
  'score': 0.9992323,
  'start': 1,
  'word': '井'},
 {'end': 22,
  'entity': 'LOC',
  'index': 14,
  'score': 0.99775296,
  'start': 19,
  'word': '北海道'},
 {'end': 24,
  'entity': 'LOC',
  'index': 16,
  'score': 0.9978789,
  'start': 23,
  'word': 'ト'},
 {'end': 25,
  'entity': 'LOC',
  'index': 17,
  'score': 0.9983669,
  'start': 24,
  'word': 'ム'},
 {'end': 26,
  'entity': 'LOC',
  'index': 18,
  'score': 0.9983725,
  'start': 25,
  'word': 'ラ'},
 {'end': 27,
  'entity': 'LOC',
  'index': 19,
  'score': 0.9978707,
  'start': 26,
  'word': 'ウ'},
 {'end': 28,
  'entity': 'LOC',
  'index': 20,
  'score': 0.9982278,
  'start': 27,
  'word': 'シ'}]


### Part 2

In [13]:
ner_pipeline = pipeline(
    model="llm-book/bert-base-japanese-v3-ner-wikipedia-dataset",
    aggregation_strategy="simple",
)

In [14]:
pprint(ner_pipeline(text))

[{'end': None,
  'entity_group': '人名',
  'score': 0.9969324,
  'start': None,
  'word': '鈴井'},
 {'end': None,
  'entity_group': '地名',
  'score': 0.99784124,
  'start': None,
  'word': '北海道'},
 {'end': None,
  'entity_group': '地名',
  'score': 0.99032897,
  'start': None,
  'word': 'トムラウシ'}]


### Part 3

In [15]:
import nagisa

In [16]:
text = 'Pythonで簡単に使えるツールです'
words = nagisa.tagging(text)
print(words)
#=> Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞

Python/名詞 で/助詞 簡単/形状詞 に/助動詞 使える/動詞 ツール/名詞 です/助動詞


In [17]:
# Get a list of words
print(words.words)
#=> ['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']

['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です']


In [18]:
# Get a list of POS-tags
print(words.postags)

['名詞', '助詞', '形状詞', '助動詞', '動詞', '名詞', '助動詞']


In [19]:
def ner_jp_nagisa(text):
    words = nagisa.tagging(text)
    return(words.words, words.postags)

In [20]:
print(ner_jp_nagisa('Pythonで簡単に使えるツールです'))

(['Python', 'で', '簡単', 'に', '使える', 'ツール', 'です'], ['名詞', '助詞', '形状詞', '助動詞', '動詞', '名詞', '助動詞'])


## Loading data

### Example 1

In [21]:
path = '/home/luis-carlos/Documents/experiments/nlp/data/example_zeals.csv'

In [22]:
import pandas as pd

In [23]:
data = pd.read_csv(path)
data.head(10)

Unnamed: 0,prev_message
0,男性にも使えますか？
1,なるほど、男性におすすめのアイテム教えてください
2,乾燥に効きそうなやつ
3,えっ？kinuiって保湿クリームあるの？
4,1枚目
5,1枚目
6,おすすめの使用方法はありますか？
7,どうやって購入するんですか？
8,ありがとう
9,俺の肌状態は？


In [24]:
for index, row in data.iterrows():
    text = row['prev_message']
    pprint(text)

    # Inferring language
    isReliable, textBytesFound, details = cld2.detect(text)
    pprint(details[0][0])

    #pprint(ner_pipeline(text))
    #result = classifier(text)
    #pprint(result)

    # Starting NER
    words = nagisa.tagging(text)
    print(words.words)
    print(words.postags)

    print('--------------------------------\n')


'男性にも使えますか？'
'Japanese'
['男性', 'に', 'も', '使え', 'ます', 'か', '?']
['名詞', '助詞', '助詞', '動詞', '助動詞', '助詞', '補助記号']
--------------------------------

'なるほど、男性におすすめのアイテム教えてください'
'Japanese'
['なるほど', '、', '男性', 'に', 'お', 'すすめ', 'の', 'アイテム', '教え', 'て', 'ください']
['副詞', '補助記号', '名詞', '助詞', '接頭辞', '動詞', '助詞', '名詞', '動詞', '助詞', '動詞']
--------------------------------

'乾燥に効きそうなやつ'
'Japanese'
['乾燥', 'に', '効き', 'そう', 'な', 'やつ']
['名詞', '助詞', '動詞', '形状詞', '助動詞', '名詞']
--------------------------------

'えっ？kinuiって保湿クリームあるの？'
'Japanese'
['えっ', '?', 'kinui', 'って', '保湿', 'クリーム', 'ある', 'の', '?']
['感動詞', '補助記号', '名詞', '助詞', '名詞', '名詞', '動詞', '助詞', '補助記号']
--------------------------------

'1枚目'
'Unknown'
['1', '枚', '目']
['名詞', '接尾辞', '接尾辞']
--------------------------------

'1枚目'
'Unknown'
['1', '枚', '目']
['名詞', '接尾辞', '接尾辞']
--------------------------------

'おすすめの使用方法はありますか？'
'Japanese'
['お', 'すすめ', 'の', '使用', '方法', 'は', 'あり', 'ます', 'か', '?']
['接頭辞', '動詞', '助詞', '名詞', '名詞', '助詞', '動詞', '助動詞', '助詞', '補助記号']
---

### English language (Pt 1)

In [25]:
import spacy

In [26]:
#python -m spacy download en_core_web_lg
#python -m spacy download en_core_web_sm

In [27]:
nlp = spacy.load("en_core_web_lg")

In [28]:
text = "Tesla Inc is going to acquire Twitter for $45 billion"
doc = nlp(text)

entities = doc.ents
#for ent in entities:
#    print(ent)

#for ent in entities:
#    print(ent.text)

for ent in entities:
    print(ent.text, '|', ent.label_)

Tesla Inc | ORG
Twitter | ORG
$45 billion | MONEY


In [29]:
def ner_en_spacy(text):

    doc = nlp(text)
    entities = doc.ents

    all_entities = []

    for ent in entities:
        all_entities.append([ent.text, ent.label_])
    return(all_entities)

### Pt 2

In [30]:
from flair.data import Sentence
from flair.nn import Classifier

In [31]:
# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger_ner = Classifier.load('ner')

# run NER over sentence
tagger_ner.predict(sentence)

# print the sentence with all annotations
print(sentence)

2025-01-12 00:00:12,089 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Sentence[4]: "I love Berlin ." → ["Berlin"/LOC]


In [32]:
sentence.get_label().labeled_identifier

'Span[2:3]: "Berlin"/LOC'

In [33]:
# Print the entities
all_entities = []
print("Extracted Entities:")
for entity in sentence.get_spans('ner'):
    print(f"Entity: {entity.text}, Type: {entity.tag}, Confidence: {entity.score:.2f}")
    all_entities.append([entity.text, entity.tag, entity.score])

Extracted Entities:
Entity: Berlin, Type: LOC, Confidence: 1.00


In [34]:
def ner_en(text):
    # make a sentence
    sentence = Sentence(text)

    # load the NER tagger
    tagger_ner = Classifier.load('ner')

    # run NER over sentence
    tagger_ner.predict(sentence)

    all_entities = []
    for entity in sentence.get_spans('ner'):
        all_entities.append([entity.text, entity.tag, entity.score])

    return(all_entities)

In [35]:
# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger_ner = Classifier.load('ner-fast')

# run NER over sentence
tagger_ner.predict(sentence)

# print the sentence with all annotations
print(sentence)

2025-01-12 00:00:15,260 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Sentence[4]: "I love Berlin ." → ["Berlin"/LOC]


In [36]:
# Print the entities
print("Extracted Entities:")
for entity in sentence.get_spans('ner'):
    print(f"Entity: {entity.text}, Type: {entity.tag}, Confidence: {entity.score:.2f}")

Extracted Entities:
Entity: Berlin, Type: LOC, Confidence: 1.00


In [37]:
# make a sentence
sentence = Sentence('I love Berlin .')

# load the NER tagger
tagger = Classifier.load('sentiment')

# run NER over sentence
tagger.predict(sentence)

# print the sentence with all annotations
print(sentence)

Sentence[4]: "I love Berlin ." → POSITIVE (0.9983)


### Example 2

In [38]:
#path = '/home/luis-carlos/Documents/experiments/nlp/data/synthetic_reviews_laptops.csv'
path = '/home/luis-carlos/Documents/experiments/nlp/data/synthetic_reviews_laptops_v2.csv'

data = pd.read_csv(path)
data.head()

Unnamed: 0,question,language
0,What is the battery life of this laptop?,en
1,Does this laptop come with pre-installed softw...,en
2,"Is the RAM upgradeable? If yes, what is the ma...",en
3,What type of processor does this laptop have?,en
4,Does this laptop have a dedicated graphics card?,en


In [39]:
all_info_lt = []

for index, row in data.iterrows():
    text = row['question']
#    pprint(text)

    # Inferring language
    isReliable, textBytesFound, details = cld2.detect(text)

    detected_language = details[0][0]
#    pprint(detected_language)

    if detected_language == 'Japanese':
        # Starting NER
        words = nagisa.tagging(text)
#        print(words.words)
#        print(words.postags)
        all_info_lt.append([text, words.words, words.postags])
    elif detected_language == 'ENGLISH':
        doc = nlp(text)
        entities = doc.ents
#        for ent in entities:
#            print(ent.text, '|', ent.label_)

        all_info_lt.append([text, entities])

In [40]:
all_info_lt[30]

['ノートパソコンの重量はどれくらいですか？持ち運びに便利ですか？',
 ['ノート',
  'パソコン',
  'の',
  '重量',
  'は',
  'どれ',
  'くらい',
  'です',
  'か',
  '?',
  '持ち運び',
  'に',
  '便利',
  'です',
  'か',
  '?'],
 ['名詞',
  '名詞',
  '助詞',
  '名詞',
  '助詞',
  '代名詞',
  '助詞',
  '助動詞',
  '助詞',
  '補助記号',
  '名詞',
  '助詞',
  '名詞',
  '助動詞',
  '助詞',
  '補助記号']]

## Final functions

In [None]:
# Language
# lang_detected(text)

# NER (JP)
# ner_jp_nagisa(text)

# NER (EN)
# ner_en(text)
# ner_en_spacy(text)

In [42]:
all_info_lt = []

for index, row in data.iterrows():
    text = row['question']

    detected_language = lang_detected(text)[0]

    if detected_language == 'Japanese':        
        all_info_lt.append([{'text': text, 'language': 'japanese', 'ner': ner_jp_nagisa(text)}])
    elif detected_language == 'ENGLISH':
        all_info_lt.append([{'text': text, 'language': 'english', 'ner': ner_en(text), 'ner_spacy': ner_en_spacy(text)}])

2025-01-12 00:01:36,137 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
2025-01-12 00:01:39,040 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
2025-01-12 00:01:42,328 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
2025-01-12 00:01:45,609 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
2025-01-12 00:01:48,571 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-

In [43]:
print(all_info_lt[30])

[{'text': 'ノートパソコンの重量はどれくらいですか？持ち運びに便利ですか？', 'language': 'japanese', 'ner': (['ノート', 'パソコン', 'の', '重量', 'は', 'どれ', 'くらい', 'です', 'か', '?', '持ち運び', 'に', '便利', 'です', 'か', '?'], ['名詞', '名詞', '助詞', '名詞', '助詞', '代名詞', '助詞', '助動詞', '助詞', '補助記号', '名詞', '助詞', '名詞', '助動詞', '助詞', '補助記号'])}]


In [49]:
print(all_info_lt[5])

[{'text': 'Is the keyboard backlit?', 'language': 'english', 'ner': [], 'ner_spacy': []}]
