<a href="https://colab.research.google.com/github/michaelmoju/ml_final_fall21/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# spaCy

In [1]:
import spacy

In [2]:
# Load the "en_core_web_sm" spaCy pipeline; `nlp` is the callable used below
# to process raw text.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Run the pipeline on a sample sentence; the entities it finds are read from
# `doc.ents` in the next cell.
# NOTE(review): the Flair section further down uses "(China)" instead of
# "(Taiwan)" in the otherwise identical sentence — confirm this is intended.
doc = nlp("Fung Permadi (Taiwan) v Fraida.")


In [4]:
# Table header: a 12-character, left-aligned entity column followed by the
# entity type, then a blank line.
header = '{:<12}  {:}\n'.format('Entity', 'Type')
print(header)

# One row per entity found by spaCy: surface text `ent.text`, then its label
# `ent.label_`.
for ent in doc.ents:
    row = '{:<12}  {:}'.format(ent.text, ent.label_)
    print(row)

Entity        Type

Fung Permadi (Taiwan  ORG


# Flair

In [5]:
!pip install --upgrade git+https://github.com/flairNLP/flair.git

Collecting git+https://github.com/flairNLP/flair.git
  Cloning https://github.com/flairNLP/flair.git to /tmp/pip-req-build-1szxw3on
  Running command git clone -q https://github.com/flairNLP/flair.git /tmp/pip-req-build-1szxw3on
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting huggingface-hub
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 3.2 MB/s 
[?25hCollecting mpld3==0.3
  Downloading mpld3-0.3.tar.gz (788 kB)
[K     |████████████████████████████████| 788 kB 10.0 MB/s 
[?25hCollecting janome
  Downloading Janome-0.4.1-py2.py3-none-any.whl (19.7 MB)
[K     |████████████████████████████████| 19.7 MB 1.3 MB/s 
[?25hCollecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[K     |████████████████████████████████| 981 kB 51.9 MB/s 
[?25hCollecting konoha<5.0.0,>=4.0.0
  Downloading ko

In [6]:
from flair.data import Sentence
from flair.models import SequenceTagger

In [7]:
# Load the pretrained "flair/ner-english-large" sequence tagger. The recorded
# output below shows a 2.24G download for this call.
tagger = SequenceTagger.load("flair/ner-english-large")

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

2021-11-30 22:02:18,288 loading file /root/.flair/models/ner-english-large/07301f59bb8cb113803be316267f06ddf9243cdbba92a4c8067ef92442d2c574.554244d3476d97501a766a98078421817b14654496b86f2f7bd139dc502a4f29


Downloading:   0%|          | 0.00/4.83M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/8.68M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/513 [00:00<?, ?B/s]

In [8]:
# Wrap the sample text in a Flair `Sentence`; the tagger annotates this object
# in the next cell.
sentence = Sentence("Fung Permadi (China) v Fraida.")

In [9]:
# Annotate the sentence in place with the tagger's NER predictions.
tagger.predict(sentence)

# Export the 'ner' annotations as a plain dict; its 'entities' list is what
# gets printed below.
entity_dict = sentence.to_dict(tag_type='ner')

# Table header, surrounded by blank lines.
header = '\n{:<12}  {:}\n'.format('Entity', 'Type(s)')
print(header)

# One row per entity: its text, then the string form of its label list
# (the recorded output shows each label with a score in parentheses).
for entity in entity_dict['entities']:
    entity_text = entity["text"]
    entity_labels = str(entity["labels"])
    print('{:<12}  {:}'.format(entity_text, entity_labels))


Entity        Type(s)

Fung Permadi  [PER (1.0)]
China         [LOC (1.0)]
Fraida        [ORG (0.9998)]


In [10]:
# Second sample sentence.
# NOTE(review): in the cells shown, this sentence is never passed to
# `tagger.predict` and `sentence` is reassigned to `corpus.test[1]` further
# down — confirm whether a prediction cell is missing or this cell is leftover.
sentence = Sentence("Fraida said that this is right.")

In [11]:
# Mount Google Drive at /content/drive (Colab-only API); the corpus files
# loaded below are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
from flair.data import Corpus
from flair.datasets import ColumnCorpus

# Column index -> annotation layer: column 0 holds the token text and
# column 3 the NER tag; the other columns of the files are not read.
columns = {0: 'text', 3: 'ner'}

# Read the train / test / dev splits from the mounted Drive folder.
corpus: Corpus = ColumnCorpus(
    data_folder='/content/drive/MyDrive/conll2003pp/',
    column_format=columns,
    train_file='conllpp_train.txt',
    test_file='conllpp_test.txt',
    dev_file='conllpp_dev.txt',
)

2021-11-30 22:08:05,228 Reading data from /content/drive/MyDrive/conll2003pp
2021-11-30 22:08:05,234 Train: /content/drive/MyDrive/conll2003pp/conllpp_train.txt
2021-11-30 22:08:05,235 Dev: /content/drive/MyDrive/conll2003pp/conllpp_dev.txt
2021-11-30 22:08:05,239 Test: /content/drive/MyDrive/conll2003pp/conllpp_test.txt


In [18]:
import pandas as pd

# Splits listed in the same order as the table columns below.
splits = (corpus.train, corpus.test, corpus.dev)

# A single row holding the number of sentences in each split.
data = [[len(split) for split in splits]]

# Render the split sizes as a one-row table.
pd.DataFrame(data, columns=["Train", "Test", "Development"])

Unnamed: 0,Train,Test,Development
0,14987,3684,3466


In [19]:
# Build the dictionary of tags for the 'ner' label type from the corpus; the
# recorded output lists the 10 tags it contains.
label_type = 'ner'
label_dict = corpus.make_label_dictionary(label_type=label_type)

2021-11-30 22:08:29,478 Computing label dictionary. Progress:


100%|██████████| 14987/14987 [00:01<00:00, 12903.55it/s]

2021-11-30 22:08:30,654 Corpus contains the labels: ner (#204567)
2021-11-30 22:08:30,657 Created (for label 'ner') Dictionary with 10 tags: <unk>, O, B-ORG, B-MISC, B-PER, I-PER, B-LOC, I-ORG, I-MISC, I-LOC





In [20]:
# Show the tags collected in the label dictionary.
print(label_dict)

Dictionary with 10 tags: <unk>, O, B-ORG, B-MISC, B-PER, I-PER, B-LOC, I-ORG, I-MISC, I-LOC


In [21]:
# Inspect the first test sentence; the recorded output shows it is the
# single-token line "-DOCSTART-" rather than real text.
corpus.test[0]

Sentence: "-DOCSTART-"   [− Tokens: 1]

In [22]:
# Take the second test sentence (index 0 is the "-DOCSTART-" line shown above).
sentence = corpus.test[1]

In [23]:
# Export the sentence's 'ner' annotations as a dict. No tagger is run on this
# sentence, so these are the labels read from the corpus file.
entity_dict = sentence.to_dict(tag_type='ner')

In [24]:
# Show the sentence text as one plain string.
sentence.to_plain_string()

'SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .'

In [25]:
# List each annotated entity of the corpus sentence with its label(s).
# These labels were read from the corpus file (no tagger was run on this
# sentence); the recorded output shows a score of 1.0 for each of them.
# The former bare `entity_dict` expression at the top of this cell was removed:
# it was not the cell's last expression, so it displayed nothing.
for entity in entity_dict['entities']:
    print('{:<12}  {:}'.format(entity["text"], str(entity["labels"])))

JAPAN         [LOC (1.0)]
CHINA         [LOC (1.0)]
