In [2]:
import pandas as pd
from ktrain import text 
from flair.data import Sentence
from flair.models import SequenceTagger
import logging
from multiprocessing.pool import ThreadPool
from tqdm import tqdm

from galvasr2.align.spark.align_lib import load_audio_id_text_id_mapping, load_transcripts

In [3]:
%%time
# Build (or reuse) the Spark session used by every cell below. The settings
# give the driver and executors 40g each, lengthen the RPC/heartbeat
# timeouts, enable Arrow for pandas conversion and turn on the event log.
# NOTE(review): `SparkSession` is not imported in the cells visible here --
# presumably an earlier cell does `from pyspark.sql import SparkSession`; confirm.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("NER")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
    .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
    .config("spark.driver.memory", "40g")
    .config("spark.executor.memory", "40g")
    .config("spark.rpc.askTimeout", "480s")
    .config("spark.executor.heartbeatInterval", "20000ms")
    .config("spark.eventLog.enabled", "true")
    .getOrCreate()
)

# Catalogue file (passed to load_audio_id_text_id_mapping) and the data root
# (passed to load_transcripts) of the archive.org dump used in this notebook.
data_trans_index = "gs://the-peoples-speech-west-europe/archive_org/Mar_7_2021/CC_BY_SA_EXPANDED_LICENSES_FILTERED_ACCESS.jsonl.gz"
data_trans = "gs://the-peoples-speech-west-europe/archive_org/Mar_7_2021/CC_BY_SA_EXPANDED_LICENSES_FILTERED_ACCESS"

CPU times: user 3.6 ms, sys: 0 ns, total: 3.6 ms
Wall time: 3.79 ms


In [4]:
class NER():
    """Helpers to load transcripts through Spark and annotate them.

    The class bundles four steps that operate on a pandas DataFrame holding a
    ``transcript`` column: loading the data, language detection, zero-shot
    topic classification and named-entity extraction with flair.

    Attributes:
        num_rows: Maximum number of catalogue rows / transcripts to process.
            ``None`` means "no limit".
    """

    # Transcripts are truncated to this many characters before being handed
    # to the zero-shot classifier or the NER tagger.
    MAX_TRANSCRIPT_CHARS = 20000

    # Candidate topics offered to the zero-shot classifier.
    TOPIC_LABELS = ['politics', 'elections', 'sports', 'films', 'television', 'artificial intelligence',
                    'food', 'healthy', 'Information technology', 'financials', 'communication services']

    # Entity tags that are always present as keys in the NER result, even
    # when no entity of that type was found.
    ENTITY_TAGS = ('CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY',
                   'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME',
                   'WORK_OF_ART')

    def __init__(self, num_rows: int = 200):
        """Store the sample size.

        Args:
            num_rows: How many rows to load and process; pass ``None`` to
                process everything.
        """
        self.num_rows = num_rows

    def read_transcription_data(self, spark: SparkSession, data_trans_index: str = data_trans_index,
                                data_trans: str = data_trans) -> pd.DataFrame:
        """Load the transcripts of the first ``num_rows`` catalogue entries.

        Args:
            spark: Active Spark session.
            data_trans_index: Path of the catalogue file passed to
                ``load_audio_id_text_id_mapping``.
            data_trans: Path of the data root passed to ``load_transcripts``.

        Returns:
            The result of ``load_transcripts`` converted to a pandas DataFrame.
        """
        # spark.sparkContext.setLogLevel("INFO") # "ALL" for very verbose logging
        # Only let py4j report errors so the notebook output stays readable.
        logging.getLogger("py4j").setLevel(logging.ERROR)
        catalogue_df = load_audio_id_text_id_mapping(spark, data_trans_index)
        training_sample_rows = catalogue_df.collect()

        # With num_rows=None everything is loaded. That might take ~15 minutes
        # on an 8 core machine (figure reported by the original author).
        if self.num_rows is not None:
            training_sample_rows = training_sample_rows[:self.num_rows]
        transcripts_df = load_transcripts(spark, data_trans, training_sample_rows)
        return transcripts_df.toPandas()

    @staticmethod
    def detect_languague(transcripts_pdf: pd.DataFrame) -> pd.DataFrame:
        """Add a ``language`` column computed from the ``transcript`` column.

        Declared as a static method because it takes no ``self``; it can be
        called on the class or on an instance. The frame is modified in place
        and also returned.

        Args:
            transcripts_pdf: Frame with a ``transcript`` column.

        Returns:
            The same frame; rows whose detection failed get ``'problem'``.
        """
        def detect_lan(row):
            try:
                return detect(row)
            except NameError:
                # NOTE(review): `detect` is not imported in the visible cells
                # (presumably `from langdetect import detect`). A missing
                # import is a setup error, not a bad row, so surface it
                # instead of labelling every row 'problem'.
                raise
            except Exception:
                return 'problem'
        transcripts_pdf['language'] = transcripts_pdf['transcript'].apply(detect_lan)
        return transcripts_pdf

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    detect_language = detect_languague

    @staticmethod
    def get_text_classification(transcripts_pdf: pd.DataFrame) -> pd.DataFrame:
        """Add a ``classification`` column holding the top zero-shot topic.

        Declared as a static method because it takes no ``self``. The frame is
        modified in place and also returned.

        Args:
            transcripts_pdf: Frame with a ``transcript`` column.

        Returns:
            The same frame; rows that could not be classified get ``'problem'``.
        """
        zsl = text.ZeroShotClassifier()
        labels = NER.TOPIC_LABELS

        def get_top_class(row):
            try:
                row = row[:NER.MAX_TRANSCRIPT_CHARS]
                classification = zsl.predict(row, labels=labels, include_labels=True, batch_size=1, multilabel=False)
                # Each prediction is treated as a (label, score) pair; the
                # stable ascending sort leaves the best-scoring label last.
                return sorted(classification, key=lambda tup: tup[1])[-1][0]
            except Exception:
                return 'problem'
        transcripts_pdf['classification'] = transcripts_pdf['transcript'].apply(get_top_class)
        return transcripts_pdf

    def load_ner_model(self) -> SequenceTagger:
        """Load and return the ``flair/ner-english-ontonotes-fast`` tagger."""
        return SequenceTagger.load("flair/ner-english-ontonotes-fast")

    def get_ner_transcription(self, transcripts_pdf: pd.DataFrame) -> dict:
        """Collect the named entities found in the first ``num_rows`` transcripts.

        Args:
            transcripts_pdf: Frame with a ``transcript`` column.

        Returns:
            Dict mapping entity tag to the list of entity texts found. Every
            tag in ``ENTITY_TAGS`` is present; tags outside that list are
            added when the tagger emits them.
        """
        ner_entities = {tag: [] for tag in self.ENTITY_TAGS}
        tagger = self.load_ner_model()
        # Slicing (instead of indexing up to num_rows) copes with frames that
        # hold fewer rows than requested; num_rows=None keeps every row.
        transcripts = transcripts_pdf['transcript'].values[:self.num_rows]
        for index, transcript in enumerate(tqdm(transcripts)):
            try:
                sentence = Sentence(transcript[:self.MAX_TRANSCRIPT_CHARS])
                tagger.predict(sentence)
                spans = sentence.get_spans('ner')
            except Exception:
                # Best effort: one bad transcript must not abort the run.
                logging.getLogger(__name__).warning(
                    "NER failed for transcript %d; skipping it", index, exc_info=True)
                continue
            for entity in spans:
                # setdefault keeps entities whose tag is not pre-registered
                # instead of failing on the first unknown tag.
                ner_entities.setdefault(entity.tag, []).append(entity.text)
        return ner_entities

In [5]:
%%time
# Work on a 100-row sample and pull its transcripts into a pandas DataFrame.
ner_model = NER(num_rows=100)
transcripts_pdf = ner_model.read_transcription_data(
    spark=spark,
    data_trans_index=data_trans_index,
    data_trans=data_trans,
)

  self._sock = None


CPU times: user 631 ms, sys: 58.4 ms, total: 689 ms
Wall time: 1min 51s


  self._sock = None


## One example

In [9]:
%%time
# Single sequential run: extract the named entities from the loaded transcripts.
# `result` is the dict returned by NER.get_ner_transcription.
result = ner_model.get_ner_transcription(transcripts_pdf)

2021-05-28 22:24:14,445 loading file /root/.flair/models/ner-english-ontonotes-fast/0d55dd3b912da9cf26e003035a0c269a0e9ab222f0be1e48a3bbba3a58c0fed0.c9907cd5fde3ce84b71a4172e7ca03841cd81ab71d13eb68aa08b259f57c00b6






  0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A


 57%|█████▋    | 57/100 [12:49<10:29, 14.64s/it][A[A[A
 57%|█████▋    | 57/100 [12:49<11:15, 15.72s/it][A

 58%|█████▊    | 58/100 [12:52<09:33, 13.65s/it][A[A


 58%|█████▊    | 58/100 [12:58<08:57, 12.79s/it][A[A[A



  1%|          | 1/100 [00:11<19:26, 11.78s/it][A[A[A[A
 58%|█████▊    | 58/100 [13:04<10:41, 15.27s/it][A

 59%|█████▉    | 59/100 [13:08<09:54, 14.49s/it][A[A


 59%|█████▉    | 59/100 [13:12<09:02, 13.24s/it][A[A[A
 59%|█████▉    | 59/100 [13:18<10:17, 15.06s/it][A

 59%|█████▉    | 59/100 [13:22<10:41, 15.65s/it][A[A



 60%|██████    | 60/100 [13:26<10:19, 15.49s/it][A[A[A[A


 60%|██████    | 60/100 [13:29<09:31, 14.30s/it][A[A[A
 60%|██████    | 60/100 [13:33<09:57, 14.93s/it][A

 61%|██████    | 61/100 [13:39<09:38, 14.83s/it][A[A


 61%|██████    | 61/100 [13:42<09:08, 14.06s/it][A[A[A
 61%|██████    | 61/100 [13:46<09:16, 14.26s/it][A

 62%|██████▏   | 62/100 [13:55<09

CPU times: user 14min 36s, sys: 6min, total: 20min 37s
Wall time: 10min 6s





## Multiple results with parallelization

In [None]:
%%time
import tensorflow as tf

# Number of NER jobs submitted to the thread pool. Every job receives the
# same DataFrame, so this cell measures throughput rather than producing
# distinct results.
N_PARALLEL_RUNS = 16

# NOTE(review): the NER tagger comes from flair, which presumably does not use
# TensorFlow, so this strategy scope may have no effect on it -- confirm
# whether it is still needed.
strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
    # Using the pool as a context manager releases its worker threads once
    # the (blocking) map call has returned; the original never closed it.
    with ThreadPool() as executor:
        result = executor.map(ner_model.get_ner_transcription, [transcripts_pdf] * N_PARALLEL_RUNS)

2021-05-28 22:36:38,469 loading file /root/.flair/models/ner-english-ontonotes-fast/0d55dd3b912da9cf26e003035a0c269a0e9ab222f0be1e48a3bbba3a58c0fed0.c9907cd5fde3ce84b71a4172e7ca03841cd81ab71d13eb68aa08b259f57c00b6
2021-05-28 22:36:38,472 loading file /root/.flair/models/ner-english-ontonotes-fast/0d55dd3b912da9cf26e003035a0c269a0e9ab222f0be1e48a3bbba3a58c0fed0.c9907cd5fde3ce84b71a4172e7ca03841cd81ab71d13eb68aa08b259f57c00b6


  0%|          | 0/100 [00:00<?, ?it/s]
  1%|          | 1/100 [00:06<10:56,  6.63s/it]
  2%|▏         | 2/100 [00:10<09:15,  5.67s/it][A
  3%|▎         | 3/100 [00:16<09:35,  5.93s/it][A
  4%|▍         | 4/100 [00:23<09:48,  6.13s/it][A
  5%|▌         | 5/100 [00:29<09:56,  6.28s/it][A
  6%|▌         | 6/100 [00:36<10:01,  6.40s/it][A
  7%|▋         | 7/100 [00:43<10:23,  6.71s/it][A
  8%|▊         | 8/100 [00:50<10:16,  6.71s/it][A
  9%|▉         | 9/100 [00:57<10:11,  6.72s/it][A
 10%|█         | 10/100 [01:04<10:02,  6.70s/it][A
 11%|█         | 11/100 [01:10<09:55,  6.69s/it][A
 12%|█▏        | 12/100 [01:17<09:47,  6.67s/it][A
 13%|█▎        | 13/100 [01:23<09:40,  6.67s/it][A
 14%|█▍        | 14/100 [01:30<09:33,  6.66s/it][A
 15%|█▌        | 15/100 [01:38<09:47,  6.91s/it][A
 16%|█▌        | 16/100 [01:44<09:33,  6.83s/it][A
 17%|█▋        | 17/100 [01:51<09:22,  6.77s/it][A
 18%|█▊        | 18/100 [01:58<09:12,  6.74s/it][A
 19%|█▉        | 19/100 [02:04<09:02,