In [18]:
from galvasr2.align.spark.schemas import ARCHIVE_ORG_SCHEMA
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from flair.data import Sentence
from flair.models import SequenceTagger
from tqdm import tqdm

spark = SparkSession.builder.appName('CC-BY-license').getOrCreate()
input_catalogue_path = "gs://the-peoples-speech-west-europe/archive_org/Mar_7_2021/EXPANDED_LICENSES_FILTERED_ACCESS.jsonl.gz"
df = spark.read.format('json').schema(ARCHIVE_ORG_SCHEMA).load(input_catalogue_path)
print(df.count())
columns = [df.metadata.identifier, df.metadata.description, df.metadata.subject]
df = df.select(columns)
df = df.na.fill('No info')
df = df.select(F.col("`metadata.identifier`").alias("identifier"), 
               F.col("`metadata.description`").alias("description"),
               F.col("`metadata.subject`").alias("subject"))
df = df.withColumn("subject",F.concat_ws(",",F.col("subject")))
df = df.toPandas()
df.head()

63627


Unnamed: 0,identifier,description,subject
0,0084_Tomorrow_Always_Comes_20_00_32_00,Promotes Bur-Mil Rayon Fabrics (at great lengt...,No info
1,00BienvenidoALaRevolucinDeLaImpresinEn3D,3d,3d
2,00BienvenidoALaRevolucinDeLaImpresinEn3D_201809,3d printing; education,3d printing; education
3,02777AOpenApril1213_201403,"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<p class=""Ms...","[""open"",""friday"",""rhina"",""rhina valentin"",""bro..."
4,02777AOpenApril1713_201403,"\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<p class=""Ms...","[""open wednesday"",""bronxnet tv"",""public access..."


In [19]:
df = df.replace(r'\n',' ', regex=True) 
df['description'] = df['description'].str.replace('<[^<]+?>', '')
df['subject'] = df['subject'].str.strip('[]').astype(str)
df = df.replace('"', '', regex=True)
df = df.replace(',', '  ', regex=True)
df.head()

Unnamed: 0,identifier,description,subject
0,0084_Tomorrow_Always_Comes_20_00_32_00,Promotes Bur-Mil Rayon Fabrics (at great lengt...,No info
1,00BienvenidoALaRevolucinDeLaImpresinEn3D,3d,3d
2,00BienvenidoALaRevolucinDeLaImpresinEn3D_201809,3d printing; education,3d printing; education
3,02777AOpenApril1213_201403,OPEN is shot LIVE from Bron...,open friday rhina rhina valentin bronx br...
4,02777AOpenApril1713_201403,OPEN is shot LIVE from Bron...,open wednesday bronxnet tv public access da...


## NER

In [49]:
# load tagger
tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1331379415.0, style=ProgressStyle(descr…


2021-05-31 00:25:52,033 loading file /root/.flair/models/ner-english-ontonotes-fast/0d55dd3b912da9cf26e003035a0c269a0e9ab222f0be1e48a3bbba3a58c0fed0.c9907cd5fde3ce84b71a4172e7ca03841cd81ab71d13eb68aa08b259f57c00b6


In [60]:
%%time
ner_entities = {
    'CARDINAL':[],
    'DATE':[],
    'EVENT':[],
    'FAC':[],
    'GPE':[],
    'LANGUAGE':[],
    'LAW':[],
    'LOC':[],
    'MONEY':[],
    'NORP':[],
    'ORDINAL':[],
    'ORG':[],
    'PERCENT':[],
    'PERSON':[],
    'PRODUCT':[],
    'QUANTITY':[],
    'TIME':[],
    'WORK_OF_ART':[]
}
tagger = SequenceTagger.load("flair/ner-english-ontonotes-fast")
def get_top_class(tagger, row):
    try:
        sentence = Sentence(row)
        tagger.predict(sentence)
        for entity in sentence.get_spans('ner'):
            ner_entities[entity.tag].append(entity.text) 
    except:
        return 'problem'
description = df['description'].values
subject = df['subject'].values

2021-05-31 01:27:18,769 loading file /root/.flair/models/ner-english-ontonotes-fast/0d55dd3b912da9cf26e003035a0c269a0e9ab222f0be1e48a3bbba3a58c0fed0.c9907cd5fde3ce84b71a4172e7ca03841cd81ab71d13eb68aa08b259f57c00b6
CPU times: user 3.43 s, sys: 561 ms, total: 3.99 s
Wall time: 4.19 s


In [61]:
%%time
subject = df['subject'].values
for i in tqdm(range(df.shape[0])):
    get_top_class(tagger, subject[i])

 14%|█▎        | 8668/63627 [05:32<38:15, 23.94it/s]  



 14%|█▎        | 8672/63627 [05:33<36:13, 25.29it/s]



 14%|█▎        | 8704/63627 [05:33<20:59, 43.61it/s]



 14%|█▎        | 8712/63627 [05:33<18:33, 49.33it/s]



 14%|█▍        | 8788/63627 [05:35<27:30, 33.23it/s]



 14%|█▍        | 8841/63627 [05:37<40:13, 22.70it/s]



 14%|█▍        | 8864/63627 [05:38<33:40, 27.10it/s]



 14%|█▍        | 8872/63627 [05:38<28:09, 32.42it/s]



 14%|█▍        | 8877/63627 [05:38<25:19, 36.03it/s]



 14%|█▍        | 8913/63627 [05:39<22:57, 39.71it/s]



 14%|█▍        | 8939/63627 [05:39<18:32, 49.14it/s]



 15%|█▍        | 9273/63627 [05:53<45:18, 20.00it/s]



 15%|█▍        | 9279/63627 [05:53<36:18, 24.94it/s]



 15%|█▍        | 9290/63627 [05:53<27:56, 32.42it/s]



 15%|█▍        | 9464/63627 [05:57<15:44, 57.36it/s]



 15%|█▍        | 9470/63627 [05:58<19:44, 45.74it/s]



 16%|█▌        | 10145/63627 [06:18<15:53, 56.07it/s]



 16%|█▌        | 10151/63627 [06:18<15:43, 56.71it/s]



 16%|█▌        | 10160/63627 [06:19<16:08, 55.20it/s]



 16%|█▌        | 10166/63627 [06:19<15:48, 56.36it/s]



 16%|█▌        | 10172/63627 [06:19<16:30, 53.95it/s]



 16%|█▌        | 10178/63627 [06:19<18:53, 47.14it/s]



 16%|█▌        | 10183/63627 [06:19<20:38, 43.17it/s]



 16%|█▌        | 10297/63627 [06:20<09:44, 91.23it/s]



 16%|█▌        | 10307/63627 [06:21<10:21, 85.73it/s]



 16%|█▌        | 10326/63627 [06:21<10:48, 82.14it/s]



 16%|█▋        | 10343/63627 [06:21<21:43, 40.88it/s]



 16%|█▋        | 10350/63627 [06:22<23:09, 38.33it/s]



 16%|█▋        | 10378/63627 [06:22<22:57, 38.66it/s]



 16%|█▋        | 10413/63627 [06:23<33:46, 26.26it/s]



 16%|█▋        | 10417/63627 [06:24<33:53, 26.16it/s]



 16%|█▋        | 10452/63627 [06:24<17:02, 52.02it/s]



 16%|█▋        | 10464/63627 [06:24<14:19, 61.87it/s]



 38%|███▊      | 24327/63627 [08:56<06:47, 96.52it/s] IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [62]:
import pandas as pd
df_ner = pd.DataFrame.from_dict(ner_entities, orient='index').T
df_ner.head()

Unnamed: 0,CARDINAL,DATE,EVENT,FAC,GPE,LANGUAGE,LAW,LOC,MONEY,NORP,ORDINAL,ORG,PERCENT,PERSON,PRODUCT,QUANTITY,TIME,WORK_OF_ART
0,78,2013,WWI,Molokai Molokai Planning,Hawaii,English,Second Amendment,Maui Water,15.0,Spanish,44th,Revolución Rusa,99 %,amanda holley,Gamera,12 mile,tonight,The Preacher One Man Show
1,17,Saturday,World War,Vermont South Burlington City Center,Rusia,English,,Maui Water,,Spanish,first,Russian Revolution,,Jin,Lynx,120 seconds,minutes,The Preacher One Man Show
2,17,Saturday,WW2,Parade Snow,Russia,Russian,,Maui Water,,Farsi,44th,Historia,,Jadoo - Ml Zaheer Raggie,Crusaders,,hour,The Preacher One Man Show
3,17,2016,Canton 2016 Sports 2016,Lanai Lanai Planning,Michigan,Russian,,Maui Water,,Persian,6th,Holland City Council,,Lenin,Crusaders,,24 hour,The Preacher One Man Show
4,17,2016,9 / 11,Cuban Bridge,Holland City,English,,Maui Water,,Asian American,6th,Holland City Council,,Putin,Jack the Ripper,,24 hour,The Preacher One Man Show


## Count appear words

In [31]:
from collections import Counter
subject = df['subject'].values
final_count = Counter({'a':0})
for i in tqdm(range(df.shape[0])):
    count_row = (Counter(subject[i].split()))
    final_count = final_count + count_row
final_count

100%|██████████| 63627/63627 [04:37<00:00, 229.69it/s]


Counter({'No': 47513,
         'info': 47511,
         '3d': 8,
         'printing;': 1,
         'education': 40,
         'open': 13,
         'friday': 3,
         'rhina': 4,
         'valentin': 3,
         'bronx': 2,
         'bronxnet': 9,
         'cablevision': 1,
         'verizon': 1,
         'fios': 1,
         'gail': 1,
         'digital': 28,
         'kabuki': 2,
         'lehman': 1,
         'college': 37,
         'art': 31,
         'wednesday': 5,
         'tv': 24,
         'public': 86,
         'access': 63,
         'daren': 6,
         'jaime': 6,
         'sports': 45,
         'roundup': 2,
         'monday': 2,
         'bronxmedia': 2,
         'bob': 7,
         'lee': 4,
         'dj': 1,
         'cool': 4,
         'clyde': 3,
         'bobbyc': 5,
         'bobyc': 2,
         'fitness': 4,
         'music': 124,
         'yankees': 1,
         'baseball': 2,
         'healthcare': 2,
         'lifestyle': 2,
         'business': 7,
         'perfor

In [33]:
dict(sorted(final_count.items(), key=lambda item: item[1], reverse=True))

{'No': 47513,
 'info': 47511,
 'Community': 9191,
 'Media': 7499,
 'TV': 7174,
 'Belmont': 5755,
 'Access': 5335,
 'PEG': 4922,
 'Vimeo': 4017,
 'Government': 3928,
 'Public': 3271,
 'Belmont;': 2327,
 'Archive': 2324,
 'Moving': 2320,
 'Image': 2320,
 'Center;': 2314,
 'Massachusetts;': 2314,
 'Committee': 2099,
 'on': 2079,
 'Holland': 2051,
 'U.S.': 1905,
 'Vermont': 1686,
 'Peters': 1671,
 'Township': 1654,
 'and': 1540,
 'of': 1486,
 'City': 1423,
 'Pennsylvania': 1321,
 'PTCT7': 1321,
 'McMurray': 1320,
 '2015': 1312,
 'Richmond': 1291,
 'Meeting': 1260,
 'Paul': 1081,
 'Maui': 1067,
 'Congress;': 941,
 'Michigan': 937,
 'Saint': 936,
 'Youtube': 897,
 '2014': 876,
 '2013': 751,
 '-': 687,
 'Minnesota': 678,
 '2018': 659,
 'Burlington': 651,
 'Network': 648,
 'Television': 619,
 'Council': 616,
 'County': 614,
 '2017': 592,
 'House;': 575,
 'Commission': 568,
 'Reform': 553,
 'Florida': 552,
 'Sarasota': 551,
 'Oversight': 550,
 'the': 543,
 'Westwood': 520,
 '2012': 511,
 '2018;