**Note**:
This notebook combines the named entities and their entity types, which are stored in separate per-year files, into a single file.

In [1]:
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import janitor
from tqdm.notebook import tqdm
import ast
import gc

# Directory containing the yearly NER extracts (files named like 'toi_ner_<year>.csv.gz').
LF_FP = '../../toi-lf/ner'
# os.listdir returns entries in arbitrary order; sort them so the files are
# processed in a deterministic order (chronological for 4-digit years), which
# keeps the row order of the combined output reproducible across runs.
filelist = sorted(os.listdir(LF_FP))
yearly_datafiles = [filename for filename in filelist if filename.endswith(".csv.gz")]
yearly_datafiles[:5]

['toi_ner_1838.csv.gz',
 'toi_ner_1839.csv.gz',
 'toi_ner_1840.csv.gz',
 'toi_ner_1841.csv.gz',
 'toi_ner_1842.csv.gz']

In [None]:
# Build one long-format frame per yearly file and concatenate them once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside the loop
# re-copies every accumulated row on each iteration (quadratic in total rows).
yearly_frames = []
for filename in tqdm(yearly_datafiles):
    # e.g. 'toi_ner_1838.csv.gz' -> '1838' (kept as a string, as before)
    year = filename.split('_')[2].split('.')[0]
    ner_pairs = (
        pd.read_csv(os.path.join(LF_FP, filename), usecols=['ner'])['ner']
        # Each cell holds the string repr of a list; parse it back into Python objects.
        .map(ast.literal_eval)
        # One row per list element; an empty list explodes to a single NaN row ...
        .explode()
        # ... which is dropped here (explicit form of the `ner == ner` NaN filter).
        .dropna()
    )
    # Each remaining element is indexed positionally: [0] -> entity, [1] -> nertype.
    yearly_frames.append(
        pd.DataFrame({
            'yearofpub': [year] * len(ner_pairs),
            'entity': [pair[0] for pair in ner_pairs],
            'nertype': [pair[1] for pair in ner_pairs],
        })
    )

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns when no yearly files were found.
df_full = (
    pd.concat(yearly_frames, ignore_index=True)
    if yearly_frames
    else pd.DataFrame(columns=['yearofpub', 'entity', 'nertype'])
)
# Release the per-year pieces now that they have been copied into df_full.
del yearly_frames
gc.collect()

  0%|          | 0/171 [00:00<?, ?it/s]

In [None]:
# Persist the combined per-year entity table as a gzip-compressed CSV (row index omitted).
df_full.to_csv(
    '../../toi-lf/entity_type_year.csv.gz',
    index=False,
    compression='gzip',
)

In [None]:
# Display the first two rows of the combined frame as a quick sanity check.
df_full.head(2)

In [None]:
# Count how often each (entity, nertype) pair occurs across all years,
# ordered from most to least frequent, and save the result.
df_agg = (
    df_full
    .groupby(['entity', 'nertype'])
    .size()
    # Name the size column directly instead of renaming the default `0` column afterwards.
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
df_agg.to_csv(
    '../../toi-lf/entity_type_agg.csv.gz',
    index=False,
    compression='gzip',
)

In [None]:
# Export the 1,000 most frequent entities tagged PERSON.
(
    df_agg
    .loc[lambda agg: agg['nertype'] == "PERSON"]
    .sort_values('count', ascending=False)
    .head(1000)
    .to_csv('../data/toi_top1000_persons.csv', index=False)
)

In [None]:
# Export the 1,000 most frequent entities tagged GPE.
(
    df_agg
    .loc[lambda agg: agg['nertype'] == "GPE"]
    .sort_values('count', ascending=False)
    .head(1000)
    .to_csv('../data/toi_top1000_gpe.csv', index=False)
)