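**Note**: This notebook runs spaCy named-entity recognition (NER) over the Times of India (TOI) articles and writes one output file per publication year to `ner/` (`toi_ner_1838.csv.gz`, `toi_ner_1839.csv.gz`, ..., `toi_ner_2008.csv.gz`).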

In [1]:
# Standard library
import gc
import sys

# Third-party
import janitor  # not referenced by name; presumably imported to register DataFrame methods such as .clean_names()
import pandas as pd
import spacy
import telegram_send
from tqdm.notebook import tqdm

# Helper module (not a published package as far as this notebook shows)
from data_utils import beepme, fix_separated_words

# spaCy pipeline used by get_ner() to extract named entities
nlp = spacy.load("en_core_web_sm")

# Gzipped CSV with the parsed TOI articles
FP_TOI_LF = '../../readable_news_lf/toi_parsed_all.csv.gz'

In [2]:
# Load the parsed articles, drop advertisements and rows without text, repair
# the text with fix_separated_words, and derive `date` / `year` from the
# numeric publication date (YYYYMMDD).
df = (pd.read_csv(FP_TOI_LF, compression='gzip',
                  usecols=['FullText', 'ObjectType', 'NumericPubDate'])
      .clean_names()
      # Remove ads. na=False treats a missing object type as "not an ad" so a
      # NaN cannot break the boolean negation.
      .query('~objecttype.str.contains("advert", case=False, na=False)')
      # List form of `subset` is accepted by every pandas version.
      .dropna(subset=['fulltext'])
      .assign(
          # Lambda parameter is named `d` so it does not shadow the outer `df`.
          fulltext=lambda d: d.fulltext.apply(fix_separated_words),
          date=lambda d: pd.to_datetime(d.numericpubdate, format='%Y%m%d'),
          # Vectorised accessor instead of a Python-level loop over timestamps.
          year=lambda d: d.date.dt.year,
      )
     )

df.head(3)

Unnamed: 0,fulltext,objecttype,numericpubdate,date,year
0,SJqSwldni8 vold apf f181 CflSyZTS Theleadergua...,Credit/Acknowledgement,19980224,1998-02-24,1998
1,JHHQk guest Akshay Page 3,"['Table of Contents', 'Front Matter']",19960522,1996-05-22,1996
3,By Vldyadhar Date The Times of India News Serv...,"['Feature', 'Article']",19980916,1998-09-16,1998


In [3]:
# Progress record: one row per publication year with a `completed` flag, so an
# interrupted run can resume where it stopped.
# Reuse the record on disk when it exists; otherwise start a fresh one with
# every year marked as not completed (previously this needed a manual
# `if False:` toggle and failed on a first run).
try:
    df_record = pd.read_csv('toi_ner_record.csv', index_col=0)
except FileNotFoundError:
    df_record = pd.DataFrame(
        {'completed': False},
        index=range(df.year.min(), df.year.max()+1),
    )

df_record.head(3)

Unnamed: 0,completed
1838,True
1839,True
1840,True


In [4]:
def get_ner(df):
    """Run the spaCy pipeline ``nlp`` over each row's ``fulltext``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``fulltext`` column holding the text to process.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with two added columns:

        * ``completed`` -- ``True`` when the row was processed successfully,
          ``False`` when processing raised an exception.
        * ``ner`` -- list of ``(entity_text, entity_label)`` tuples taken from
          ``doc.ents``; the empty string ``''`` for rows that failed.

    Notes
    -----
    A failure on a single row sends a Telegram notification and processing
    moves on to the next row. The failed row keeps ``completed=False`` and
    ``ner=''`` rather than inheriting the entities of the previous row.
    """
    completed_flags = []
    ner_values = []

    for text in tqdm(df['fulltext'], total=len(df)):
        try:
            doc = nlp(text)
            ners = [(ent.text, ent.label_) for ent in doc.ents]
        except Exception:
            # Catch Exception rather than everything so a manual interrupt
            # still stops the run.
            telegram_send.send(messages=["NER processing error"])
            completed_flags.append(False)
            ner_values.append('')
            continue

        completed_flags.append(True)
        ner_values.append(ners)

    # Build both columns once instead of writing cell by cell. Explicit dtypes
    # keep the columns well-typed even for an empty frame.
    return df.assign(
        completed=pd.Series(completed_flags, index=df.index, dtype=bool),
        ner=pd.Series(ner_values, index=df.index, dtype=object),
    )

In [5]:
# Process one publication year at a time: extract entities, write that year's
# output file, then update the progress record so finished years are skipped
# on the next run.
for year in range(df.year.min(), df.year.max()+1):
    if df_record.loc[year, 'completed']==True:
        continue

    sys.stdout.write(f'\r Working on year {year}')
    _df = (df.query('year==@year')
           # Keep the original row label as column `oix`
           .rename_axis('oix')
           .reset_index()
          )
    _df = get_ner(_df)
    _df.to_csv(f'ner/toi_ner_{year}.csv.gz', index=False, compression='gzip')

    # Flag the year BEFORE saving the record. Saving first left the file on
    # disk one year behind and never recorded the final year.
    df_record.loc[year, 'completed'] = True
    df_record.to_csv('toi_ner_record.csv')
    telegram_send.send(messages=[f"NER for {year} completed"])

    # Release the per-year frame before loading the next one.
    del _df
    gc.collect()

 Working on year 1931

0it [00:00, ?it/s]

 Working on year 1932

0it [00:00, ?it/s]

 Working on year 1933

0it [00:00, ?it/s]

 Working on year 1934

0it [00:00, ?it/s]

 Working on year 1935

0it [00:00, ?it/s]

 Working on year 1936

0it [00:00, ?it/s]

 Working on year 1937

0it [00:00, ?it/s]

 Working on year 1938

0it [00:00, ?it/s]

 Working on year 1939

0it [00:00, ?it/s]

 Working on year 1940

0it [00:00, ?it/s]

 Working on year 1941

0it [00:00, ?it/s]

 Working on year 1942

0it [00:00, ?it/s]

 Working on year 1943

0it [00:00, ?it/s]

 Working on year 1944

0it [00:00, ?it/s]

 Working on year 1945

0it [00:00, ?it/s]

 Working on year 1946

0it [00:00, ?it/s]

 Working on year 1947

0it [00:00, ?it/s]

 Working on year 1948

0it [00:00, ?it/s]

 Working on year 1949

0it [00:00, ?it/s]

 Working on year 1950

0it [00:00, ?it/s]

 Working on year 1951

0it [00:00, ?it/s]

 Working on year 1952

0it [00:00, ?it/s]

 Working on year 1953

0it [00:00, ?it/s]

 Working on year 1954

0it [00:00, ?it/s]

 Working on year 1955

0it [00:00, ?it/s]

 Working on year 1956

0it [00:00, ?it/s]

 Working on year 1957

0it [00:00, ?it/s]

 Working on year 1958

0it [00:00, ?it/s]

 Working on year 1959

0it [00:00, ?it/s]

 Working on year 1960

0it [00:00, ?it/s]

 Working on year 1961

0it [00:00, ?it/s]

 Working on year 1962

0it [00:00, ?it/s]

 Working on year 1963

0it [00:00, ?it/s]

 Working on year 1964

0it [00:00, ?it/s]

 Working on year 1965

0it [00:00, ?it/s]

 Working on year 1966

0it [00:00, ?it/s]

 Working on year 1967

0it [00:00, ?it/s]

 Working on year 1968

0it [00:00, ?it/s]

 Working on year 1969

0it [00:00, ?it/s]

 Working on year 1970

0it [00:00, ?it/s]

 Working on year 1971

0it [00:00, ?it/s]

 Working on year 1972

0it [00:00, ?it/s]

 Working on year 1973

0it [00:00, ?it/s]

 Working on year 1974

0it [00:00, ?it/s]

 Working on year 1975

0it [00:00, ?it/s]

 Working on year 1976

0it [00:00, ?it/s]

 Working on year 1977

0it [00:00, ?it/s]

 Working on year 1978

0it [00:00, ?it/s]

 Working on year 1979

0it [00:00, ?it/s]

 Working on year 1980

0it [00:00, ?it/s]

 Working on year 1981

0it [00:00, ?it/s]

 Working on year 1982

0it [00:00, ?it/s]

 Working on year 1983

0it [00:00, ?it/s]

 Working on year 1984

0it [00:00, ?it/s]

 Working on year 1985

0it [00:00, ?it/s]

 Working on year 1986

0it [00:00, ?it/s]

 Working on year 1987

0it [00:00, ?it/s]

 Working on year 1988

0it [00:00, ?it/s]

 Working on year 1989

0it [00:00, ?it/s]

 Working on year 1990

0it [00:00, ?it/s]

 Working on year 1991

0it [00:00, ?it/s]

 Working on year 1992

0it [00:00, ?it/s]

 Working on year 1993

0it [00:00, ?it/s]

 Working on year 1994

0it [00:00, ?it/s]

 Working on year 1995

0it [00:00, ?it/s]

 Working on year 1996

0it [00:00, ?it/s]

 Working on year 1997

0it [00:00, ?it/s]

 Working on year 1998

0it [00:00, ?it/s]

 Working on year 1999

0it [00:00, ?it/s]

 Working on year 2000

0it [00:00, ?it/s]

 Working on year 2001

0it [00:00, ?it/s]

 Working on year 2002

0it [00:00, ?it/s]

 Working on year 2003

0it [00:00, ?it/s]

 Working on year 2004

0it [00:00, ?it/s]

 Working on year 2005

0it [00:00, ?it/s]

 Working on year 2006

0it [00:00, ?it/s]

 Working on year 2007

0it [00:00, ?it/s]

 Working on year 2008

0it [00:00, ?it/s]