In [None]:
# !pip install textstat
# !pip install lexicalrichness

In [None]:
import pandas as pd
# Allow up to 50 columns to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 50
import numpy as np
import warnings
# Install a blanket 'ignore' filter so warnings raised after this point are not shown.
warnings.filterwarnings('ignore')
import textstat  # readability scores used in the metrics pipeline
from lexicalrichness import LexicalRichness  # lexical-richness measures used by the helpers below

def uniqueterms(text):
    """Return the ``terms`` value that ``LexicalRichness`` reports for ``text``.

    Parameters
    ----------
    text : str
        Document to analyse.

    Returns
    -------
    The ``terms`` attribute of ``LexicalRichness(text)`` (the unique-term
    count, per the lexicalrichness documentation).
    """
    return LexicalRichness(text).terms

def ttr(text):
    """Type-token ratio of ``text``, or ``None`` for very short input.

    Parameters
    ----------
    text : str
        Document to analyse.

    Returns
    -------
    The ``ttr`` attribute of ``LexicalRichness(text)`` when the text has more
    than one word; ``None`` when it has zero or one word.
    """
    richness = LexicalRichness(text)
    # Texts of at most one word are reported as missing rather than scored.
    if richness.words <= 1:
        return None
    return richness.ttr

def mtld(text, threshold=0.72):
    """MTLD (measure of textual lexical diversity) of ``text``.

    Parameters
    ----------
    text : str
        Document to analyse.
    threshold : float, default 0.72
        Factor threshold forwarded to ``LexicalRichness.mtld``. The default
        reproduces the value that used to be hard-coded here, so existing
        single-argument calls (e.g. ``Series.apply(mtld)``) are unaffected.

    Returns
    -------
    The value of ``LexicalRichness(text).mtld(threshold=threshold)`` when the
    text has more than one word; ``None`` when it has zero or one word.
    """
    lex = LexicalRichness(text)
    # Texts of at most one word are reported as missing rather than scored.
    if lex.words <= 1:
        return None
    return lex.mtld(threshold=threshold)

def hdd(text, draws=42):
    """HD-D lexical diversity of ``text``, or ``None`` when the text is too short.

    Parameters
    ----------
    text : str
        Document to analyse.
    draws : int, default 42
        Sample size forwarded to ``LexicalRichness.hdd``. The same value sets
        the minimum-length guard, so the two can no longer drift apart. The
        default reproduces the previously hard-coded 42, so existing
        single-argument calls (e.g. ``Series.apply(hdd)``) are unaffected.

    Returns
    -------
    The value of ``LexicalRichness(text).hdd(draws=draws)`` when the text has
    strictly more than ``draws`` words; otherwise ``None``.
    """
    lex = LexicalRichness(text)
    # The guard is strict: a text with exactly `draws` words also yields None,
    # matching the original `words > 42` check.
    if lex.words <= draws:
        return None
    return lex.hdd(draws=draws)

# Path of the NYT input file that pd.read_csv loads below.
# Alternate location (currently disabled): FP_NYT = '../readable_news_lf/nyt.csv'
FP_NYT = 'raw_data/nyt.zip'

In [None]:
# Only these columns are loaded; 'Headline' and 'Lead Paragraph' are deliberately left out.
_usecols = ['Body', 'News Desk', 'Online Section', 'Publication Year']

# Output column name -> scorer applied to each article body.
# Dict order is the order in which the metric columns are appended.
_body_scorers = {
    # Readability
    'flesch_reading_ease': textstat.flesch_reading_ease,
    'flesch_kincaid_grade': textstat.flesch_kincaid_grade,
    'fog': textstat.gunning_fog,
    'smog': textstat.smog_index,
    # Lexical richness
    'uniqueterms': uniqueterms,
    'ttr': ttr,
    'mtld': mtld,
    'hdd': hdd,
}

df_nyt = (
    pd.read_csv(FP_NYT, usecols=_usecols)
    .rename(columns={'Publication Year': 'year'})
    # Discard rows with a missing year or a missing body.
    .dropna(subset=['year', 'Body'])
    .assign(Body=lambda frame: frame['Body'].str.strip())
    # Discard bodies that are empty once surrounding whitespace is removed.
    .loc[lambda frame: frame['Body'] != '']
    .assign(**{
        # Bind `scorer` as a default argument so each lambda keeps its own function.
        column: (lambda frame, scorer=scorer: frame['Body'].apply(scorer))
        for column, scorer in _body_scorers.items()
    })
)

# Memory footprint in GiB, if needed: df_nyt.memory_usage(deep=True).sum() / 1073741824
df_nyt.head(3)

In [None]:
# Persist the metrics only: the article text is dropped before writing the gzip-compressed CSV.
df_nyt.drop(columns=['Body']).to_csv('nyt_1987_2007_textstat.csv.gz', index=False, compression="gzip")