In [1]:
import os
import pandas as pd
import textstat
from utils import uniqueterms, ttr, mtld, hdd

# Path to the zipped news-dialogue JSON that the next cell loads with pd.read_json.
# Alternative location, currently disabled:
# FP_NPR = '../../readable_news_lf/news_dialogue.zip'
FP_NPR = 'raw_data/news_dialogue.zip'

In [2]:
# Load the transcripts, build one text string per row and score it.
# Each (column, scorer) pair below becomes one new column; the list order
# determines the column order in the resulting frame.
df = (
    pd.read_json(FP_NPR)
    # NaN never equals itself, so this keeps only rows where `utt` is present.
    .query('utt==utt')
    .reset_index(drop=True)
    # `utt` is joined with single spaces into one string per row.
    .assign(text=lambda frame: frame.utt.map(' '.join))
    .assign(**{
        # `scorer=scorer` binds the current function at definition time,
        # avoiding the late-binding pitfall of closures created in a loop.
        column: (lambda frame, scorer=scorer: frame.text.apply(scorer))
        for column, scorer in [
            # Readability scores
            ('flesch_reading_ease', textstat.flesch_reading_ease),
            ('flesch_kincaid_grade', textstat.flesch_kincaid_grade),
            ('fog', textstat.gunning_fog),
            ('smog', textstat.smog_index),
            ('terms', textstat.lexicon_count),
            # Lexical richness scores
            ('uniqueterms', uniqueterms),
            ('ttr', ttr),
            ('mtld', mtld),
            ('hdd', hdd),
        ]
    })
)
df.head(3)

Unnamed: 0,id,program,date,url,title,summary,utt,speaker,text,flesch_reading_ease,flesch_kincaid_grade,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,NPR-1,News & Notes,2007-11-28,https://www.npr.org/templates/story/story.php?...,Black Actors Give Bible Star Appeal,"More than 400 black actors, artists and minist...","[Now, moving on, Forest Whitaker as Moses, Tis...","[FARAI CHIDEYA, host, FARAI CHIDEYA, host, Mr....","Now, moving on, Forest Whitaker as Moses, Tish...",73.98,6.5,7.66,9.5,1417,479,0.327186,58.140244,0.831717
1,NPR-2,Weekend Edition Sunday,2016-10-23,https://www.npr.org/2016/10/23/499042298/young...,"Young, First-Time Voters Share Views On Electi...",NPR's Rachel Martin speaks with young voters w...,[You have heard it again and again - this is a...,"[RACHEL MARTIN, HOST, ASHANTI MARTINEZ, LAUREN...",You have heard it again and again - this is an...,75.5,5.9,6.67,9.1,1914,508,0.252988,69.832405,0.846484
2,NPR-3,News & Notes,2007-11-30,https://www.npr.org/templates/story/story.php?...,Snapshots: On Solid Ground,"In this week's snapshot, actor and playwright ...","[I came close to running out of luck, when I a...","[Mr. JEFF OBAFEMI CARR (Actor, Playwright), CH...","I came close to running out of luck, when I al...",79.19,6.5,8.91,8.8,696,358,0.507082,110.83306,0.859794


In [5]:
# Persist the per-row metadata and scores as a gzip-compressed CSV.
# The utterance list (`utt`) and the joined `text` are dropped before writing,
# so only the remaining columns end up in the file.
# to_csv does not create missing folders, so make sure the target directory
# exists; otherwise a fresh checkout fails here with an OSError.
os.makedirs('data', exist_ok=True)
(df
 .drop(columns=['text', 'utt'])
 .to_csv('data/npr_cnn_textstat.csv.gz', index=False, compression="gzip")
)