In [1]:
import os
import pandas as pd
from glob import glob
from data_utils import (
    clean_transcripts,
    flesch,
    fog,
    smog,
    terms,
    uniqueterms, 
    ttr, 
    mtld, 
    hdd
)
# Input locations for the raw transcript dumps (relative to the notebook's
# working directory).
# Disabled CNN input path; nothing in the visible cells reads it.
# FP_CNN = '../../readable_news_lf/cnn_cc/'
# NOTE(review): the two file names are not formatted alike ("msnbc--2003" vs
# "msnbc-2010"), and the year ranges in the names overlap (2010-2014) --
# presumably some shows appear in both files; verify before combining outputs.
FP_MSNBC1 = 'raw_data/msnbc/msnbc--2003--2014.csv.gz'
FP_MSNBC2 = 'raw_data/msnbc/msnbc-2010--2021.csv.gz'

In [2]:
# Load the 2003-2014 MSNBC file, keep only rows that have a transcript, clean
# the transcript text, and add one column per text metric.
# NOTE(review): the rendered preview of this cell still shows residue such as
# "b'n nn" in the cleaned Content -- confirm clean_transcripts handles escaped
# newlines / bytes reprs as intended.
df1 = (
    pd.read_csv(FP_MSNBC1)
    # NaN never equals itself, so this self-comparison drops missing transcripts.
    .query('Content==Content')
    .reset_index(drop=True)
    # Clean first so that every metric below is computed on the cleaned text.
    .assign(Content=lambda raw: raw['Content'].apply(clean_transcripts))
    .pipe(
        lambda cleaned: cleaned.assign(
            **{
                column: cleaned['Content'].apply(metric)
                for column, metric in (
                    # Readability
                    ('flesch_reading_ease', flesch),
                    ('fog', fog),
                    ('smog', smog),
                    ('terms', terms),
                    # Lexical richness
                    ('uniqueterms', uniqueterms),
                    ('ttr', ttr),
                    ('mtld', mtld),
                    ('hdd', hdd),
                )
            }
        )
    )
)
df1.head(3)

Unnamed: 0,cite,Source,Date,Show,Author,Location(s),Dateline,Section,Index Terms,Record Number,...,Estimated printed pages,Content,flesch_reading_ease,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,"MSNBC , Hardball for September 24, 2003",MSNBC,"Tuesday, September 24, 2002",HARDBALL,Chris MatthewsDavid Shuster,,,"News, International",Terrorism ; Iraq ; Politics ; War ; Elections ...,0FE8AF52AF84A531,...,26,i`m chris matthews. let`s play hardball.n nn t...,71.802736,12.968742,10.162546,7379,1545,0.21147,76.557372,0.867911
1,"COUNTDOWN: IRAQ for March 31, 2003",MSNBC,"Monday, March 31, 2003",COUNTDOWN: IRAQ,Keith OlbermannCarl RochelleJohn IrvineKerry S...,,,"News, International",Iraq ; War ; United States ; United Nations ; ...,0FE5A1D810A218B3,...,24,"p.m. in the east, a.m. in baghdad, fighting co...",56.027565,12.340627,14.244171,7704,1967,0.25699,106.771874,0.879302
2,"HARDBALL for March 31, 2003",MSNBC,"Monday, March 31, 2003",HARDBALL,Chris MatthewsJohn WardenCarl RochelleJohn Irv...,,,"News, International",War ; Military ; Iraq ; SHOW,0FE5A1CEDF3C105E,...,40,"b'n nn george w. by acting today, we are savin...",60.810621,13.714394,13.320208,12196,2334,0.192305,92.143945,0.876826


In [3]:
# Persist the per-transcript metrics without the (large) transcript text.
# NOTE(review): the row index is written too (to_csv default), and the frame
# already carries an 'Unnamed: 0' column from the input file -- confirm whether
# downstream readers rely on that layout before switching to index=False.
df1.drop(columns=['Content']).to_csv(
    'data/msnbc2003_2014_textstat.csv.gz',
    compression="gzip",
)

In [2]:
%%time
# Load the 2010-2021 MSNBC file, keep only rows that have a transcript, clean
# the transcript text, and add one column per text metric.
# The recorded %%time output for this cell reports roughly 3.5 hours of wall
# time, so avoid re-running it casually.
df2 = (
    pd.read_csv(FP_MSNBC2)
    # NaN never equals itself, so this self-comparison drops missing transcripts.
    .query('text==text')
    .reset_index(drop=True)
    # Clean first so that every metric below is computed on the cleaned text.
    .assign(text=lambda raw: raw['text'].apply(clean_transcripts))
    .pipe(
        lambda cleaned: cleaned.assign(
            **{
                column: cleaned['text'].apply(metric)
                for column, metric in (
                    # Readability
                    ('flesch_reading_ease', flesch),
                    ('fog', fog),
                    ('smog', smog),
                    ('terms', terms),
                    # Lexical richness
                    ('uniqueterms', uniqueterms),
                    ('ttr', ttr),
                    ('mtld', mtld),
                    ('hdd', hdd),
                )
            }
        )
    )
)
df2.head(3)

CPU times: user 3h 33min 26s, sys: 5.48 s, total: 3h 33min 31s
Wall time: 3h 33min 31s


Unnamed: 0,air_date,show_name,headline,guests,url,channel.name,program.name,uid,duration,year,...,summary,text,flesch_reading_ease,fog,smog,terms,uniqueterms,ttr,mtld,hdd
0,May 28 2010,The Rachel Maddow Show,"The Rachel Maddow Show, Transcript 05/27/10","Guests: Rep. Charlie Melancon, Linda Tollner, ...",https://www.msnbc.com/transcripts/rachel-maddo...,MSNBC,"The Rachel Maddow Show, Transcript 05/27/10",,,2010,...,,"and now, to discuss her latest “geek week” ext...",78.872488,9.210783,8.817932,7937,1455,0.199561,58.260061,0.865781
1,Jun 02 2010,The Rachel Maddow Show,"The Rachel Maddow Show, Transcript 06/01/10",Guests: Chris Hayes,https://www.msnbc.com/transcripts/rachel-maddo...,MSNBC,"The Rachel Maddow Show, Transcript 06/01/10",,,2010,...,,"good evening, keith. thank you very much for t...",67.389709,9.3871,11.496674,7729,1653,0.223771,78.955855,0.87155
2,Jun 03 2010,The Rachel Maddow Show,"The Rachel Maddow Show, Transcript 06/02/10","Guests: Sen. Barbara Boxer, David Muth, Larry ...",https://www.msnbc.com/transcripts/rachel-maddo...,MSNBC,"The Rachel Maddow Show, Transcript 06/02/10",,,2010,...,,"and now, for the very latest from the gulf coa...",72.9698,8.745827,10.236856,7568,1492,0.209404,67.550437,0.868869


In [3]:
# Persist the per-transcript metrics without the (large) transcript text.
# NOTE(review): the row index is written too (to_csv default), and the frame
# already carries an 'Unnamed: 0' column from the input file -- confirm whether
# downstream readers rely on that layout before switching to index=False.
df2.drop(columns=['text']).to_csv(
    'data/msnbc2010_2021_textstat.csv.gz',
    compression="gzip",
)