In [44]:
import pandas as pd
import janitor

from data_utils import (
    fix_separated_words,
    clean_transcripts,
    flesch, fog, smog, # readability
    terms, uniqueterms, # word counters
    ttr, mtld, hdd # lexical richness
)

In [45]:
# Load the Times of India sample, drop advertisements and rows without text,
# run fix_separated_words and clean_transcripts over the text, then derive the
# readability, word-count and lexical-richness columns with the data_utils helpers.
# NOTE(review): the rendered flesch/smog values are far outside their usual range
# and fog is all NaN -- presumably the text has no sentence punctuation; confirm
# before relying on the readability columns.
df = (pd.read_csv('../data/1000-toi-sample.csv.gzip', compression='gzip')
      .clean_names()
      # Remove ads. na=False treats a missing objecttype as "not an advert" so
      # NaN never reaches the boolean mask and the `~` negation cannot fail.
      .query('~objecttype.str.contains("advert", case=False, na=False)')
      # A list for `subset` works on every pandas version.
      .dropna(subset=['fulltext'])
      # `frame` (not `df`) avoids shadowing the notebook-level name.
      .assign(fulltext=lambda frame: frame.fulltext.apply(fix_separated_words))
      .assign(
          # Callables inside one assign() are evaluated in order, so every
          # metric below is computed on the already-cleaned fulltext.
          fulltext=lambda frame: frame.fulltext.apply(clean_transcripts),
          # Compute readability
          flesch_reading_ease=lambda frame: frame.fulltext.apply(flesch),
          fog=lambda frame: frame.fulltext.apply(fog),
          smog=lambda frame: frame.fulltext.apply(smog),
          terms=lambda frame: frame.fulltext.apply(terms),
          # Compute lexical richness for text
          uniqueterms=lambda frame: frame.fulltext.apply(uniqueterms),
          ttr=lambda frame: frame.fulltext.apply(ttr),
          mtld=lambda frame: frame.fulltext.apply(mtld),
          hdd=lambda frame: frame.fulltext.apply(hdd),
      )
     )
df

Unnamed: 0,sourcetype,publication_publicationid,abstract,contributor_personname,contributor_lastname,urldocview,datetimestamp,publication_qualifier,alphapubdate,actioncode,...,numericpubdate,cleanedfulltext,flesch_reading_ease,fog,smog,terms,uniqueterms,ttr,mtld,hdd
1,Historical Newspapers,54644,,,,http://search.proquest.com/docview/739304513/,20170928174009,"New Delhi, India","Jul 24, 1983",change,...,19830724,hearing of staff union suit against bse by sta...,-225.666516,,121.768641,287.0,152,0.529617,59.071014,0.798904
2,Historical Newspapers,54644,,,,http://search.proquest.com/docview/346635909/,20170928165334,"New Delhi, India","Apr 28, 1958",change,...,19580428,bombay city suburbs fisherman kills wife daugh...,-288.851000,,147.314286,350.0,187,0.535817,76.268112,0.827223
3,Historical Newspapers,54644,Mumbai: The high court passed an order last ye...,,,http://search.proquest.com/docview/1809804914/,20170928180033,"New Delhi, India","Jun 24, 2007",change,...,20070624,parents school in faceoff over dyslexic' child...,-171.130610,,103.684553,492.0,239,0.481855,106.574187,0.849251
5,Historical Newspapers,54644,,,,http://search.proquest.com/docview/609122232/,20170928172217,"New Delhi, India","Oct 9, 1926",change,...,19261009,absconding driver hindu knocked by car and kil...,-144.088356,,92.714155,219.0,127,0.579909,86.408413,0.827451
6,Historical Newspapers,54644,"Lieutenant D.G. Bryce, Deputy Assistant Commis...",,,http://search.proquest.com/docview/234144570/,20170928163332,"New Delhi, India","Jun 30, 1898",change,...,18980630,military intelligence lieutenant . bryce deput...,,,,22.0,22,1.000000,22.000000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
994,Historical Newspapers,54694,WE subjoin a letter on a subject of late frequ...,,,http://search.proquest.com/docview/233847515/,20170928171835,"Bombay, India","Aug 23, 1856",change,...,18560823,wb subjoin a letter on u subjeot of late frequ...,47.027813,,17.533929,384.0,153,0.399478,104.856530,0.830109
996,Historical Newspapers,54644,,,,http://search.proquest.com/docview/507137117/,20170928170628,"New Delhi, India","Nov 8, 1928",change,...,19281108,republican control government of united states...,-161.126743,,95.823853,218.0,119,0.545872,58.532872,0.776315
997,Historical Newspapers,54644,While the government is engaged in a detailed ...,,,http://search.proquest.com/docview/608355385/,20170928172049,"New Delhi, India","Mar 19, 1994",change,...,19940319,pak studying us n proposal by mb naqvi the tim...,-650.195793,,289.708782,706.0,347,0.491501,120.301449,0.853534
998,Historical Newspapers,54644,The Lord Biahop or Madras who is at present on...,,,http://search.proquest.com/docview/310889027/,20170928163930,"New Delhi, India","Mar 15, 1929",change,...,19290315,union of churches in south india address by ma...,-360.414907,,177.274766,428.0,231,0.540984,92.221114,0.828066


In [47]:
# Persist the processed frame (index included) as a gzip-compressed CSV.
df.to_csv('../data/_toi_textstat.csv.gz', compression="gzip")

In [80]:
# List the column names of the processed frame.
df.columns

Index(['sourcetype', 'publication_publicationid', 'abstract',
       'contributor_personname', 'contributor_lastname', 'urldocview',
       'datetimestamp', 'publication_qualifier', 'alphapubdate', 'actioncode',
       'contributor', 'contributor_persontitle', 'startpage', 'recordid',
       'products_product', 'version', 'recordtitle', 'contributor_contribrole',
       'contributor_firstname', 'contributor_namesuffix',
       'contributor_middlename', 'fulltext', 'objecttype',
       'contributor_organizationname', 'publication_title', 'publisher',
       'contributor_originalform', 'languagecode', 'numericpubdate',
       'cleanedfulltext', 'flesch_reading_ease', 'fog', 'smog', 'terms',
       'uniqueterms', 'ttr', 'mtld', 'hdd'],
      dtype='object')

In [78]:
# Look at the full text stored under index label 24.
a = df['fulltext'].loc[24]
a

"FACE OFF IN STYLE Forget facelifts and other cosmetic procedures All it takes is little exercise to get that perfect face BTgives you few tips on facial fitness face eing beautiful on the outside is all about keeping the face young and looking healthy We know that the absence of wrinkles and sags as well as having taut toned smooth and glowing skin are the signs of youthful and healthy beauty Some go through so much just to maintain the vigour and agelessness of their faces so much that they would spend fortune on face lifts and other procedures that would keep them looking as close to 18 years old as possible There is however an inexpensive way of retaining youthful beauty and that is through observing regular facial fitness routine that is exercising the muscles of your face Hollywood stars like Sharon Stone swear by their daily facial fitness routine The idea behind more quickly Maintaining regular facial fitness routine is easy It only takes few minutes of your time and it can be 

In [79]:
# Look at the objecttype stored under index label 18
# (the value is a string that looks like a Python list).
a = df['objecttype'].loc[18]
a

"['Feature', 'Article']"

In [11]:
# Build an autocorrect Speller with its default arguments; `spell` is called on
# a sample string further down.
from autocorrect import Speller
spell = Speller()


In [22]:
# Sample fragment: a word broken by a hyphen followed by two spaces.
b = 'in different units particular-  ly'
b

'in different units particular-  ly'

In [23]:
import re

# Squash every run of whitespace in `b` down to one space, then trim both ends.
re_extra_whitespaces = re.compile(r"\s+")
text = re_extra_whitespaces.sub(" ", b)
text = text.strip()
text

'in different units particular- ly'

In [28]:
# Run fix_separated_words on the text stored under index label 1 and show the result.
a = fix_separated_words(df['fulltext'].loc[1])
a

'Hearing of staff union suit against BSE By Staff Reporter BOMBAY July 23 The statement by Mr Laldas Jamnadas president of the Bombay Stock Exchange published in the issue of July 21 that the Bombay Share Bazar Staff Union has moved vthe City Civil Court challenging the issue of gate passes to share broken by the stock exchange authorities and that the court had ruled that the stock exchange building being the property of the stock exchange the authorities of the stock exchange have right to refuse entry to any one particular at time when share worth crores of rupees are to be delivered by the member brokers was erroneously given The error is regretted Actually the Bombay City Civil Court passed an ad-interim injunction on July 21 restraining the stock exchange from barring employees of share brokers from entering the building to perform their duties The ad-interim injunction was granted pending the hearing and final disposal of the notice of motion moved by the members of the union Th

In [16]:
# Delete every "hyphen + single space" pair from `b`.
# NOTE(review): if `b` still has two spaces after the hyphen, one space is left
# behind ("particular ly"); the output shown presumably came from a single-space
# value of `b` -- confirm, or collapse whitespace first.
b.replace('- ', '')

'in different units particularly'

In [14]:
# Check what the speller does with a hyphen-split word; the output shown
# returns the string unchanged.
spell('in different units particular- ly')

'in different units particular- ly'