This notebook summarizes the inspection text for the SAS Viya dashboard. Here I applied a TextRank-style extractive approach (sentences are scored by word frequency) to extract the important sentences from a long narrative.

### Import Packages

In [None]:
#import packages
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize, sent_tokenize


In [None]:
ls

all_inspection.csv                 text2567_20200901_cms_reg4.xlsx
all_inspection_report.csv          text2567_20200901_cms_reg5a.csv
text2567_20200901_cms_reg10.csv    text2567_20200901_cms_reg5a.xlsx
text2567_20200901_cms_reg10.xlsx   text2567_20200901_cms_reg5b.csv
text2567_20200901_cms_reg1.csv     text2567_20200901_cms_reg5b.xlsx
text2567_20200901_cms_reg1.gsheet  text2567_20200901_cms_reg6.csv
text2567_20200901_cms_reg1.xlsx    text2567_20200901_cms_reg6.xlsx
text2567_20200901_cms_reg2.csv     text2567_20200901_cms_reg7.csv
text2567_20200901_cms_reg2.gsheet  text2567_20200901_cms_reg7.xlsx
text2567_20200901_cms_reg2.xlsx    text2567_20200901_cms_reg8.csv
text2567_20200901_cms_reg3.csv     text2567_20200901_cms_reg8.xlsx
text2567_20200901_cms_reg3.xlsx    text2567_20200901_cms_reg9.csv
text2567_20200901_cms_reg4.csv     text2567_20200901_cms_reg9.xlsx


### Load Data

In [None]:
# There are 11 regional datasets and the text of every one of them has to be processed.
# This cell loads a single workbook (region 1 here); the matching output name is set
# in the cell that writes the processed CSV.
df = pd.read_excel('text2567_20200901_cms_reg1.xlsx')

In [None]:
# Lookup table used to add deficiency-category features to the dataset.
# The workbook has one column whose header is the source note itself; remove it.
central_data = pd.read_excel('/content/drive/My Drive/central_data.xlsx')
central_data = central_data.drop(
    columns='Note: data may not contain ALL deficiencies. Source website: https://data.medicare.gov/Nursing-Home-Compare/Health-Deficiencies/r5ix-sfxw/data'
)

In [None]:
# Left-join the lookup table onto the report by deficiency tag, then remove the
# lookup's own key column from the joined frame.
inspection_report = df.merge(
    central_data,
    how='left',
    left_on='deficiency_tag',
    right_on='Deficiency Tag Number',
).drop(columns='Deficiency Tag Number')

In [None]:
# Drop rows that have a missing value in any of the original report columns, starting
# from the merged frame so the columns added by the merge are kept. (Filtering `df`
# here instead would throw the merge result away.) Rows without a match in the lookup
# table are kept; their merged columns stay empty.
# `.copy()` makes the result an independent frame, so the column assignments in the
# following cells do not raise SettingWithCopyWarning.
inspection_report = inspection_report.dropna(
    subset=[col for col in df.columns if col in inspection_report.columns]
).copy()

### Process Data

In [None]:
# Remove useless leading text
def remove_pre_sentence(text):
    """Strip the leading ``**...** `` marker and a leading ``>`` from each narrative.

    Parameters
    ----------
    text : pandas.Series
        Series of narrative strings.

    Returns
    -------
    pandas.Series
        A new Series of the same length. For every string value:

        * if it starts with ``**`` and contains ``'** '``, everything up to and
          including the first ``'** '`` is removed (later ``'** '`` occurrences in
          the narrative are left alone);
        * if it starts with ``**`` but has no ``'** '``, it is kept unchanged
          instead of raising ``IndexError``;
        * after that, a single leading ``>`` is removed.

        Non-string values are returned unchanged.
    """
    def _strip_prefix(value):
        if not isinstance(value, str):
            return value
        if value.startswith("**"):
            # partition() cuts at the first separator only, so the remainder of
            # the narrative is preserved even if it contains '** ' again.
            _, separator, remainder = value.partition('** ')
            if separator:
                value = remainder
        if value.startswith(">"):
            value = value[1:]
        return value

    return text.apply(_strip_prefix)

In [None]:
# Clean the leading marker text of every narrative, then preview the first rows.
inspection_report['inspection_text'] = inspection_report['inspection_text'].pipe(remove_pre_sentence)
inspection_report['inspection_text'].head(5)

0    <BR/>Based on clinical record review, review o...
1    <BR/>Based on observations and interviews, the...
2    <BR/>Based on observations, review of facility...
3    <BR/>Based on clinical record reviews, review ...
4    <BR/>Based on clinical record review of facili...
Name: inspection_text, dtype: object

In [None]:
# Show the first narrative in full. `.iloc[0]` selects by position, so this still
# works when the row labelled 0 has been removed by the earlier `dropna`.
inspection_report['inspection_text'].iloc[0]

"<BR/>Based on clinical record review, review of facility documentation and interviews for one sampled resident (Resident #347) who had reported a grievance regarding mistreatment, the facility failed to conduct a thorough investigation and/or report an allegation of neglect to the state agency. The finding include:<BR/>Resident #347's [DIAGNOSES REDACTED]. <BR/>The Resident Care Plan (RCP) dated 1/13/17 identified the resident with lung disease. Interventions directed to obtain pulse oximetry every shift and when needed, notify physician if oxygen saturation is below 90%, to administer oxygen as ordered and/or directed to observe for worsening Shortness of Breath (SOB), and to notify the physician of any unrelieved or new SOB at rest. <BR/>The 14 day Minimum Data Set (MDS) assessment dated [DATE] identified Resident #347 was noted with intact cognition, required extensive assistance with Activities of Daily Living (ADL) and indicated the received oxygen therapy. <BR/>The physician's o

In [None]:
# Remove html tags
def remove_html_tags(text):
    """Return the narratives with HTML markup removed.

    Parameters
    ----------
    text : pandas.Series
        Series of strings that may contain HTML tags such as ``<BR/>``.

    Returns
    -------
    pandas.Series
        A new Series holding only the text content of each value.

    Notes
    -----
    The text fragments on either side of a tag are joined with a single space.
    Without a separator, ``"include:<BR/>Resident"`` collapses to
    ``"include:Resident"``, which fuses words and sentences and hurts the
    sentence tokenization done later in the notebook.
    """
    return text.apply(
        lambda markup: BeautifulSoup(markup, 'html.parser').get_text(separator=' ')
    )

In [None]:
# Strip the HTML markup from every narrative, then preview the first rows.
inspection_report['inspection_text'] = inspection_report['inspection_text'].pipe(remove_html_tags)
inspection_report['inspection_text'].head(5)

0    Based on clinical record review, review of fac...
1    Based on observations and interviews, the faci...
2    Based on observations, review of facility docu...
3    Based on clinical record reviews, review of fa...
4    Based on clinical record review of facility do...
Name: inspection_text, dtype: object

In [None]:
# Remove special symbols
def remove_symbol(text):
  """Remove special symbols from a single narrative string.

  The text is split on the space character only. For each piece:

  * the characters ``@``, ``#``, ``<`` and ``>`` are deleted;
  * a piece that contains ``*`` is dropped entirely;
  * a piece that ends up empty is dropped (this also collapses runs of spaces).

  Args:
    text: the narrative as a ``str``.

  Returns:
    The kept pieces joined by single spaces, with leading and trailing
    whitespace stripped.
  """
  kept = []
  for word in text.split(' '):
    # '+' instead of '*' so the pattern never matches the empty string.
    word = re.sub('[@#<>]+', '', word)
    if not word or '*' in word:
      continue
    kept.append(word)
  # Collect then join instead of growing a string piece by piece.
  return ' '.join(kept).strip()

In [None]:
# Apply the symbol clean-up to every narrative, then preview the first rows.
inspection_report['inspection_text'] = inspection_report['inspection_text'].apply(remove_symbol)
inspection_report['inspection_text'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


0    Based on clinical record review, review of fac...
1    A large reddish brown area was also noted on f...
2    Based on observations, review of facility docu...
3    Based on clinical record reviews, review of fa...
4    The quarterly MDS assessment dated [DATE] iden...
Name: inspection_text, dtype: object

### Summarize Data

In [None]:
# Frequency-based extractive summarizer
def summery(text):
  """Return an extractive summary built from the highest-scoring sentences of ``text``.

  How it works:

  1. Every token of ``text`` is lower-cased and counted, skipping English stopwords.
  2. Each sentence is scored with the sum of the counts of all counted tokens that
     occur in the lower-cased sentence. The check is a substring test, so a token
     also matches inside longer words. A sentence that occurs more than once in
     the text accumulates its score once per occurrence.
  3. Sentences whose score is strictly greater than the (truncated) average score
     are kept, in their original order, joined by single spaces.

  Args:
    text: the narrative as a ``str``.

  Returns:
    The summary string. If no sentence can be scored, or no sentence scores above
    the average (for example a single-sentence text), ``text`` is returned unchanged
    so the narrative is never replaced by an empty string.

  Requires the NLTK ``stopwords`` and ``punkt`` resources.
  """
  stopWords = set(stopwords.words("english"))

  # Frequency table: lower-cased token -> number of occurrences (stopwords skipped).
  freqTable = dict()
  for word in word_tokenize(text):
    word = word.lower()
    if word in stopWords:
      continue
    freqTable[word] = freqTable.get(word, 0) + 1

  # Score table: sentence -> accumulated frequency of the tokens found in it.
  sentences = sent_tokenize(text)
  sentenceValue = dict()
  for sentence in sentences:
    lowered = sentence.lower()  # lower-case once per sentence, not once per token
    score = sum(freq for word, freq in freqTable.items() if word in lowered)
    # Every count is at least 1, so a zero score means "no token matched";
    # such sentences get no entry in the score table.
    if score:
      sentenceValue[sentence] = sentenceValue.get(sentence, 0) + score

  # Nothing could be scored (e.g. empty text): avoid dividing by zero.
  if not sentenceValue:
    return text

  # Average score over the distinct scored sentences, truncated to an integer.
  average = int(sum(sentenceValue.values()) / len(sentenceValue))

  selected = [
      sentence for sentence in sentences
      if sentence in sentenceValue and sentenceValue[sentence] > average
  ]
  if not selected:
    return text
  return ' '.join(selected)


In [None]:
# Replace every narrative longer than SUMMARIZE_ABOVE_CHARS characters with its
# extractive summary; shorter narratives are kept as they are.
# The whole column is assigned at once: a chained `frame[col].iloc[i] = value` write
# may land on a temporary copy instead of `inspection_report`.
SUMMARIZE_ABOVE_CHARS = 2000
inspection_report['inspection_text'] = inspection_report['inspection_text'].apply(
    lambda narrative: summery(narrative) if len(narrative) > SUMMARIZE_ABOVE_CHARS else narrative
)

In [None]:
# Distribution of a rough sentence count per narrative: the number of '.'-separated
# pieces, i.e. the number of periods plus one.
inspection_report['inspection_text'].map(lambda narrative: narrative.count('.') + 1).describe()

count    15944.000000
mean        11.859885
std          9.397542
min          1.000000
25%          6.000000
50%         10.000000
75%         15.000000
max        281.000000
Name: inspection_text, dtype: float64

In [None]:
# Narratives that are still longer than TRUNCATE_ABOVE_CHARS characters are cut down
# to their first MAX_SENTENCES sentences; everything else is kept as it is.
# - The kept sentences are joined with a space so they do not run into each other.
# - No variable named `summery` is created here, so the summarizer function of that
#   name stays callable if earlier cells are re-run.
# - The whole column is assigned at once instead of chained `.iloc[i] = ...` writes.
TRUNCATE_ABOVE_CHARS = 2000
MAX_SENTENCES = 4
inspection_report['inspection_text'] = inspection_report['inspection_text'].apply(
    lambda narrative: ' '.join(sent_tokenize(narrative)[:MAX_SENTENCES])
    if len(narrative) > TRUNCATE_ABOVE_CHARS
    else narrative
)


In [None]:
# Write the processed dataset to CSV without the row index. The output name has to
# match the workbook that was loaded: for example, after processing
# 'text2567_20200901_cms_reg2.xlsx', name the csv 'text2567_20200901_cms_reg2.csv'.
inspection_report.to_csv('text2567_20200901_cms_reg1.csv',index = False)

### Combine Data

In [None]:
# Once all 11 datasets have been processed, concatenate their CSVs into one
# complete dataset. The region suffixes are listed once and every file is read by
# the same expression instead of eleven copy-pasted calls.
REGION_SUFFIXES = ['1', '2', '3', '4', '5a', '5b', '6', '7', '8', '9', '10']
regional_frames = [
    pd.read_csv('text2567_20200901_cms_reg{}.csv'.format(suffix))
    for suffix in REGION_SUFFIXES
]

# ignore_index gives the combined frame one continuous row index instead of
# repeating each file's 0..n-1 labels (the written CSV is the same either way,
# because the index is not saved).
data = pd.concat(regional_frames, axis=0, ignore_index=True)
data = data.drop('inspection_date', axis=1)
data.to_csv('all_inspection_report.csv', index=False)