In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 200

## for text summarization
import re
import heapq 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sumy.summarizers.text_rank import TextRankSummarizer
nltk.download('stopwords')
stopwords = stopwords.words('english')
nltk.download('punkt')


import sumy
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from lexrank import STOPWORDS, LexRank

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lilianacruzlopez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lilianacruzlopez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def add_titles(row):
    "add the titles to the text"
    row["text"].insert(0,row["title"])
    
def section_ids(df):
    ''' 
    input data frame with the following columns document, page, Class
    this will extract the section if once the row is tagged
    '''
    
    secID_in = []
    for doc in df["document"].unique():
        temp = df[df["document"]==doc]
        j = 1
        for i in range(len(temp)):
            if temp.iloc[i,2] == 1.0:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],0))
                j+=1
            elif temp.iloc[i,2] == 0.0:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],j))
            else:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],np.nan))

    sections = pd.DataFrame(secID_in, columns =["document","page","Class","secIDin"])
    return sections

def get_text(df_after_classification):
    # gets sections ID  
    sec_doc = section_ids(df_after_classification[["document", "page", "Class"]])

    # merges the sections ID with DataFrame
    data = pd.merge(df_after_classification,
                    sec_doc,left_index=True, right_index=True, on=["document","page", "Class"])
    
    text = data[["text","document","page","Class",
              "secIDin"]].groupby(["document","secIDin"])['text'].apply(list).reset_index()
    
    # extracts the text
    text = text[(text["secIDin"]!=0) & (text["secIDin"]!=1)] 
    
    # extracts titles
    #titles = data[["document","text","Class"]].where(data.Class ==1).dropna().reset_index(drop=1)
    #titles["secIDin"] = titles.groupby("document")["Class"].rank(method='first')
    #titles["secIDin"] = titles["secIDin"] +1
    #titles.rename(index=str, columns={"text": "title"}, inplace=True)
    #titles.drop("Class",axis=1, inplace=True)
    #temp_df = pd.merge(text,titles,on=["document","secIDin"]) # merges the text and the titles
    #_ = temp_df[["text","title"]].apply(add_titles, axis=1) # adds the titles to the text
    #temp_df.drop(columns=["title"], inplace=True)
    
    temp_df = text
    temp_df["secIDin"] = temp_df["secIDin"]-1
    
    return temp_df

### the cell below cleans the data but it is not needed if the data is already cleaned. 

In [3]:
excel_file = pd.read_excel('./features.xlsx', sheet_name = 'features_v7' , 
                           header=1, skip_blank_lines = False) 
excel_file['left'] = pd.to_numeric(excel_file['left'].str.replace('px', ''))
excel_file['top'] = pd.to_numeric(excel_file['top'].str.replace('px', ''))
excel_file["page"] = excel_file["page"].apply(lambda x: int(''.join(filter(str.isdigit, x))))
excel_file["document"] = excel_file["document"].apply(lambda x: x.split("/")[-1])
excel_file["LSL"] = excel_file.groupby(["document","page"])["left"].apply(lambda x: x.diff()).fillna(0)
excel_file["LST"] = excel_file.groupby(["document","page"])["top"].apply(lambda x: x.diff()).fillna(0)
#Clean px and convert to right data type for columns: fontsize, left and top
excel_file['font-size'] = pd.to_numeric(excel_file['font-size'].str.replace('px', ''))

### this is the code to extract the important info. 

In [4]:
df_after_classification = excel_file[["document","page","text","Class",]]
print("this is how the dataframe should look like to pass  it to my function")
df_after_classification.head()

this is how the dataframe should look like to pass  it to my function


Unnamed: 0,document,page,text,Class
0,CMS_2014_0115_0059.pdf,1,2014-10-10 00:00:00,0.0
1,CMS_2014_0115_0059.pdf,1,Centers for Medicare & Medicaid Services,0.0
2,CMS_2014_0115_0059.pdf,1,Department of Health and Human Services,0.0
3,CMS_2014_0115_0059.pdf,1,Attention: CMS-9968-P,0.0
4,CMS_2014_0115_0059.pdf,1,P.O. Box 8013,0.0


### using the function

In [5]:
documents = get_text(df_after_classification)
documents.head()

Unnamed: 0,document,secIDin,text
2,CMS_2014_0115_0059.pdf,1.0,"[students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide ..."
3,CMS_2014_0115_0059.pdf,2.0,"[significant administrative burden upon all parties., The Department’s basic premise that “issuers generally would find that providing such contraceptive, coverage is cost neutral” is in error (Fe..."
4,CMS_2014_0115_0059.pdf,3.0,"[automatically enrolled in a contraceptive-only health plan., Students who choose to attend a religious institution of higher learning do so for a reason, and most, of the time, these students str..."
5,CMS_2014_0115_0059.pdf,4.0,"[partaking in providing contraceptive coverage to its students if the same SHIP it contracts, with for general student health must also provide contraceptive coverage via individual, policies., In..."
6,CMS_2014_0115_0059.pdf,5.0,"[contrary to basic contract law., An issued health insurance policy is a contract between an insurance company and the insured., Contracts are binding and enforceable only when one party extends a..."


# MODELS 

In [6]:
def format_summary(summary_sentences):
    ## THIS FORMATS THE OUTPUT AS KPMG DID
    intro = ["The commenter stated that ", "The commented further stated that "]
    summary_sentences = [str(intro[i]) + str(summary_sentences[i].lower()) 
                         for i in range(len(summary_sentences))]
    summary = ' '.join(summary_sentences) 
    summary = summary.replace(". ",".\n")
    summary = summary.replace("we","they")
    return summary

In [7]:
# source https://stackabuse.com/text-summarization-with-nltk-in-python/
def text_summarization_nltk_model(article):
    '''
    input:a dataframe with the following features 1) document name (document), 2) section id (secIDin), 3) text (text) 
    output: a dataframe with features from the input plus the text summarization (2 sentences)
    function: this function does text summarization using NLTK
    '''
    try:
        article_text = " ".join(article)
        # Removing Square Brackets and Extra Spaces 
        article_text = re.sub(r'\s+', ' ', article_text) 

        # Removing special characters and digits
        formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )  
        formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)  

        sentence_list = nltk.sent_tokenize(article_text) 

        stopwords = nltk.corpus.stopwords.words('english')

        word_frequencies = {}  
        for word in nltk.word_tokenize(formatted_article_text):  
            if word not in stopwords:
                if word not in word_frequencies.keys():
                    word_frequencies[word] = 1
                else:
                    word_frequencies[word] += 1

        maximum_frequncy = max(word_frequencies.values())

        for word in word_frequencies.keys():  
            word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)


        sentence_scores = {}  
        for sent in sentence_list:  
            for word in nltk.word_tokenize(sent.lower()):
                if word in word_frequencies.keys():
                    if len(sent.split(' ')) < 30:
                        if sent not in sentence_scores.keys():
                            sentence_scores[sent] = word_frequencies[word]
                        else:
                            sentence_scores[sent] += word_frequencies[word]

        summary_sentences = heapq.nlargest(2, sentence_scores, key=sentence_scores.get)
        summary = format_summary(summary_sentences)
                
    except:
        summary = article
            
    return summary

In [8]:
def text_summarization_nltk(documents):
    text = []
    summ = []
    sec = []
    page = []
    doc_name = []

    for i in range(len(documents)):
        article_text = documents.iloc[i,2]
        text.append(article_text)
        summ.append(text_summarization_nltk_model(article_text))
        sec.append(documents.iloc[i,1])
        doc_name.append(documents.iloc[i,0])

    d = {"document":doc_name, "secIDin":sec,'text': text, 'summary_NLTK': summ}
    docs = pd.DataFrame(data=d)
    
    return docs

In [9]:
def text_summarization_lexRank(df_final):
    '''
    input:a dataframe with the following features 1) document name (document), 2) section id (secIDin), 
            3) text (text) 
    output: a dataframe with features from the input plus the text summarization (45% of the setences)
    func: this fucntion does text summarization using Lex Rank 
    '''
    summarizer = LexRankSummarizer()
    
    df_final["summary_Lex_Rank"] = np.nan ## add empty column called 'Sec_Summary'
    df_final['text'] = df_final['text'].apply(str)

    for i in range(len(df_final)-1):
        parser = PlaintextParser.from_string(df_final['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df_final['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = num_sentences*0.45  ## summarize the document with 45% sentences
        summary = summarizer(parser.document, n)
        df_final['summary_Lex_Rank'].iloc[i] = summary
    return df_final

In [10]:
def text_summarization_lsa(df):
    '''
    input:a dataframe with the following features 1) document name (document), 2) section id (secIDin), 
            3) text (text) 
    output: a dataframe with features from the input plus the text summarization (45% of the setences)
    func: this fucntion does text summarization using lsa 
    '''
    summarizer_2 = LsaSummarizer()
    df["summary_LSA"] = np.nan

    for i in range(len(df)-1):

        parser = PlaintextParser.from_string(df['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = num_sentences*0.45  ## summarize the document with 45% sentences
        summary_LSA = summarizer_2(parser.document, n)
        df["summary_LSA"].iloc[i] = summary_LSA
    
    return df

In [11]:
def text_summarization_textRank(df_final):
    summarizer_3 = TextRankSummarizer()

    df_final["summary_TextRank"] = np.nan

    for i in range(len(df_final)-1):

        parser = PlaintextParser.from_string(df_final['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df_final['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = num_sentences*0.45  ## summarize the document with 45% sentences
        summary_TextRank = summarizer_3(parser.document, n)
        df_final["summary_TextRank"].iloc[i] = summary_TextRank
    return df_final


In [12]:
def text_summarization_luhn(df_final):
    summarizer_1 = LuhnSummarizer()
    df_final["summary_Luhn"] = np.nan

    for i in range(len(df_final)-1):

        parser = PlaintextParser.from_string(df_final['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df_final['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = num_sentences*0.45  ## summarize the document with 45% sentences
        summary_Luhn = summarizer_1(parser.document, n)
        df_final["summary_Luhn"].iloc[i] =   summary_Luhn
        
        #df_final["summary_Luhn"] = df_final["summary_Luhn"].apply(lambda x: format_summary(x))
    
    return df_final

In [13]:
summary = text_summarization_nltk(documents)
summary= text_summarization_lexRank(summary)
summary = text_summarization_lsa(summary)
summary = text_summarization_textRank(summary)
summary = text_summarization_luhn(summary)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
summary.to_csv("sample.csv")