In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 500

In [2]:
def section_ids(df):
    ''' 
    input data frame with the following columns document, page, Class
    this will extract the section if once the row is tagged
    '''
    
    secID_in = []
    for doc in df["document"].unique():
        temp = df[df["document"]==doc]
        j = 1
        for i in range(len(temp)):
            if temp.iloc[i,2] == 1.0:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],0))
                j+=1
            elif temp.iloc[i,2] == 0.0:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],j))
            else:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],np.nan))

    sections = pd.DataFrame(secID_in, columns =["document","page","Class","secIDin"])
    return sections

def get_text(df_after_classification):
    
    # gets sections ID  
    sec_doc = section_ids(df_after_classification[["document", "page", "Class"]])

    # merges the sections ID with DataFrame
    data = pd.merge(df_after_classification,
                    sec_doc,left_index=True, right_index=True, on=["document","page", "Class"])
    
    text = data[["text","document","page","Class",
              "secIDin"]].groupby(["document","secIDin"])['text'].apply(list).reset_index()

    # extracts the text
    text = text[(text["secIDin"]!=0) & (text["secIDin"]!=1)] 
    
    # merges the text and the titles
    temp_df = text
    temp_df["secIDin"] = temp_df["secIDin"]-1
    
    return temp_df

### the cell below cleans the data but it is not needed if the data is already cleaned. 

In [3]:
excel_file = pd.read_excel('./features.xlsx', sheet_name = 'features_v7' , 
                           header=1, skip_blank_lines = False) 
excel_file['left'] = pd.to_numeric(excel_file['left'].str.replace('px', ''))
excel_file['top'] = pd.to_numeric(excel_file['top'].str.replace('px', ''))
excel_file["page"] = excel_file["page"].apply(lambda x: int(''.join(filter(str.isdigit, x))))
excel_file["document"] = excel_file["document"].apply(lambda x: x.split("/")[-1])
excel_file["LSL"] = excel_file.groupby(["document","page"])["left"].apply(lambda x: x.diff()).fillna(0)
excel_file["LST"] = excel_file.groupby(["document","page"])["top"].apply(lambda x: x.diff()).fillna(0)
#Clean px and convert to right data type for columns: fontsize, left and top
excel_file['font-size'] = pd.to_numeric(excel_file['font-size'].str.replace('px', ''))

### this is the code to extract the important info. 

In [4]:
df_after_classification = excel_file[["document","page","text","Class",]]
print("this is how the dataframe should look like to pass  it to my function")
df_after_classification.head()

this is how the dataframe should look like to pass  it to my function


Unnamed: 0,document,page,text,Class
0,CMS_2014_0115_0059.pdf,1,2014-10-10 00:00:00,0.0
1,CMS_2014_0115_0059.pdf,1,Centers for Medicare & Medicaid Services,0.0
2,CMS_2014_0115_0059.pdf,1,Department of Health and Human Services,0.0
3,CMS_2014_0115_0059.pdf,1,Attention: CMS-9968-P,0.0
4,CMS_2014_0115_0059.pdf,1,P.O. Box 8013,0.0


### using the function

In [5]:
documents = get_text(df_after_classification)
documents.head()

Unnamed: 0,document,secIDin,text
2,CMS_2014_0115_0059.pdf,1.0,"[students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide comments in response to CMS–9940–Pin which the Internal, Revenue Service (“IRS”), Employee Benefits Security Administration (“EBSA”), and the, Department of Health and Human Services (“HHS”) solicited comments on its proposed rule, concerning the coverage of certain preventive services under the Pat..."
3,CMS_2014_0115_0059.pdf,2.0,"[significant administrative burden upon all parties., The Department’s basic premise that “issuers generally would find that providing such contraceptive, coverage is cost neutral” is in error (Federal Register, Vol. 78, No. 25, Pg. 8463). Economic,, actuarial, and administrative data prove otherwise. A February 2012 publication by the Office of the, Assistant Secretary for Planning and Evaluation (ASPE) entitled The Cost of Covering Contraceptives, through Health Insurance found that the ac..."
4,CMS_2014_0115_0059.pdf,3.0,"[automatically enrolled in a contraceptive-only health plan., Students who choose to attend a religious institution of higher learning do so for a reason, and most, of the time, these students strongly believe and share in the religious convictions of their chosen, institution. Therefore, many who attend anti-contraceptive institutions may not wish to be or may, even be extremely offended by being automatically enrolled in an individual contraceptive-only, policy. It will simply be a waste o..."
5,CMS_2014_0115_0059.pdf,4.0,"[partaking in providing contraceptive coverage to its students if the same SHIP it contracts, with for general student health must also provide contraceptive coverage via individual, policies., In the proposed rules, the Departments state, “the eligible organization would have no role in, contracting, arranging, paying, or referring for this separate contraceptive coverage” (Federal, Register, Vol. 79, No. 166, Pg. 51118). This is unattainable and unrealistic. Even if SHIP issuers, provide s..."
6,CMS_2014_0115_0059.pdf,5.0,"[contrary to basic contract law., An issued health insurance policy is a contract between an insurance company and the insured., Contracts are binding and enforceable only when one party extends an offer, the other party accepts, the offer, and adequate consideration is exchanged by both. In the context of an insurance policy,, the benefits offered by the issuer in exchange for a premium paid by the insured are both adequate, consideration. However, under the proposed rule, the “insured” is ..."


### TEXT SUMMARIZATION

In [6]:
import re
import nltk
import heapq 

def text_summarization(article):

    #print(article)
    
    article_text = " ".join([str(elem) for elem in article]) 
    # Removing Square Brackets and Extra Spaces 
    article_text = re.sub(r'\s+', ' ', article_text) 

    # Removing special characters and digits
    formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )  
    formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)  

    sentence_list = nltk.sent_tokenize(article_text) 

    stopwords = nltk.corpus.stopwords.words('english')

    word_frequencies = {}  
    for word in nltk.word_tokenize(formatted_article_text):  
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)


    sentence_scores = {}  
    for sent in sentence_list:  
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(2, sentence_scores, key=sentence_scores.get)

    ## THIS FORMATS THE OUTPUT AS KPMG DID
    intro = ["The commenter stated that ", "The commenter further stated that "]
    summary_sentences = [str(intro[i]) + str(summary_sentences[i].lower()) 
                         for i in range(len(summary_sentences))]
    summary = ' '.join(summary_sentences) 
    summary = summary.replace(". ",".\n\n")
    summary = summary.replace("we","they")


            
    return summary

In [7]:
text = []
summ = []
sec = []
page = []
doc_name = []

for i in range(len(documents)):
    article_text = documents.iloc[i,2]
    text.append(article_text)
    summ.append(text_summarization(article_text))
    sec.append(documents.iloc[i,1])
    doc_name.append(documents.iloc[i,0])
    
d = {"document":doc_name, "secIDin":sec,'text': text, 'summary': summ}
docs = pd.DataFrame(data=d)
docs.head()

Unnamed: 0,document,secIDin,text,summary
0,CMS_2014_0115_0059.pdf,1.0,"[students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide comments in response to CMS–9940–Pin which the Internal, Revenue Service (“IRS”), Employee Benefits Security Administration (“EBSA”), and the, Department of Health and Human Services (“HHS”) solicited comments on its proposed rule, concerning the coverage of certain preventive services under the Pat...",The commenter stated that proposed solution: utilize the exchanges to provide free contraceptive coverage to students attending religious institutions.\n\nThe commenter further stated that students enrolled in a religious institution of higher education may not wish to be automatically enrolled in a contraceptive-only health plan.
1,CMS_2014_0115_0059.pdf,2.0,"[significant administrative burden upon all parties., The Department’s basic premise that “issuers generally would find that providing such contraceptive, coverage is cost neutral” is in error (Federal Register, Vol. 78, No. 25, Pg. 8463). Economic,, actuarial, and administrative data prove otherwise. A February 2012 publication by the Office of the, Assistant Secretary for Planning and Evaluation (ASPE) entitled The Cost of Covering Contraceptives, through Health Insurance found that the ac...","The commenter stated that thus, the total cost of enrolling students and their beneficiaries into a separate individual contraceptive-only health policy is high.\n\nThe commenter further stated that train and pay filing costs for registering agents and brokers to sell individual health policies."
2,CMS_2014_0115_0059.pdf,3.0,"[automatically enrolled in a contraceptive-only health plan., Students who choose to attend a religious institution of higher learning do so for a reason, and most, of the time, these students strongly believe and share in the religious convictions of their chosen, institution. Therefore, many who attend anti-contraceptive institutions may not wish to be or may, even be extremely offended by being automatically enrolled in an individual contraceptive-only, policy. It will simply be a waste o...","The commenter stated that just like the decision-making process to attend any school, students are acutely aware of the positives and negatives of attending a religious institution.\n\nThe commenter further stated that students attending religious institutions are aware of the religious doctrines the school has employed as part of their daily lives."
3,CMS_2014_0115_0059.pdf,4.0,"[partaking in providing contraceptive coverage to its students if the same SHIP it contracts, with for general student health must also provide contraceptive coverage via individual, policies., In the proposed rules, the Departments state, “the eligible organization would have no role in, contracting, arranging, paying, or referring for this separate contraceptive coverage” (Federal, Register, Vol. 79, No. 166, Pg. 51118). This is unattainable and unrealistic. Even if SHIP issuers, provide s...","The commenter stated that partaking in providing contraceptive coverage to its students if the same ship it contracts with for general student health must also provide contraceptive coverage via individual policies.\n\nThe commenter further stated that in the proposed rules, the departments state, “the eligible organization would have no role in contracting, arranging, paying, or referring for this separate contraceptive coverage” (federal register, vol."
4,CMS_2014_0115_0059.pdf,5.0,"[contrary to basic contract law., An issued health insurance policy is a contract between an insurance company and the insured., Contracts are binding and enforceable only when one party extends an offer, the other party accepts, the offer, and adequate consideration is exchanged by both. In the context of an insurance policy,, the benefits offered by the issuer in exchange for a premium paid by the insured are both adequate, consideration. However, under the proposed rule, the “insured” is ...","The commenter stated that in the context of an insurance policy, the benefits offered by the issuer in exchange for a premium paid by the insured are both adequate consideration.\n\nThe commenter further stated that hotheyver, under the proposed rule, the “insured” is paying no premium, yet the ship issuer must still provide health coverage under the guise of a legally-executed policy."


In [None]:
#docs.to_csv("sample.csv", index=False)