In [1]:
import pandas as pd
import numpy as np
pd.options.display.max_colwidth = 200

## for text summarization
import re
import heapq 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from sumy.summarizers.text_rank import TextRankSummarizer
nltk.download('stopwords')
stopwords = stopwords.words('english')
nltk.download('punkt')


import sumy
from sumy.summarizers.luhn import LuhnSummarizer
from sumy.summarizers.lsa import LsaSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from lexrank import STOPWORDS, LexRank
import distance

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lilianacruzlopez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/lilianacruzlopez/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def add_titles(row):
    "add the titles to the text"
    row["text"].insert(0,row["title"])
    
def section_ids(df):
    ''' 
    input data frame with the following columns document, page, Class
    this will extract the section if once the row is tagged
    '''
    
    secID_in = []
    for doc in df["document"].unique():
        temp = df[df["document"]==doc]
        j = 1
        for i in range(len(temp)):
            if temp.iloc[i,2] == 1.0:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],0))
                j+=1
            elif temp.iloc[i,2] == 0.0:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],j))
            else:
                secID_in.append((temp.iloc[i,0],temp.iloc[i,1],temp.iloc[i,2],np.nan))

    sections = pd.DataFrame(secID_in, columns =["document","page","Class","secIDin"])
    return sections

def get_text(df_after_classification):
    # gets sections ID  
    sec_doc = section_ids(df_after_classification[["document", "page", "Class"]])

    # merges the sections ID with DataFrame
    data = pd.merge(df_after_classification,
                    sec_doc,left_index=True, right_index=True, on=["document","page", "Class"])
    
    text = data[["text","document","page","Class",
              "secIDin"]].groupby(["document","secIDin"])['text'].apply(list).reset_index()
    
    # extracts the text
    text = text[(text["secIDin"]!=0) & (text["secIDin"]!=1)] 
    
    # extracts titles
    #titles = data[["document","text","Class"]].where(data.Class ==1).dropna().reset_index(drop=1)
    #titles["secIDin"] = titles.groupby("document")["Class"].rank(method='first')
    #titles["secIDin"] = titles["secIDin"] +1
    #titles.rename(index=str, columns={"text": "title"}, inplace=True)
    #titles.drop("Class",axis=1, inplace=True)
    #temp_df = pd.merge(text,titles,on=["document","secIDin"]) # merges the text and the titles
    #_ = temp_df[["text","title"]].apply(add_titles, axis=1) # adds the titles to the text
    #temp_df.drop(columns=["title"], inplace=True)
    
    temp_df = text
    temp_df["secIDin"] = temp_df["secIDin"]-1
    
    return temp_df

### the cell below cleans the data but it is not needed if the data is already cleaned. 

In [3]:
excel_file = pd.read_excel('./features.xlsx', sheet_name = 'features_v7' , 
                           header=1, skip_blank_lines = False) 
excel_file['left'] = pd.to_numeric(excel_file['left'].str.replace('px', ''))
excel_file['top'] = pd.to_numeric(excel_file['top'].str.replace('px', ''))
excel_file["page"] = excel_file["page"].apply(lambda x: int(''.join(filter(str.isdigit, x))))
excel_file["document"] = excel_file["document"].apply(lambda x: x.split("/")[-1])
excel_file["LSL"] = excel_file.groupby(["document","page"])["left"].apply(lambda x: x.diff()).fillna(0)
excel_file["LST"] = excel_file.groupby(["document","page"])["top"].apply(lambda x: x.diff()).fillna(0)
#Clean px and convert to right data type for columns: fontsize, left and top
excel_file['font-size'] = pd.to_numeric(excel_file['font-size'].str.replace('px', ''))

### this is the code to extract the important info. 

In [4]:
df_after_classification = excel_file[["document","page","text","Class",]]
print("this is how the dataframe should look like to pass  it to my function")
df_after_classification.head()

this is how the dataframe should look like to pass  it to my function


Unnamed: 0,document,page,text,Class
0,CMS_2014_0115_0059.pdf,1,2014-10-10 00:00:00,0.0
1,CMS_2014_0115_0059.pdf,1,Centers for Medicare & Medicaid Services,0.0
2,CMS_2014_0115_0059.pdf,1,Department of Health and Human Services,0.0
3,CMS_2014_0115_0059.pdf,1,Attention: CMS-9968-P,0.0
4,CMS_2014_0115_0059.pdf,1,P.O. Box 8013,0.0


### using the function

In [5]:
documents = get_text(df_after_classification).reset_index(drop=1)
documents.head()

Unnamed: 0,document,secIDin,text
0,CMS_2014_0115_0059.pdf,1.0,"[students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide ..."
1,CMS_2014_0115_0059.pdf,2.0,"[significant administrative burden upon all parties., The Department’s basic premise that “issuers generally would find that providing such contraceptive, coverage is cost neutral” is in error (Fe..."
2,CMS_2014_0115_0059.pdf,3.0,"[automatically enrolled in a contraceptive-only health plan., Students who choose to attend a religious institution of higher learning do so for a reason, and most, of the time, these students str..."
3,CMS_2014_0115_0059.pdf,4.0,"[partaking in providing contraceptive coverage to its students if the same SHIP it contracts, with for general student health must also provide contraceptive coverage via individual, policies., In..."
4,CMS_2014_0115_0059.pdf,5.0,"[contrary to basic contract law., An issued health insurance policy is a contract between an insurance company and the insured., Contracts are binding and enforceable only when one party extends a..."


# MODELS 

In [6]:
# source https://stackabuse.com/text-summarization-with-nltk-in-python/
def text_summarization_nltk_model(article):
    '''
    input:a dataframe with the following features 1) document name (document), 2) section id (secIDin), 3) text (text) 
    output: a dataframe with features from the input plus the text summarization (2 sentences)
    function: this function does text summarization using NLTK
    '''
    #article = [str(x) for x in article]
    article_text = " ".join(map(str, article))
    # Removing Square Brackets and Extra Spaces 
    article_text = re.sub(r'\s+', ' ', article_text) 

    # Removing special characters and digits
    formatted_article_text = re.sub('[^a-zA-Z]', ' ', article_text )  
    formatted_article_text = re.sub(r'\s+', ' ', formatted_article_text)  

    sentence_list = nltk.sent_tokenize(article_text) 

    stopwords = nltk.corpus.stopwords.words('english')

    word_frequencies = {}  
    for word in nltk.word_tokenize(formatted_article_text):  
        if word not in stopwords:
            if word not in word_frequencies.keys():
                word_frequencies[word] = 1
            else:
                word_frequencies[word] += 1

    maximum_frequncy = max(word_frequencies.values())

    for word in word_frequencies.keys():  
        word_frequencies[word] = (word_frequencies[word]/maximum_frequncy)


    sentence_scores = {}  
    for sent in sentence_list:  
        for word in nltk.word_tokenize(sent.lower()):
            if word in word_frequencies.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = word_frequencies[word]
                    else:
                        sentence_scores[sent] += word_frequencies[word]

    summary_sentences = heapq.nlargest(2, sentence_scores, key=sentence_scores.get)
    summary = format_summary(summary_sentences)
         
    return summary

In [7]:
def text_summarization_nltk(documents):
    documents["summary_NLTK"]=documents["text"].apply(lambda x: text_summarization_nltk_model(x))
    return documents

In [8]:
def text_summarization_lsa(df):
    '''
    input:a dataframe with the following features 1) document name (document), 2) section id (secIDin), 
            3) text (text) 
    output: a dataframe with features from the input plus the text summarization (45% of the setences)
    func: this fucntion does text summarization using lsa 
    '''
    summarizer_2 = LsaSummarizer()
    df["summary_LSA"] = np.nan

    for i in range(len(df)):
        parser = PlaintextParser.from_string(df['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = 2#num_sentences*0.45  ## summarize the document with 45% sentences
        summary_LSA = summarizer_2(parser.document, n)
        df["summary_LSA"].iloc[i] = summary_LSA
        
    df["summary_LSA"] = df["summary_LSA"].apply(lambda x: format_summary([str(text) for text in x]))
    return df

In [9]:
def text_summarization_textRank(df_final):
    summarizer_3 = TextRankSummarizer()

    df_final["summary_TextRank"] = np.nan

    for i in range(len(df_final)):
        parser = PlaintextParser.from_string(df_final['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df_final['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = 2#num_sentences*0.45  ## summarize the document with 45% sentences
        summary_TextRank = summarizer_3(parser.document, n)
        df_final["summary_TextRank"].iloc[i] = summary_TextRank
        
    df_final["summary_TextRank"] = df_final["summary_TextRank"].apply(lambda x: format_summary([str(text) for text in x]))
    return df_final

In [10]:
def text_summarization_luhn(df_final):
    summarizer_1 = LuhnSummarizer()
    df_final["summary_Luhn"] = np.nan

    for i in range(len(df_final)):
        parser = PlaintextParser.from_string(df_final['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df_final['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = 2#num_sentences*0.45  ## summarize the document with 45% sentences
        summary_Luhn = summarizer_1(parser.document, n)
        df_final["summary_Luhn"].iloc[i] =   summary_Luhn
        
    df_final["summary_Luhn"] = df_final["summary_Luhn"].apply(lambda x: format_summary([str(text) for text in x]))
    return df_final

In [11]:
def text_summarization_lexRank(df_final):
    '''
    input:a dataframe with the following features 1) document name (document), 2) section id (secIDin), 
            3) text (text) 
    output: a dataframe with features from the input plus the text summarization (45% of the setences)
    func: this fucntion does text summarization using Lex Rank 
    '''
    summarizer = LexRankSummarizer()
    
    df_final["summary_Lex_Rank"] = np.nan ## add empty column called 'Sec_Summary'
    df_final['text'] = df_final['text'].apply(str)

    for i in range(len(df_final)):
        parser = PlaintextParser.from_string(df_final['text'].iloc[i], Tokenizer("english"))
        all_sentences = sent_tokenize(df_final['text'].iloc[i])
        num_sentences = len(all_sentences)
        n = 2#num_sentences*0.45  ## summarize the document with 45% sentences
        summary = summarizer(parser.document, n)
        df_final['summary_Lex_Rank'].iloc[i] = summary
        
    df_final['summary_Lex_Rank'] = df_final['summary_Lex_Rank'].apply(lambda x: format_summary([str(text) for text in x]))
    return df_final

In [12]:
def format_summary(summary_sentences):        
    intro = ["The commenter stated that ", "\nThe commented further stated that "]
    summary_sentences = [str(intro[i]) + str(summary_sentences[i].lower()) 
                         for i in range(len(summary_sentences))]

    summary = ' '.join(summary_sentences) 
    summary = summary.replace(". ",".\n\n")
    summary = summary.replace("we","they")
    
    return summary

In [13]:
summary = text_summarization_nltk(documents)
summary= text_summarization_lexRank(summary)
summary = text_summarization_lsa(summary)
summary = text_summarization_textRank(summary)
summary = text_summarization_luhn(summary)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


# metrics 

In [18]:
# source https://www.datacamp.com/community/tutorials/fuzzy-string-python
def levenshtein_ratio_and_distance(s, t, ratio_calc = False):
    """ levenshtein_ratio_and_distance:
        Calculates levenshtein distance between two strings.
        If ratio_calc = True, the function computes the
        levenshtein distance ratio of similarity between two strings
        For all i and j, distance[i,j] will contain the Levenshtein
        distance between the first i characters of s and the
        first j characters of t
    """
    # Initialize matrix of zeros
    rows = len(s)+1
    cols = len(t)+1
    distance = np.zeros((rows,cols),dtype = int)

    # Populate matrix of zeros with the indeces of each character of both strings
    for i in range(1, rows):
        for k in range(1,cols):
            distance[i][0] = i
            distance[0][k] = k

    # Iterate over the matrix to compute the cost of deletions,insertions and/or substitutions    
    for col in range(1, cols):
        for row in range(1, rows):
            if s[row-1] == t[col-1]:
                cost = 0 # If the characters are the same in the two strings in a given position [i,j] then the cost is 0
            else:
                # In order to align the results with those of the Python Levenshtein package, if we choose to calculate the ratio
                # the cost of a substitution is 2. If we calculate just distance, then the cost of a substitution is 1.
                if ratio_calc == True:
                    cost = 2
                else:
                    cost = 1
            distance[row][col] = min(distance[row-1][col] + 1,      # Cost of deletions
                                 distance[row][col-1] + 1,          # Cost of insertions
                                 distance[row-1][col-1] + cost)     # Cost of substitutions
    if ratio_calc == True:
        # Computation of the Levenshtein Distance Ratio
        Ratio = ((len(s)+len(t)) - distance[row][col]) / (len(s)+len(t))
        return Ratio
    else:
        # print(distance) # Uncomment if you want to see the matrix showing how the algorithm computes the cost of deletions,
        # insertions and/or substitutions
        # This is the minimum number of edits needed to convert string a to string b
        return "The strings are {} edits away".format(distance[row][col])

In [25]:
from scipy.spatial import distance

In [26]:
%%time
method1="summary_Lex_Rank"
method2="summary_LSA"
score1 = "Levin score for "+method1+" and "+method2
summary[score1] =summary[[method1,method2]].apply(
    lambda x: levenshtein_ratio_and_distance(x[method1].lower(),
                                             x[method2].lower(),True), axis=1)

score2 = "Jaccard score for "+method1+" and "+method2
summary[score2] = summary[[method1,method2]].apply(
    lambda x: distance.jaccard(x[method1].lower(), x[method2].lower()), axis=1)

CPU times: user 16min 43s, sys: 1.98 s, total: 16min 45s
Wall time: 16min 46s


  nonzero = np.bitwise_or(u != 0, v != 0)


In [27]:
summary[["document","secIDin","text","summary_Lex_Rank","summary_LSA",
         "Levin score for summary_LSA and summary_Lex_Rank","Jaccard score for summary_LSA and summary_Lex_Rank"]]

Unnamed: 0,document,secIDin,text,summary_Lex_Rank,summary_LSA,Levin score for summary_LSA and summary_Lex_Rank,Jaccard score for summary_LSA and summary_Lex_Rank
0,CMS_2014_0115_0059.pdf,1.0,"['students of religious institutions', 'To Whom It May Concern:', 'On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we', 'appreciate the opportunity to p...","The commenter stated that ', 'the departments asked for input regarding the proposed requirement for ship issuers to', 'automatically enroll covered students and beneficiaries attending religious ...","The commenter stated that nationwide proposes that religious institutions of higher', 'education should be completely exempt from offering contraceptive coverage if it goes against their', 'religi...",0.522314,0.066667
1,CMS_2014_0115_0059.pdf,2.0,"['significant administrative burden upon all parties.', 'The Department’s basic premise that “issuers generally would find that providing such contraceptive', 'coverage is cost neutral” is in erro...","The commenter stated that since tpas typically do not sell insurance and thus are not', 'licensed to do so, they must become licensed in each state in order to direct each student to', 'a separate...","The commenter stated that the cost of underwriting, implementing, filing,', 'and administering this distinct policy is significantly higher than just adding a feature to an existing', 'plan.\n\n\n...",0.458078,0.235294
2,CMS_2014_0115_0059.pdf,3.0,"['automatically enrolled in a contraceptive-only health plan.', 'Students who choose to attend a religious institution of higher learning do so for a reason, and most', 'of the time, these student...","The commenter stated that ['automatically enrolled in a contraceptive-only health plan.\n\n\nThe commented further stated that ', 'students who choose to attend a religious institution of higher l...","The commenter stated that ', 'students who choose to attend a religious institution of higher learning do so for a reason, and most', 'of the time, these students strongly believe and share in the...",0.586166,0.103448
3,CMS_2014_0115_0059.pdf,4.0,"['partaking in providing contraceptive coverage to its students if the same SHIP it contracts', 'with for general student health must also provide contraceptive coverage via individual', 'policies...","The commenter stated that ['partaking in providing contraceptive coverage to its students if the same ship it contracts', 'with for general student health must also provide contraceptive coverage ...","The commenter stated that even if ship issuers', 'provide separate individual contraceptive-only coverage to students, religious institutions of higher', 'education must still be involved in the d...",0.476096,0.129032
4,CMS_2014_0115_0059.pdf,5.0,"['contrary to basic contract law.', 'An issued health insurance policy is a contract between an insurance company and the insured.', 'Contracts are binding and enforceable only when one party exte...","The commenter stated that ['contrary to basic contract law.\n\n\nThe commented further stated that in the context of an insurance policy,', 'the benefits offered by the issuer in exchange for a pr...","The commenter stated that ', 'contracts are binding and enforceable only when one party extends an offer, the other party accepts', 'the offer, and adequate consideration is exchanged by both.\n\n...",0.493464,0.129032
5,CMS_2014_0115_0059.pdf,6.0,"['FDA approved contraception as prescribed” and “certain contraception services”. Is there', 'more clarity regarding which methods of contraception must be excluded?', 'The proposed rule reference...","The commenter stated that is there', 'more clarity regarding which methods of contraception must be excluded? \nThe commented further stated that a second and distinct reference to contraceptive',...","The commenter stated that is there a method of contraception that is both required under section 2713 of the', 'public health services act (phs) and which would be allowably included in the covera...",0.488771,0.289474
6,CMS_2014_0115_0059.pdf,7.0,"['students attending religious institutions.', 'Rather than require each individual SHIP carrier to provide free separate contraceptive-only', 'coverage for students of religious institutions, Nat...","The commenter stated that ', 'rather than require each individual ship carrier to provide free separate contraceptive-only', 'coverage for students of religious institutions, nationwide proposes t...","The commenter stated that ', 'rather than require each individual ship carrier to provide free separate contraceptive-only', 'coverage for students of religious institutions, nationwide proposes t...",0.720602,0.064516
7,CMS_2014_0115_0059.pdf,8.0,"['federal income taxes for all costs incurred for non-covered contraception methods.', 'As opposed to requiring the insurance company to craft individual insurance contracts for specific', 'method...","The commenter stated that ['federal income taxes for all costs incurred for non-covered contraception methods.\n\n\nThe commented further stated that ', 'as opposed to requiring the insurance comp...","The commenter stated that the departments could allow the', 'covered persons to submit any expenditure for those services, excluded under their plan, along with', 'their federal tax return for a f...",0.517837,0.096774
8,CMS_2014_0115_0059.pdf,9.0,"['Nationwide respectfully proposes that SHIP issuers should not be required to automatically enroll', 'students attending anti-contraceptive religious institutions into a separate health plan pure...","The commenter stated that ['nationwide respectfully proposes that ship issuers should not be required to automatically enroll', 'students attending anti-contraceptive religious institutions into a...","The commenter stated that ['nationwide respectfully proposes that ship issuers should not be required to automatically enroll', 'students attending anti-contraceptive religious institutions into a...",0.795556,0.256410
9,CMS_2014_0115_0076.pdf,1.0,"['Certain Preventive Services Under the Affordable Care Act', 'Dear Sir or Madam:', 'On behalf of the United States Conference of Catholic Bishops (“USCCB”),', 'we respectfully submit the followin...","The commenter stated that the rules pertain to application of the contraceptive', 'mandate to closely-held for-profit companies.1', 'our comments follow.\n\n\nThe commented further stated that the...","The commenter stated that ', '1 they use the term “mandate” or “contraceptive mandate” as shorthand for the requirement that', 'non-grandfathered health plans and policies provide coverage of drug...",0.634033,0.142857
