In [1]:
import pandas as pd
import numpy as np
# Let text cells show up to 1000 characters before pandas truncates them.
pd.set_option("display.max_colwidth", 1000)

In [2]:
def add_titles(row):
    """Put ``row["title"]`` at the front of the list held in ``row["text"]``.

    The list is modified in place and nothing is returned.
    """
    lines = row["text"]
    heading = row["title"]
    lines.insert(0, heading)
    
def section_ids(df):
    """Assign a per-document section number to every classified row.

    Rows are processed in the order given, independently for each value of
    ``document``. A per-document counter starts at 1 and is increased by one
    every time a title row (``Class == 1``) is met:

    * title rows get ``secIDin = 0``;
    * body rows (``Class == 0``) get the current counter value, so the rows
      before the first title are section 1, the rows after the first title are
      section 2, and so on;
    * rows with any other ``Class`` (including NaN), or with a missing
      ``document``, get ``secIDin = NaN``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``document``, ``page`` and ``Class``. They are
        looked up by name, so column order and extra columns do not matter.

    Returns
    -------
    pandas.DataFrame
        Columns ``document``, ``page``, ``Class`` and ``secIDin`` (float64),
        with the same index and row order as ``df`` so the result can be
        joined straight back onto the input.
    """
    sections = df[["document", "page", "Class"]].copy()

    # factorize() labels each document with an integer code and uses -1 for a
    # missing document, so no row is silently dropped by the groupby below.
    doc_codes, _ = pd.factorize(sections["document"])
    is_title = (sections["Class"] == 1.0).to_numpy()
    is_body = (sections["Class"] == 0.0).to_numpy()

    # Number of title rows seen so far (inclusive) within each document.
    titles_seen = (
        pd.Series(is_title.astype("int64")).groupby(doc_codes).cumsum().to_numpy()
    )

    sec_id = np.full(len(sections), np.nan)
    sec_id[is_title] = 0.0
    # The counter starts at 1, hence "+ 1" on top of the titles already seen.
    sec_id[is_body] = titles_seen[is_body] + 1
    sec_id[doc_codes < 0] = np.nan

    sections["secIDin"] = sec_id
    return sections

def get_text(df_after_classification):
    """Group classified lines into titled sections, one output row per section.

    Parameters
    ----------
    df_after_classification : pandas.DataFrame
        One row per line of text with the columns ``document``, ``page``,
        ``text`` and ``Class`` (1 = section title, 0 = body line).

    Returns
    -------
    pandas.DataFrame
        Columns ``document``, ``secIDin`` and ``text``. The n-th title of a
        document yields ``secIDin == n``; ``text`` is a list whose first item
        is the title line, followed by the body lines up to the next title.
        Lines that precede a document's first title, and titles that have no
        body line after them, are not returned.
    """
    # Positional index, so the section IDs can be joined back row for row
    # whatever index the caller's frame carries.
    frame = df_after_classification.reset_index(drop=True)

    # gets sections ID
    sec_doc = section_ids(frame[["document", "page", "Class"]])

    # Attach the section IDs through the index only: pd.merge raises a
    # MergeError in current pandas when `on=` is combined with
    # left_index/right_index.
    data = frame.join(sec_doc["secIDin"])

    # One list of lines per (document, section); rows with a NaN section ID
    # are left out by groupby.
    text = data.groupby(["document", "secIDin"])["text"].apply(list).reset_index()

    # extracts titles: the k-th title of a document heads section k + 1.
    # The rank is computed BEFORE dropna() so that a title with missing text
    # does not shift every later title onto the wrong section.
    title_rows = data.loc[data["Class"] == 1, ["document", "text", "Class"]]
    titles = (
        title_rows
        .assign(secIDin=title_rows.groupby("document")["Class"].rank(method="first") + 1)
        .dropna()
        .drop(columns=["Class"])
        .rename(columns={"text": "title"})
        .reset_index(drop=True)
    )

    # extracts the text: section 0 holds the title rows themselves and
    # section 1 the lines that come before the first title.
    body = text[(text["secIDin"] != 0) & (text["secIDin"] != 1)]

    # merges the text and the titles
    temp_df = pd.merge(body, titles, on=["document", "secIDin"])

    # adds the titles to the text; new lists are built explicitly because
    # mutating objects from inside DataFrame.apply is not supported by pandas.
    temp_df["text"] = [
        [title] + lines for title, lines in zip(temp_df["title"], temp_df["text"])
    ]
    temp_df = temp_df.drop(columns=["title"])
    # Shift so that the first titled section is number 1.
    temp_df["secIDin"] = temp_df["secIDin"] - 1

    return temp_df

### The cell below cleans the data; it can be skipped if the data is already cleaned.

In [3]:
# Load the feature sheet; header=1 takes the column names from the second row.
# NOTE(review): `skip_blank_lines` is not among the documented read_excel
# parameters in recent pandas releases -- confirm the installed version accepts it.
excel_file = pd.read_excel('./features.xlsx', sheet_name = 'features_v7' ,
                           header=1, skip_blank_lines = False)

# Clean px and convert to right data type for columns: fontsize, left and top
excel_file['left'] = pd.to_numeric(excel_file['left'].str.replace('px', ''))
excel_file['top'] = pd.to_numeric(excel_file['top'].str.replace('px', ''))
excel_file['font-size'] = pd.to_numeric(excel_file['font-size'].str.replace('px', ''))

# Keep only the digits of the page label and only the file name of the document path.
excel_file["page"] = excel_file["page"].apply(lambda x: int(''.join(filter(str.isdigit, x))))
excel_file["document"] = excel_file["document"].apply(lambda x: x.split("/")[-1])

# LSL / LST: change in `left` / `top` relative to the previous row of the same
# document page (0 where there is no previous value). GroupBy.diff() keeps the
# original row index, whereas groupby(...).apply(lambda x: x.diff()) prepends
# the group keys to the index on pandas 2.x and can no longer be assigned back.
excel_file["LSL"] = excel_file.groupby(["document", "page"])["left"].diff().fillna(0)
excel_file["LST"] = excel_file.groupby(["document", "page"])["top"].diff().fillna(0)

### This is the code that extracts the important information.

In [4]:
# Only these four columns are needed by get_text().
df_after_classification = excel_file.loc[:, ["document", "page", "text", "Class"]]
print("this is how the dataframe should look like to pass  it to my function")
df_after_classification.head()

this is how the dataframe should look like to pass  it to my function


Unnamed: 0,document,page,text,Class
0,CMS_2014_0115_0059.pdf,1,2014-10-10 00:00:00,0.0
1,CMS_2014_0115_0059.pdf,1,Centers for Medicare & Medicaid Services,0.0
2,CMS_2014_0115_0059.pdf,1,Department of Health and Human Services,0.0
3,CMS_2014_0115_0059.pdf,1,Attention: CMS-9968-P,0.0
4,CMS_2014_0115_0059.pdf,1,P.O. Box 8013,0.0


### Using the function

In [5]:
# Build one row per (document, section); each `text` list starts with the section title.
get_text(df_after_classification)

Unnamed: 0,document,secIDin,text
0,CMS_2014_0115_0059.pdf,1.0,"[Re: Nationwide Life Insurance Company’s comments on separate contraceptive-only policies for, students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide comments in response to CMS–9940–Pin which the Internal, Revenue Service (“IRS”), Employee Benefits Security Administration (“EBSA”), and the, Department of Health and Human Services (“HHS”) solicited comments on its proposed rule, concerning the coverage of certain preventive services under the Patient Protection and Affordable, Care Act (“ACA”). Nationwide currently has the fourth largest share of the student health, insurance plan (“SHIP”) market and insures over 130,000 undergraduate, graduate, and international, students at 183 colleges and universities throughout the U.S. We do not offer any other group or, individual major medical health policies in any market., The Departments asked for in..."
1,CMS_2014_0115_0059.pdf,2.0,"[I. Providing contraceptive coverage in the student market is not cost neutral and imposes a, significant administrative burden upon all parties., The Department’s basic premise that “issuers generally would find that providing such contraceptive, coverage is cost neutral” is in error (Federal Register, Vol. 78, No. 25, Pg. 8463). Economic,, actuarial, and administrative data prove otherwise. A February 2012 publication by the Office of the, Assistant Secretary for Planning and Evaluation (ASPE) entitled The Cost of Covering Contraceptives, through Health Insurance found that the actual cost of adding contraceptives with no cost sharing to an, existing plan is between $21 and $41 annually per insured, as determined by three separate actuarial, companies. Adjusted for inflation, using 8% medical trend per year this equates to $26-$52 annually., In the SHIP arena with average policies of about $1600, this would be result in a 2-3.5 % increase in, rate. It is important to note that th..."
2,CMS_2014_0115_0059.pdf,3.0,"[II. Students enrolled in a religious institution of higher education may not wish to be, automatically enrolled in a contraceptive-only health plan., Students who choose to attend a religious institution of higher learning do so for a reason, and most, of the time, these students strongly believe and share in the religious convictions of their chosen, institution. Therefore, many who attend anti-contraceptive institutions may not wish to be or may, even be extremely offended by being automatically enrolled in an individual contraceptive-only, policy. It will simply be a waste of resources for all parties involved if an issuer creates, determines, eligibility, and enrolls all students into an individual health policy, just for the student to immediately, cancel., Students attending religious institutions are aware of the religious doctrines the school has employed, as part of their daily lives. Just like the decision-making process to attend any school, students are, acutely aware ..."
3,CMS_2014_0115_0059.pdf,4.0,"[III. It is impossible to completely insulate religious institutions of higher education from, partaking in providing contraceptive coverage to its students if the same SHIP it contracts, with for general student health must also provide contraceptive coverage via individual, policies., In the proposed rules, the Departments state, “the eligible organization would have no role in, contracting, arranging, paying, or referring for this separate contraceptive coverage” (Federal, Register, Vol. 79, No. 166, Pg. 51118). This is unattainable and unrealistic. Even if SHIP issuers, provide separate individual contraceptive-only coverage to students, religious institutions of higher, education must still be involved in the day-to-day administration of these policies and will be forced, to partake in providing this benefit against their will., In order for a SHIP issuer to offer contraceptive coverage on an individual basis, the institution, must first identify the individual students who wa..."
4,CMS_2014_0115_0059.pdf,5.0,"[IV. Requiring SHIP issuers to provide free contraceptive coverage via individual policies is, contrary to basic contract law., An issued health insurance policy is a contract between an insurance company and the insured., Contracts are binding and enforceable only when one party extends an offer, the other party accepts, the offer, and adequate consideration is exchanged by both. In the context of an insurance policy,, the benefits offered by the issuer in exchange for a premium paid by the insured are both adequate, consideration. However, under the proposed rule, the “insured” is paying no premium, yet the, SHIP issuer must still provide health coverage under the guise of a legally-executed policy. As such,, neither party is contractually protected since this “agreement” is in direct conflict with general, contract law.]"
5,CMS_2014_0115_0059.pdf,6.0,"[V. Request for clarity: The proposed rule seems to distinguish between the “full range of, FDA approved contraception as prescribed” and “certain contraception services”. Is there, more clarity regarding which methods of contraception must be excluded?, The proposed rule references methods of contraception in two seemly distinct ways. The first, reference is regarding the coverage requirement for non-eligible entities which is referenced, numerous times as “Food and Drug Administration (FDA) approved contraception as prescribed by, a health care provider”. These references are made both directly and through reference to the, Health Resources and Services (HRSA) guidelines. A second and distinct reference to contraceptive, methods, specifically in connection with those methods to which the eligible entity may object is, repeatedly referred as “certain contraceptive services”. Nationwide requests clarity on these, references. Is there a method of contraception that is both required ..."
6,CMS_2014_0115_0059.pdf,7.0,"[VI. Proposed solution: Utilize the Exchanges to provide free contraceptive coverage to, students attending religious institutions., Rather than require each individual SHIP carrier to provide free separate contraceptive-only, coverage for students of religious institutions, Nationwide proposes that this individual policy be, offered through the Exchange directly to these students. This would eliminate every one of the, issues discussed above in this comment letter. Not only would the Departments reduce their, administrative burden of auditing and overseeing each individual SHIP carrier, the religious entities, themselves can truly be separated from the process of providing contraceptive coverage. Students, will have the freedom and option to decide for themselves whether they wish to have contraceptive, coverage or not, and those who do can obtain it directly themselves through the Exchange by, applying and submitting the required documentation. Neither the school nor the SHIP car..."
7,CMS_2014_0115_0059.pdf,8.0,"[VII. Proposed solution: Give the covered student an above line reimbursement on their, federal income taxes for all costs incurred for non-covered contraception methods., As opposed to requiring the insurance company to craft individual insurance contracts for specific, methods of contraception, of which the eligible entity would still be an unwitting participant and, would potentially breach the privacy of the covered student. The Departments could allow the, covered persons to submit any expenditure for those services, excluded under their plan, along with, their federal tax return for a full refund. This would eliminate the requirement for insurers to craft, and administer a whole new policy as well as completely remove the eligible entity from any, involvement and thus protect the students’ privacy. The burden to the federal government would be, minimal as it would simply tack the requirement onto an existing function, for which there are, existing guidelines.]"
8,CMS_2014_0115_0059.pdf,9.0,"[Conclusion, Nationwide respectfully proposes that SHIP issuers should not be required to automatically enroll, students attending anti-contraceptive religious institutions into a separate health plan purely for, certain contraceptive services. Not only is this contrary to basic contract law, it is a costly,, cumbersome burden to place upon insurance carriers, TPAs, and religious institutions. Nationwide, also asks that the Departments consider the above alternative options that would still provide these, students access to free contraceptives while reducing administration strain and truly preventing, religious institutions from having to partake in providing contraceptives. As always, we appreciate, the dialogue and look forward to further opportunities to comment., Please contact Bobby Handley, Managing Counsel, at Handleb2@nationwide.com or 614-677-3869, if you would like to discuss any of these issues., Very truly yours,, Teresa Robison, Director, College Segment, Nationwide In..."
9,CMS_2014_0115_0076.pdf,1.0,"[Re: Comments on Proposed Rules on Coverage of, Certain Preventive Services Under the Affordable Care Act, Dear Sir or Madam:, On behalf of the United States Conference of Catholic Bishops (“USCCB”),, we respectfully submit the following comments on the proposed rules on coverage, of certain preventive services under the Affordable Care Act (“ACA”). 79 Fed., Reg. 51118 (Aug. 27, 2014). The rules pertain to application of the contraceptive, mandate to closely-held for-profit companies.1, Our comments follow., 1 We use the term “mandate” or “contraceptive mandate” as shorthand for the requirement that, non-grandfathered health plans and policies provide coverage of drugs and devices that the FDA, has approved as contraceptives (including those that can cause an abortion), sterilization, procedures for women, and related education and counseling. We use the term “contraceptives”, and “contraceptive coverage” to refer to these items and their coverage, respectively.]"


# If you want to check what the function does, here is a brief walkthrough

In [6]:
# gets sections ID
sec_doc = section_ids(df_after_classification[["document", "page", "Class"]])

# Attach the section IDs to the DataFrame row for row through the index.
# (pd.merge raises a MergeError in current pandas when `on=` is combined with
# left_index/right_index, so a plain index join is used instead.)
data = df_after_classification.join(sec_doc["secIDin"])

print(sec_doc.shape)
print(df_after_classification.shape)
print(data.shape)
data.head()

(41910, 4)
(41910, 4)
(41910, 5)


Unnamed: 0,document,page,text,Class,secIDin
0,CMS_2014_0115_0059.pdf,1,2014-10-10 00:00:00,0.0,1.0
1,CMS_2014_0115_0059.pdf,1,Centers for Medicare & Medicaid Services,0.0,1.0
2,CMS_2014_0115_0059.pdf,1,Department of Health and Human Services,0.0,1.0
3,CMS_2014_0115_0059.pdf,1,Attention: CMS-9968-P,0.0,1.0
4,CMS_2014_0115_0059.pdf,1,P.O. Box 8013,0.0,1.0


In [7]:
# Collect the lines of every (document, section) pair into a single list.
text = (
    data.groupby(["document", "secIDin"])["text"]
    .apply(list)
    .reset_index()
)
text.head(3)

Unnamed: 0,document,secIDin,text
0,CMS_2014_0115_0059.pdf,0.0,"[Re: Nationwide Life Insurance Company’s comments on separate contraceptive-only policies for, I. Providing contraceptive coverage in the student market is not cost neutral and imposes a, II. Students enrolled in a religious institution of higher education may not wish to be, III. It is impossible to completely insulate religious institutions of higher education from, IV. Requiring SHIP issuers to provide free contraceptive coverage via individual policies is, V. Request for clarity: The proposed rule seems to distinguish between the “full range of, VI. Proposed solution: Utilize the Exchanges to provide free contraceptive coverage to, VII. Proposed solution: Give the covered student an above line reimbursement on their, Conclusion]"
1,CMS_2014_0115_0059.pdf,1.0,"[2014-10-10 00:00:00, Centers for Medicare & Medicaid Services, Department of Health and Human Services, Attention: CMS-9968-P, P.O. Box 8013, Baltimore, MD 21244-1850]"
2,CMS_2014_0115_0059.pdf,2.0,"[students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide comments in response to CMS–9940–Pin which the Internal, Revenue Service (“IRS”), Employee Benefits Security Administration (“EBSA”), and the, Department of Health and Human Services (“HHS”) solicited comments on its proposed rule, concerning the coverage of certain preventive services under the Patient Protection and Affordable, Care Act (“ACA”). Nationwide currently has the fourth largest share of the student health, insurance plan (“SHIP”) market and insures over 130,000 undergraduate, graduate, and international, students at 183 colleges and universities throughout the U.S. We do not offer any other group or, individual major medical health policies in any market., The Departments asked for input regarding the proposed requirement for SHIP issuers to, automatically enroll covered stude..."


In [8]:
# Title rows only (Class == 1); every other row becomes all-NaN and is dropped.
titles = (
    data[["document", "text", "Class"]]
    .where(data["Class"] == 1)
    .dropna()
    .reset_index(drop=True)
)
# The k-th title of a document is matched with section k + 1.
titles["secIDin"] = titles.groupby("document")["Class"].rank(method="first") + 1
titles = titles.rename(index=str, columns={"text": "title"}).drop("Class", axis=1)

# Body sections only: 0 holds the title rows, 1 the lines before the first title.
text = text[~((text["secIDin"] == 0) | (text["secIDin"] == 1))]

# Pair every body section with its title.
temp_df = text.merge(titles, on=["document", "secIDin"])
temp_df.head(3)

Unnamed: 0,document,secIDin,text,title
0,CMS_2014_0115_0059.pdf,2.0,"[students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide comments in response to CMS–9940–Pin which the Internal, Revenue Service (“IRS”), Employee Benefits Security Administration (“EBSA”), and the, Department of Health and Human Services (“HHS”) solicited comments on its proposed rule, concerning the coverage of certain preventive services under the Patient Protection and Affordable, Care Act (“ACA”). Nationwide currently has the fourth largest share of the student health, insurance plan (“SHIP”) market and insures over 130,000 undergraduate, graduate, and international, students at 183 colleges and universities throughout the U.S. We do not offer any other group or, individual major medical health policies in any market., The Departments asked for input regarding the proposed requirement for SHIP issuers to, automatically enroll covered stude...",Re: Nationwide Life Insurance Company’s comments on separate contraceptive-only policies for
1,CMS_2014_0115_0059.pdf,3.0,"[significant administrative burden upon all parties., The Department’s basic premise that “issuers generally would find that providing such contraceptive, coverage is cost neutral” is in error (Federal Register, Vol. 78, No. 25, Pg. 8463). Economic,, actuarial, and administrative data prove otherwise. A February 2012 publication by the Office of the, Assistant Secretary for Planning and Evaluation (ASPE) entitled The Cost of Covering Contraceptives, through Health Insurance found that the actual cost of adding contraceptives with no cost sharing to an, existing plan is between $21 and $41 annually per insured, as determined by three separate actuarial, companies. Adjusted for inflation, using 8% medical trend per year this equates to $26-$52 annually., In the SHIP arena with average policies of about $1600, this would be result in a 2-3.5 % increase in, rate. It is important to note that the $26-$52 estimated price is only to add contraceptives to an, existing plan – the proposed r...",I. Providing contraceptive coverage in the student market is not cost neutral and imposes a
2,CMS_2014_0115_0059.pdf,4.0,"[automatically enrolled in a contraceptive-only health plan., Students who choose to attend a religious institution of higher learning do so for a reason, and most, of the time, these students strongly believe and share in the religious convictions of their chosen, institution. Therefore, many who attend anti-contraceptive institutions may not wish to be or may, even be extremely offended by being automatically enrolled in an individual contraceptive-only, policy. It will simply be a waste of resources for all parties involved if an issuer creates, determines, eligibility, and enrolls all students into an individual health policy, just for the student to immediately, cancel., Students attending religious institutions are aware of the religious doctrines the school has employed, as part of their daily lives. Just like the decision-making process to attend any school, students are, acutely aware of the positives and negatives of attending a religious institution. If access to free, c...",II. Students enrolled in a religious institution of higher education may not wish to be


In [9]:
# adds the titles to the text
# New lists are built explicitly: mutating objects from inside DataFrame.apply
# is not supported by pandas, and editing the lists in place would also change
# the lists still referenced by the `text` frame built earlier.
temp_df["text"] = [
    [title] + lines for title, lines in zip(temp_df["title"], temp_df["text"])
]
temp_df = temp_df.drop(columns=["title"])
# Shift so that the first titled section is number 1.
temp_df["secIDin"] = temp_df["secIDin"] - 1
temp_df.head(3)

Unnamed: 0,document,secIDin,text
0,CMS_2014_0115_0059.pdf,1.0,"[Re: Nationwide Life Insurance Company’s comments on separate contraceptive-only policies for, students of religious institutions, To Whom It May Concern:, On behalf of Nationwide Life Insurance Company (“Nationwide”) and its affiliated companies, we, appreciate the opportunity to provide comments in response to CMS–9940–Pin which the Internal, Revenue Service (“IRS”), Employee Benefits Security Administration (“EBSA”), and the, Department of Health and Human Services (“HHS”) solicited comments on its proposed rule, concerning the coverage of certain preventive services under the Patient Protection and Affordable, Care Act (“ACA”). Nationwide currently has the fourth largest share of the student health, insurance plan (“SHIP”) market and insures over 130,000 undergraduate, graduate, and international, students at 183 colleges and universities throughout the U.S. We do not offer any other group or, individual major medical health policies in any market., The Departments asked for in..."
1,CMS_2014_0115_0059.pdf,2.0,"[I. Providing contraceptive coverage in the student market is not cost neutral and imposes a, significant administrative burden upon all parties., The Department’s basic premise that “issuers generally would find that providing such contraceptive, coverage is cost neutral” is in error (Federal Register, Vol. 78, No. 25, Pg. 8463). Economic,, actuarial, and administrative data prove otherwise. A February 2012 publication by the Office of the, Assistant Secretary for Planning and Evaluation (ASPE) entitled The Cost of Covering Contraceptives, through Health Insurance found that the actual cost of adding contraceptives with no cost sharing to an, existing plan is between $21 and $41 annually per insured, as determined by three separate actuarial, companies. Adjusted for inflation, using 8% medical trend per year this equates to $26-$52 annually., In the SHIP arena with average policies of about $1600, this would be result in a 2-3.5 % increase in, rate. It is important to note that th..."
2,CMS_2014_0115_0059.pdf,3.0,"[II. Students enrolled in a religious institution of higher education may not wish to be, automatically enrolled in a contraceptive-only health plan., Students who choose to attend a religious institution of higher learning do so for a reason, and most, of the time, these students strongly believe and share in the religious convictions of their chosen, institution. Therefore, many who attend anti-contraceptive institutions may not wish to be or may, even be extremely offended by being automatically enrolled in an individual contraceptive-only, policy. It will simply be a waste of resources for all parties involved if an issuer creates, determines, eligibility, and enrolls all students into an individual health policy, just for the student to immediately, cancel., Students attending religious institutions are aware of the religious doctrines the school has employed, as part of their daily lives. Just like the decision-making process to attend any school, students are, acutely aware ..."
