In [1]:
import pandas as pd
import pycountry
import re

In [2]:
df_actual = pd.read_csv("hl_data/actual_removed_duplicated_uncodable.csv")

In [3]:
df_actual['source'] = "manual"

In [4]:
# Manual override
df_actual.code[df_actual["identifier"]==301929] = "SAFEABORTION"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_actual.code[df_actual["identifier"]==301929] = "SAFEABORTION"


In [5]:
df_modelled = pd.read_csv("hl_data/automated_coding_from_greg.csv")
df_modelled['source'] = "greg"

## Include the UNCODABLE data that was recoded with Sklearn

In [6]:
df_recoded = pd.read_csv("hl_data/02_sklearn_recoded_uncodable.csv")

In [7]:
# We only want to auto recode where relatively high confidence
df_recoded = df_recoded[df_recoded.prediction_confidence > 0.3]

In [8]:
len(df_recoded)

49097

In [9]:
df_recoded["code"] = df_recoded["prediction"]
df_recoded["source"] = "tw_sklearn"
df_recoded.drop(["prediction", "prediction_confidence"], axis=1, inplace=True)

In [10]:
df_actual = df_actual[~df_actual.identifier.isin(set(df_recoded.identifier))]

In [89]:
df_all = pd.concat([df_actual, df_modelled, df_recoded])

# Delete responses that are just typos

In [90]:
to_delete = ["ayala", "chika emma", "odulio", "nabisalo", "masarira", "efgyk", "kendro", "martyr", "fmysojh", "^blank$"]
to_delete_regex = re.compile(r'(?i)' + "|".join(to_delete))

In [91]:
df_all = df_all[df_all.raw_response.apply(str).apply(lambda s : not to_delete_regex.search(s))]

# Recode PRIVACY

Privacy and RESPECTFULCARE & BETTERFACILITIES

Kristy wants it so that if it is ONLY ‘privacy’, it goes into RESPECTFULCARE. But, if it is something related to both privacy and the facility, it should be double coded (so, if it has to do with curtains, your own bed, or the search terms below) it should be double coded in both RESPECTFULCARE and BETTERFACILITIES.

Based on our codebook, these terms should be doublecoded now in both RESPECTFULCARE and BETTERFACILITIES:

“I want my own bed”

“I don’t want to share beds”

“I want my own room”

“I want a curtain”

But, if it is just a single word (privacy) it should JUST go into RESPECTFULCARE.

In [92]:
df_all.code[(df_all.code=="PRIVACY") & (df_all.raw_response.apply(str).str.match("(?i).*(bed|room|curtain|ward).*"))] = "LABOR/BETTERFACILITIES"

## Include the UNCODABLES that were recoded manually

In [15]:
import os

In [93]:
len(df_all[df_all.code=="UNCODABLE"])

25011

In [94]:
for root, _, files in os.walk("hl_data/UNCODABLES Handcoded/"):
    for f in files:
        if f.endswith("xlsx"):
            df = pd.read_excel(os.path.join(root, f))
            for idx in range(len(df)):
                identifier = df.identifier.iloc[idx]
                raw_response_in_excel = df.raw_response.iloc[idx]
                
                raw_response_origs = list(df_all.raw_response[df_all.identifier == identifier])
                
                if len(raw_response_origs) > 0 and raw_response_in_excel == raw_response_origs[0]:
                    df_all.code[df_all.identifier == identifier] = df.code.iloc[idx]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all.code[df_all.identifier == identifier] = df.code.iloc[idx]


In [95]:
len(df_all[df_all.code=="UNCODABLE"])

238

# Recode according to March corrections spreadsheet

Received from Diana 8 Mar 2021

In [96]:
def normalise_text_for_corrections(text):
    return re.sub(r'\s+', ' ', str(text).lower().strip()) # sometimes the corrections Excel has newlines

In [97]:
cleaned_raw_response_for_matching_to_excel = df_all.raw_response.apply(normalise_text_for_corrections)

In [98]:
for sheet_name in ['Correction Sheet', 'Special Cases']:
    print ("doing corrections from sheet", sheet_name)
    df_corrections = pd.read_excel (r'hl_data/COMPILATIONS.xlsx', sheet_name=sheet_name)
    for idx in range(2, len(df_corrections)):
        correction_text = df_corrections["Dashboard Survey Response"].iloc[idx]
        rows_to_correct_indices = df_all.raw_response == correction_text
        if sum(rows_to_correct_indices) == 0 and len(correction_text) > 80:
            # sometimes the correction text in the spreadsheet is the truncated version of the full text
            # in this case, if nothing was found by "exact match", try via "starts with"
            rows_to_correct_indices = cleaned_raw_response_for_matching_to_excel.str.startswith(normalise_text_for_corrections(correction_text))

        if sum(rows_to_correct_indices) == 0:
            print ("No rows corrected", idx)
            print ("\t", correction_text)


        if sum(rows_to_correct_indices) > 0:
            final_code = df_corrections["actual/final code"].iloc[idx]
            if not pd.isna(df_corrections["secondary/double code (if needed)"].iloc[idx]):
                final_code += "/" + df_corrections["secondary/double code (if needed)"].iloc[idx]
            if not pd.isna(df_corrections["triple code (if needed)"].iloc[idx]):
                final_code += "/" + df_corrections["triple code (if needed)"].iloc[idx]

            df_all.loc[rows_to_correct_indices, "code"] = final_code

doing corrections from sheet Correction Sheet
No rows corrected 89
	 Disability Info, Services, Supplies / Timely and attentive care
No rows corrected 122
	 Campaigns To Maternal Health And Reproductive Health Isolated Corners
No rows corrected 148
	 "increased health personnel
increased material and medical tools
campaigns sexual and reproductive information"
No rows corrected 217
	 Campaigns To Maternal Health And Reproductive Health Isolated Corners
No rows corrected 243
	 increased health personnel, increased material and medical tools, campaigns sexual and reproductive information
No rows corrected 347
	 Improvement of work efficiency 
No rows corrected 437
	  Bring us pants to our school (Kaliene Secondary School) 
No rows corrected 694
	 1. Hospital required in Village, 2. Staff at the hospital/PHC/Anganwadi should be sensitive
doing corrections from sheet Special Cases


# Replace common misspellings

In [99]:
df_all.replace({r'(?i)\bsavage\b': 'sewage', # savage system but not "savagery" - different word
                r'(?i)\bquark':'quack'}, regex=True, inplace=True)

## Rename Codes

In [100]:
hierarchy = {
    "Broader health, development and rights":
        {"ENVIRON": "Environmental health and agricultural support",
         # "?": "Continuity of Care",  # not found
       #  "EQUITABLE": "Equitable care (Universal Healthcare)",
         "RSCHTECH": "Evidence, research, innovation and technology",
         "POWER": "Power, Policy/Politics and Rights",
         #"WOMENSEQUALITY": "Empowerment and rights (women’s leadership)"
         },
    "Equity, Dignity, and Respect":
        {"ATTENTIVENESS": "Timely and attentive care",  # merged with 24x7
         # "BIRTHCOMPANION": "Birth Companion Choice", # no data
         # "INFORMEDCONSENT": "Informed Consent", # no data
        # "NOABUSE": "Ethical, lawful non-abusive and secure care",
         # "NONEGLIGENCE": "No unprofessional conduct", # no data
         #"RESPECT": "Respectful and dignified care",  # ??? not sure
          "RESPECTFULCARE": "Respectful and Dignified Care"
         },
    "Facility Improvements":
        {  # "24X7": "24x7 Services, Facilities, Providers", # merged into ATTENTIVENESS
          #  "BEDS": "Beds and bedding", # add to BETTERFACILITIES
            #"ELECTRICITY": "Electricity",
            #  "KITCHEN": "Improved kitchens", # no data
            #'"LABORATORIES": "Laboratories", # added to BETTERFACILITIES
            #"MANAGEMENT": "Administration and record-keeping",
            # "MATERNALWARDS": "Maternal, women's health wards, centers, waiting rooms", # no data
            # "PRIVACY": "More space and privacy",  # ???? # no data
            "BETTERFACILITIES": "Increased, full-functioning and close health facilities",
            "TRANSPORTATION": "Transportation infrastructure",
            "WASH": "Water, sanitation and hygiene"
        },
    "Health Professionals":
        {#"DOCTORS": "Increased, competent and better supported doctors",
         "HEALTHPROFESSIONALS": "Health professionals and related needs",
         #'"NURSESMIDWIVES": "Increased competent and better supported midwives and nurses",
         #"FEMALEPROVIDERS": "More female health providers",
         #"MALEPROVIDERS": "Male health providers",
         #"SPECIALISTS": "Specialists (surgeons, anesthetists)",
         #"SUPPORTLINKS": "Support for traditional, mobile and community health workers"
         },
    # "Uncodable": {
    #     "UNCODABLE": "Uncodable"
    # },
    "Other":
        { #"COMMUNITY": "Community engagement and accountability",
         "NOINTERVENTION": "Reduced medicalization or do not want service",
         "FITNESS": "Fitness and recreation",
         # "QUOTE": "Good Quotes",
         "HEALTH": "Improved health, well-being, health services",
         "FAMILY": "Male engagement, shifts in family/partner dynamics",
         # "INNOVATION": "Innovation, R&D and Technology", # merged into EVIDENCEBASED
         "ECONOMIC": "Economic opportunity and financial support",
         #  "MALEENGAGEMENT": "Male Engagement", # no data
         #  "PARENTALLEAVE": "Maternity, Paternity Leave", # no data
         "NODEMAND": "No demand, everything is OK",
         #"NOHARMFULPRACTICE": "End violence, harmful practices against women and girls",
         "OTHERNONDETERMINABLE": "Other Asks and Non-Determinable Requests",
         #"PEACE": "Peace, no conflict",
         "RELIGION": "Religious support",
         "SCHOOLS": "Schools and educational opportunity",
         #"WANTCHILDREN": "Want children"
         },
    #"Patient-Provider Communication":
     #   {

            # "BETTERCOMMUNICATION": "Complete and understandable communication", # added to RESPECTFULCARE
         #    "COMMUNICATED": "Communication understanding (e.g. language)", # no data
         #"CONFIDENTIALITY": "Confidentiality and privacy",
         #   "FRIENDLY": "Friendly, hospitable and polite", # no data
         #  "NOSTIGMA": "No Judgement or Stigma", # no data
         # "TOFEELHEARD": "To feel heard and listened to, shared trust" #  no data
      #   },
    "Rights and Affordability": {
        "FREE": "Free, affordable and equitable care",
        #   "NOCORRUPTION": "No corruption, informal payments, appropriate payment procedures", # no data
        #   "RECEIVING": "Receiving Entitled Government Benefit, Timely Reimbursements" #  no data
    },
    "Services, Supplies, and Information":
        {
            "ADOLESCENT": "Adolescent & Youth-Friendly Info, Services, Supplies",
            "ANTENATAL": "Antenatal and Prenatal Info, Services, Supplies",
            #   "ALTERNATIVES": "Availability of Alternative Birthing Practices", #  no data
            "CANCER": "Breast, Cervical and Other Cancers Info, Services, Supplies",
            "CHILD": "Child health Info, Services, Supplies (Vaccines)",
            "DISABILITY": "Disability Info, Services, Supplies",
            "FAMILYPLANNING": "Family Planning & Contraceptive Info, Services, Supplies",
            "FOOD": "Food and Nutrition Info, Services, Supplies",
            "HIVSTITB": "HIV, STI, Hepatitis and TB Info, Services, Supplies",
            "INFERTILITY": "Infertility Info, Services, Supplies",
            "INFORMATION": "Counseling and awareness on health & services",
            "LABOR": "Labor and delivery Info, services, supplies",
            "LGBTQ": "LGBTQ Info, Services and Supplies",
            "MALARIA": "Malaria and Vector-borne diseases Info, Services, Supplies",
            "MENSTRUAL": "Menstrual Health Info, Services, Supplies",
            "POSTPARTUM": "Post-Partum/Mental Health Info, Services, Supplies",
          #  "MISCARRIAGE": "Miscarriage Info, services, supplies",
            "NCDS": "Noncommunicable Diseases Info, Services, Supplies",
            #    "PAINMANAGEMENT": "Pain Management Info, Services, Supplies", # no data
            "POSTMENOPAUSAL": "Post-menopausal/elderly Info, Services, Supplies",
            # "POSTPARTUM": "Post-partum, Newborn and Infant Health Info, Services, Supplies", #  no data
            #"REFERRAL": "Referral system",
            "ABORTION": "Abortion/Miscarriage Info, Services, Supplies",
            "SUPPLIES": "Medicines and supplies (blood)",
            "OTHERSERVICES": "Other specific services (e.g. dentistry, eye care)",
        },
    # "To feel safe from threat, danger and discrimination when seeking health services":
    #   {
    # "NODISCRIMINATION": "No Discrimination or Denial of Services re Sex, Ethnicity, Race, Class or Migratory Status", # no data
    #  "NOARRESTFEAR": "No Fear of Deportation, Detainment or Arrest", # no data
    #   "SECURITY": "Security provisions", # no data
    #       "TOFEELSAFE": "To feel safe and free from threat and danger (general)" #  no data
    #  }

}

category_colours = {'Broader health, development and rights': '#2ca02c',
                    'Equity, Dignity, and Respect': '#ff7f0e',
                    'Facility Improvements': '#17becf',
                    'Health Professionals': '#bcbd22',
                    'Other': 'lightslategray',
                    'Patient-Provider Communication': '#e377c2',
                    'Rights and Affordability': '#d62728',
                    'Services, Supplies, and Information': '#1f77b4'}

mapping_to_top_level = {}
mapping_to_description = {}
for top_level, leaves in hierarchy.items():
    for code, name in leaves.items():
        mapping_to_top_level[code] = top_level
        mapping_to_description[code] = name


def get_menu_items():
    l = []
    # l.append({"label": "all categories", "value": ""})
    for top_level, leaves in hierarchy.items():

        l.append({"label": top_level, "value": top_level})
        for code, name in leaves.items():
            l.append({"label": "— " + name, "value": code})

    return l

In [101]:
len(mapping_to_description)

39

In [102]:
import operator
for a, b in sorted(mapping_to_description.items(), key=operator.itemgetter(0)):
    print (a + "\t" + b)

ABORTION	Abortion/Miscarriage Info, Services, Supplies
ADOLESCENT	Adolescent & Youth-Friendly Info, Services, Supplies
ANTENATAL	Antenatal and Prenatal Info, Services, Supplies
ATTENTIVENESS	Timely and attentive care
BETTERFACILITIES	Increased, full-functioning and close health facilities
CANCER	Breast, Cervical and Other Cancers Info, Services, Supplies
CHILD	Child health Info, Services, Supplies (Vaccines)
DISABILITY	Disability Info, Services, Supplies
ECONOMIC	Economic opportunity and financial support
ENVIRON	Environmental health and agricultural support
FAMILY	Male engagement, shifts in family/partner dynamics
FAMILYPLANNING	Family Planning & Contraceptive Info, Services, Supplies
FITNESS	Fitness and recreation
FOOD	Food and Nutrition Info, Services, Supplies
FREE	Free, affordable and equitable care
HEALTH	Improved health, well-being, health services
HEALTHPROFESSIONALS	Health professionals and related needs
HIVSTITB	HIV, STI, Hepatitis and TB Info, Services, Supplies
INFERTILITY	

In [103]:
# some IMPROVED codes look like they need to be merged
def clean_up_code(code):
    if type(code) is str:
        code = code.upper()
    if code in ["INNOVATION", "EVIDENCEBASE", "EVIDENCEBASED"]:
        return "RSCHTECH"
    if code == "24X7":
        return "ATTENTIVENESS"
    if code == "HEALTHCARE":
        return "HEALTH"
    if code == "GOOD":
        return "HEALTH"
    if code in ["ALTERNATIVES", "BIRTHCOMPANION", "MATERNALWARDS"]:
        return "LABOR"
    if code in ["PRIVACY", "BEDS"]:
        return "BETTERFACILITIES"
    if code in ["KITCHEN"]:
        return "FOOD"
    if code in ["RECEIVING", "EQUITABLE"]:
        return "FREE"
    if code in ["COMMUNICATED", "BETTERCOMMUNICATION", "NOSTIGMA", "RESPECT", "FRIENDLY", "INFORMEDCONSENT", \
                "TOFEELHEARD", "RESPECTFULCARE", "CONFIDENTIALITY"]:
        return "RESPECTFULCARE"
    if code in ["MALEENGAGEMENT", "IMPROVED"]:
        return "FAMILY"
    if code in ["MENTAL"]:
        return "POSTPARTUM"
    if code in ["NOARRESTFEAR", "NOCORRUPTION", "NODISCRIMINATION", "SECURITY", "NOABUSE", "SECURECARE"]:
        return "RESPECTFULCARE"
    if code == "OTHER":
        return "OTHERSERVICES"
    # new merges and renames 15 Jan 2021
    if code == "CLIMATE":
        return "ENVIRON"
    if code in [ "COMMUNITY", "ENGAGEACCT", "FUNDING", "POLCHANGE", "NOHARMFULPRACTICE", "PEACE", "WOMENSEQUALITY"]:
        return "POWER"
    if code in ["MIDWIVES", "DOCTORS", "FEMALEPROVIDERS", "MALEPROVIDERS", "NURSESMIDWIVES", "SPECIALISTS", "SUPPORTLINKS", "COMMWORK"]:
        return "HEALTHPROFESSIONALS"
    if code == "HIV":
        return "HIVSTITB"
    if code == "JOBS":
        return "ECONOMIC"
    if code in ["MANAGEMENT", "ADMIN", "REFERRAL", "BED", "ELECTRICITY", "LABORATORIES"]:
        return "BETTERFACILITIES"
    if code in ["MISCARRIAGE", "SAFEABORTION"]:
        return "ABORTION"
    if code in ["NOTRELATED", "OTHERQUESTIONABLE"]:
        return "OTHERNONDETERMINABLE"
    if code == "RELIGIOUS":
        return "RELIGION"
    if code == "WANTCHILDREN":
        return "INFERTILITY"
    else:
        return code

In [104]:
def sanitise_string(code):
    return re.sub(r'[^A-Z0-9]', '', code.upper())

In [105]:
def uber_clean_up_code(code):
    if type(code) is not str:
        return code
    code = sanitise_string(code)
    code = clean_up_code(code)
    code = sanitise_string(code)
    return code

In [106]:
len(df_all[df_all.canonical_code == "CONFIDENTIALITY"])

0

In [107]:
df_all["canonical_code"] = df_all.code.apply(uber_clean_up_code)

In [108]:
df_all["age"] = df_all["age"].apply(lambda x : 100 if x > 100 else (0 if x <= 14 else x))

In [109]:
df_all["canonical_country"] = df_all["iso_country"].apply(lambda x : re.sub(r', .+', '', pycountry.countries.lookup(x).name))

## If something is in OTHERQUESTIONABLE and also in another category, remove OTHERQUESTIONABLE

In [110]:
identifiers_otherquestionable = set(df_all[df_all.canonical_code=="OTHERNONDETERMINABLE"].identifier)
identifiers_not_otherquestionable = set(df_all[df_all.canonical_code!="OTHERNONDETERMINABLE"].identifier)
otherquestionables_to_delete = set(identifiers_otherquestionable.intersection(identifiers_not_otherquestionable))

In [111]:
df_all = df_all[~df_all.identifier.isin(otherquestionables_to_delete)]

In [112]:
# Rename OTHERQUESTIONABLE to OTHERNONDETERMINABLE
df_all["canonical_code"] = df_all["canonical_code"].apply(lambda x : "OTHERNONDETERMINABLE" if x == "OTHERNONDETERMINABLE" else x)

In [113]:
df_all = df_all[df_all.canonical_code.apply(lambda x : type(x) is str)]

## Remove the entries for countries that have only <10 responses

In [114]:
print (len(df_all))

1192942


In [115]:
df_all = df_all[~df_all["canonical_country"].isin(["Chile", "Honduras", "Senegal", "South Sudan", "United Kingdom", "United States"])]

In [116]:
print (len(df_all))

1192558


## Remove the entries where the canonical code is not in the master list

First print what those codes are which are to be removed.

Mostly typos from the hand coding.

In [117]:
df_all.canonical_code[~df_all["canonical_code"].isin(mapping_to_top_level)].value_counts()

ANTENATALADOLESCENTFOOD              4809
LABORBETTERFACILITIES                2508
ATTENTIVENESSTRANSPORTATION          1730
HEALTHPROFESSIONALSRESPECTFULCARE    1482
LABORFREE                            1469
                                     ... 
INFORMATIONPOWER                        1
HEALH                                   1
HEALTHWORKERS                           1
ELECTRCITY                              1
POSTPARUM                               1
Name: canonical_code, Length: 201, dtype: int64

In [118]:
len(df_all[~df_all["canonical_code"].isin(mapping_to_top_level)])

20738

In [119]:
df_all = df_all[df_all["canonical_code"].isin(mapping_to_top_level)]

In [120]:
print (len(df_all))

1171820


In [121]:
df_all = df_all[df_all.raw_response.str.len() >= 3]

In [122]:
MAX_LENGTH = 300
def shorten(x):
    if len(x) < MAX_LENGTH:
        return x
    else:
        return re.sub(r'\W+\w+$', '', x[:MAX_LENGTH])

In [123]:
len(df_all), len(set(df_all.identifier))

(1171800, 1018487)

In [124]:
df_all.identifier.value_counts()

789826    10
784036     9
292924     8
298278     8
298276     8
          ..
322057     1
320008     1
301575     1
309763     1
4098       1
Name: identifier, Length: 1018487, dtype: int64

In [125]:
from functools import reduce
def term_aggregator(s):
    return reduce(lambda x, y: x + "/" + y, s)

In [49]:
df_all_grouped = df_all.groupby("identifier").aggregate({"raw_response":"first",
                                                         "iso_country":"first",
                                                         "age":"first",
                                                         "canonical_code":term_aggregator,
                                                         "canonical_country":"first",
                                                        "identifier":"first",
                                                        "source":"first"})

## Double code LABOR to BETTERFACILITIES

Special fix for LABOR: double coding note: will need to be double coded so that the responses that had to do with maternity wards also go in BETTERFACILITIES

In [50]:
df_all_grouped[df_all_grouped.canonical_code == "LABOR"].raw_response.sample(10)

identifier
418766                                   Provide pain killer
899349                          Need child delivery facility
261833                                    FREE DELIVERY KITS
1008318                             Maternity home is needed
1020897                      Provision of maternity hospital
1033927                            Want facilities for women
958671         Maternity home should be establish in village
1008270        Maternal health facilities should be provided
810087                    REQUIRED ICU ROOM IN THE HOSPITAL.
793956     Greater attention to women with childbirth com...
Name: raw_response, dtype: object

In [51]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("LABORATORIES")])

0

In [52]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("SUPPLIES")])

82883

In [53]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("BETTERFACILITIES")])

103775

In [54]:
for idx in range(len(df_all_grouped)):
    if "LABORATORIES" in df_all_grouped.canonical_code.iloc[idx] and "BETTERFACILITIES" not in df_all_grouped.canonical_code.iloc[idx]  and "SUPPLIES" not in df_all_grouped.canonical_code.iloc[idx]:
        text = df_all_grouped.raw_response.iloc[idx].lower()
        if "test" in text:
            df_all_grouped["canonical_code"].iloc[idx] += "/SUPPLIES"
            df_all_grouped["canonical_code"].iloc[idx] = re.sub("LABORATORIES/", "", df_all_grouped["canonical_code"].iloc[idx])
        else:
            df_all_grouped["canonical_code"].iloc[idx] += "/BETTERFACILITIES"

In [55]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("LABORATORIES")])

0

In [56]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("SUPPLIES")])

82883

In [57]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("BETTERFACILITIES")])

103775

If it is ONLY ‘privacy’, it goes into RESPECTFULCARE. But, if it is something related to both privacy and the facility, it should be double coded (so, if it has to do with curtains, your own bed, or the search terms below) it should be double coded in both RESPECTFULCARE and BETTERFACILITIES.

In [58]:
for idx in range(len(df_all_grouped)):
    if "LABOR" in df_all_grouped.canonical_code.iloc[idx]:
        text = df_all_grouped.raw_response.iloc[idx].lower()
        if "ward" in text or "room" in text or "bed" in text:
            if "BETTERFACILITIES" not in df_all_grouped.canonical_code.iloc[idx]:
                df_all_grouped["canonical_code"].iloc[idx] += "/BETTERFACILITIES"
            if "RESPECTFULCARE" not in df_all_grouped.canonical_code.iloc[idx]:
                df_all_grouped["canonical_code"].iloc[idx] += "/RESPECTFULCARE"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [59]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("BETTERFACILITIES")])

109321

In [60]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("LABOR")])

47339

In [61]:
for idx in range(len(df_all_grouped)):
    if "NOINTERVENTION" in df_all_grouped.canonical_code.iloc[idx] and "LABOR" not in df_all_grouped.canonical_code.iloc[idx]:
        text = df_all_grouped.raw_response.iloc[idx].lower()
        if re.findall("ca?esar|c[- ]?section", text):
            df_all_grouped["canonical_code"].iloc[idx] += "/LABOR"
            df_all_grouped["canonical_code"].iloc[idx] = re.sub("NOINTERVENTION/", "", df_all_grouped["canonical_code"].iloc[idx])

In [62]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("LABOR")])

47364

# only "privacy" -> "RESPECTFULCARE"

In [63]:
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx].lower()
    if text == "privacy":
        df_all_grouped["canonical_code"].iloc[idx] = "RESPECTFULCARE"

# Best/good deal -> +RESPECTFULCARE

In [64]:
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx].lower()
    if  "good deal" in text or "best deal" in text:
        df_all_grouped["canonical_code"].iloc[idx] += "/RESPECTFULCARE"

# Free/cheap/affordable/money

all mentions can now be double coded in their topic category and the FREE code

In [65]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("FREE")])

63553

In [66]:
num_changed = 0
pattern = re.compile(r'(?i).*(?:free|cheap|affordable|money)')
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx]
    if pattern.match(text):
        df_all_grouped["canonical_code"].iloc[idx] += "/FREE"
        num_changed += 1
num_changed

99437

In [67]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("FREE")])
# was 85124 before fix of .*

109343

# Girl/Teen/Teenage/Young/Youth

all mentions can now be double coded to expanded Adolescent code

In [68]:
len(df_all_grouped[df_all_grouped.canonical_code.str.contains("ADOLESCENT")])

15918

In [69]:
pattern = re.compile(r'(?i).*(?:girl|\bteens?\b|teenage|young|youth)')
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx]
    if pattern.match(text):
        df_all_grouped["canonical_code"].iloc[idx] += "/ADOLESCENT"

# "child"

anything that now mentions child or children can be double coded to CHILD code

In [70]:
pattern = re.compile(r'(?i).*child')
neg = re.compile(r'(?i)(want|have).+child')
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx]
    if pattern.match(text) and not neg.match(text):
        df_all_grouped["canonical_code"].iloc[idx] += "/CHILD"

# Maternity or labor ward/wing

double code to BETTERFACILITIES and LABOR

In [71]:
pattern = re.compile(r'(?i).*(?:maternity|labou?r).+(?:ward|bed|room|wing)')
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx]
    if pattern.match(text):
        df_all_grouped["canonical_code"].iloc[idx] += "/BETTERFACILITIES/LABOR"

# Requests around Privacy AND confidentiality

double code to BETTERFACILITIES and RESPECTFULCARE

In [72]:
pattern = re.compile(r'(?i).*(?:priva(?:cy|te).+confidential)|(?:confidential.+priva(cy|te))')
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx]
    if pattern.match(text):
        df_all_grouped["canonical_code"].iloc[idx] += "/BETTERFACILITIES/RESPECTFULCARE"

# Within WASH, Marissa noticed that requests were wrongly double coded to WASH when specifically referencing labor/ maternity ward/delivery.

This is almost exclusively coming from the Nigeria data.

In [73]:
pattern = re.compile(r'(?i).*(?:deliver|labou?r|maternity)')
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx]
    if df_all_grouped.iso_country.iloc[idx] == "ng" and pattern.match(text):
        df_all_grouped["canonical_code"].iloc[idx] = re.sub("/WASH|WASH/", "", df_all_grouped["canonical_code"].iloc[idx])

# Single word request for “companionship”¶

should be double coded in LABOR and RESPECTFULCARE

In [74]:
pattern = re.compile(r'(?i)^companions?(?:hip)?$')
for idx in range(len(df_all_grouped)):
    text = df_all_grouped.raw_response.iloc[idx]
    if pattern.match(text):
        df_all_grouped["canonical_code"].iloc[idx] = "LABOR/RESPECTFULCARE"

## Add columns with normalised text

Normalizing UK->US spelling

removing suffixes such as "ing"

In [75]:
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()

In [76]:
import localspelling

In [77]:
spelling_lookup = localspelling.spelling_converter.lookups['us'][1]

In [78]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import operator

lemmatizer = WordNetLemmatizer()

# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    #sentence = localspelling.convert_spelling(sentence, "us")
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        orig_word = word
        word = spelling_lookup.get(word, word)
        
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatised = lemmatizer.lemmatize(word, tag)
            if lemmatised != word and word == orig_word:
                lemmatised = spelling_lookup.get(lemmatised, lemmatised)
            lemmatized_sentence.append(lemmatised)
    return lemmatized_sentence # " ".join(lemmatized_sentence)

print(lemmatize_sentence("I am loving it")) #I be love it

['I', 'be', 'love', 'it']


In [79]:
df_all_grouped['tokenized'] = df_all_grouped['raw_response'].apply(lambda x : str(x).lower()).apply(lemmatize_sentence)

In [80]:
df_all_grouped['lemmatized'] = df_all_grouped['tokenized'].apply(lambda x : " ".join(x))

In [81]:
df_all_grouped["lemmatized"].sample(10)

identifier
407032                              provision of laboratory
99313      availability of asha worker during delivery time
744324                  maintain hygiene of maternal center
613557                           the free maternity service
729911                         free transport ( ambulance )
222314                             availability of resource
380572                            be polite to the customer
197859                 availability of doctor at sub center
835426                                   doctor - need more
708051    well-equipped rural dispensary that meet the n...
Name: lemmatized, dtype: object

# Remove duplicate canonical codes, which mess up code counts

In [82]:
def clean_up_canonical_code(orig):
    values = orig.split("/")
    return "/".join(set(values))
df_all_grouped["canonical_code"]= df_all_grouped.canonical_code.apply(clean_up_canonical_code)

In [83]:
df_all_grouped.sample(5)

Unnamed: 0_level_0,raw_response,iso_country,age,canonical_code,canonical_country,identifier,source,tokenized,lemmatized
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
195151,Vanitha said that the men here are delivering ...,in,28,CHILD/HEALTHPROFESSIONALS,India,195151,manual,"[vanitha, say, that, the, men, here, be, deliv...",vanitha say that the men here be deliver the d...
111597,"Doctors, nurse, staff availability",in,21,HEALTHPROFESSIONALS,India,111597,manual,"[doctor, ,, nurse, ,, staff, availability]","doctor , nurse , staff availability"
1061496,Consent,mw,26,RESPECTFULCARE,Malawi,1061496,manual,[consent],consent
443031,I WANT FREE MATERNITY CARE,ng,32,FREE,Nigeria,443031,manual,"[i, want, free, maternity, care]",i want free maternity care
148078,Free medicines and drugs,in,29,FREE,India,148078,manual,"[free, medicine, and, drug]",free medicine and drug


## Drop columns not needed for dashboard

In [211]:
df_all_grouped["alpha3country"] = df_all_grouped["iso_country"].apply(lambda x : re.sub(r', .+', '', pycountry.countries.lookup(x).alpha_3))

In [212]:
df_all_grouped.sample(5)

Unnamed: 0_level_0,raw_response,iso_country,age,canonical_code,canonical_country,identifier,source,tokenized,lemmatized,alpha3country
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
691492,Free medicines,in,40,FREE,India,691492,manual,"[free, medicine]",free medicine,IND
409203,there should be lady doctor in our village,pk,27,HEALTHPROFESSIONALS,Pakistan,409203,manual,"[there, should, be, lady, doctor, in, our, vil...",there should be lady doctor in our village,PAK
537862,Qualified doctors,ke,20,HEALTHPROFESSIONALS,Kenya,537862,tw_sklearn,"[qualified, doctor]",qualified doctor,KEN
182169,Cleanliness in hospitals,in,32,WASH,India,182169,manual,"[cleanliness, in, hospital]",cleanliness in hospital,IND
194742,ambulance services 24x7 free of cost,in,32,TRANSPORTATION/ATTENTIVENESS/FREE,India,194742,manual,"[ambulance, service, 24x7, free, of, cost]",ambulance service 24x7 free of cost,IND


In [None]:
sorted(set(df_all_grouped.canonical_country))

In [None]:
df_all_grouped.groupby(["iso_country", "canonical_country"])["raw_response"].count().reset_index()

In [None]:
len(df_all), len(df_all_grouped)

In [None]:
len(df_all) - len(df_all_grouped)

In [None]:
from collections import Counter
s = Counter()
for codes in set(df_all_grouped.canonical_code):
    for code in codes.split("/"):
        s[re.sub(r'[^A-Z]', '', code.upper())] += 1

In [None]:
len(s)
sorted(s)

## Pull out Health Literacy Data

In [241]:
info = df_all_grouped[df_all_grouped['canonical_code'].str.contains('INFORMATION')]
info_hl = info[~info['canonical_code'].str.contains('RELIGION|SCHOOLS')]
len(info_hl)

21290

In [242]:
other = df_all_grouped[~df_all_grouped['canonical_code'].str.contains('INFORMATION|RELIGION|SCHOOLS')]
len(info_hl), len(other)

(21290, 981740)

In [243]:
other_hl = other[other['raw_response'].str.contains('educat|learn|inform|know|counsel')]
len(other_hl)

18540

In [244]:
sch = df_all_grouped[df_all_grouped['canonical_code'].str.contains('SCHOOLS')]
sch_hl = sch[sch['lemmatized'].str.contains('sex|reproduc|motherhood|matern|health')]
len(sch_hl)
sch_hl.sample(10)

Unnamed: 0_level_0,raw_response,iso_country,age,canonical_code,canonical_country,identifier,source,tokenized,lemmatized,alpha3country
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
755918,"It is open workshops for women, reproductive h...",mx,28,BETTERFACILITIES/SCHOOLS,Mexico,755918,manual,"[it, be, open, workshop, for, woman, ,, reprod...","it be open workshop for woman , reproductive h...",MEX
943223,There should be health facilities and treatmen...,pk,60,HEALTHPROFESSIONALS/WASH/BETTERFACILITIES/SCHOOLS,Pakistan,943223,manual,"[there, should, be, health, facility, and, tre...",there should be health facility and treatment ...,PAK
674759,Dissemination by trained personnel responsible...,mx,17,BETTERFACILITIES/ADOLESCENT/SCHOOLS,Mexico,674759,manual,"[dissemination, by, trained, personnel, respon...",dissemination by trained personnel responsible...,MEX
755548,Improve reproductive education in primary scho...,mx,33,SCHOOLS,Mexico,755548,manual,"[improve, reproductive, education, in, primary...",improve reproductive education in primary scho...,MEX
725230,Sexual abuse of girls in schools should be sto...,ug,19,POWER/ADOLESCENT/SCHOOLS,Uganda,725230,manual,"[sexual, abuse, of, girl, in, school, should, ...",sexual abuse of girl in school should be stop ...,UGA
417937,Maternity education should be provided,tz,23,SCHOOLS/INFORMATION,Tanzania,417937,greg,"[maternity, education, should, be, provide]",maternity education should be provide,TZA
872417,Educate children about health,pk,35,CHILD/SCHOOLS,Pakistan,872417,manual,"[educate, child, about, health]",educate child about health,PAK
759962,Talks in schools About sexuality,mx,15,SCHOOLS,Mexico,759962,manual,"[talk, in, school, about, sexuality]",talk in school about sexuality,MEX
1024891,School and maternal health center in our village,pk,42,SCHOOLS,Pakistan,1024891,manual,"[school, and, maternal, health, center, in, ou...",school and maternal health center in our village,PAK
1043698,"there should be provision of healthy food, bet...",pk,25,WASH/FOOD/SCHOOLS,Pakistan,1043698,greg,"[there, should, be, provision, of, healthy, fo...","there should be provision of healthy food , go...",PAK


In [245]:
rel = df_all_grouped[df_all_grouped['canonical_code'].str.contains('RELIGION')]
rel_hl = rel[rel['raw_response'].str.contains('educat|learn|inform|know|counsel')]
rel_hl_fixed = rel_hl[~rel_hl['lemmatized'].str.contains('islamic education|god education|spiritual counseling|madras|quran|religious|provision of education')]
len(rel_hl), len(rel_hl_fixed)

(132, 17)

In [246]:
hl_all = pd.concat([info_hl, other_hl, rel_hl_fixed, sch_hl])
len(hl_all)

40326

In [251]:
hl_final = hl_all[~hl_all['canonical_code'].str.contains('JOBS|FREE|NODEMAND')]
hl_final = hl_final[~hl_final['raw_response'].str.contains('yoga')]
len(hl_final)

38754

In [252]:
#hl_final = hl_final[~hl_final['raw_response'].str.contains('school')]
len(hl_final)

38754

In [249]:
hl_final.sample(10)

Unnamed: 0_level_0,raw_response,iso_country,age,canonical_code,canonical_country,identifier,source,tokenized,lemmatized,alpha3country
identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
417446,Maternity education,tz,42,INFORMATION,Tanzania,417446,greg,"[maternity, education]",maternity education,TZA
524672,Sexual and reproductive education to girls,ug,19,ADOLESCENT,Uganda,524672,manual,"[sexual, and, reproductive, education, to, girls]",sexual and reproductive education to girls,UGA
663451,I want to be informed of my rights.,ng,23,INFORMATION,Nigeria,663451,manual,"[i, want, to, be, inform, of, my, right, .]",i want to be inform of my right .,NGA
296544,maternity education to young generation,tz,75,ADOLESCENT,Tanzania,296544,manual,"[maternity, education, to, young, generation]",maternity education to young generation,TZA
296167,knowledge about good maternal health care and ...,tz,24,INFORMATION,Tanzania,296167,manual,"[knowledge, about, good, maternal, health, car...",knowledge about good maternal health care and ...,TZA
307378,education on family palnning,ke,28,FAMILYPLANNING,Kenya,307378,tw_sklearn,"[education, on, family, palnning]",education on family palnning,KEN
620665,Birth methods education,tz,24,FAMILYPLANNING,Tanzania,620665,manual,"[birth, method, education]",birth method education,TZA
396886,Outreaches to the villages,ug,33,INFORMATION,Uganda,396886,manual,"[outreach, to, the, village]",outreach to the village,UGA
457217,I want sex education at our rural areas,ng,23,ADOLESCENT,Nigeria,457217,manual,"[i, want, sex, education, at, our, rural, area]",i want sex education at our rural area,NGA
14205,There should be committee for health education,pk,33,POWER,Pakistan,14205,greg,"[there, should, be, committee, for, health, ed...",there should be committee for health education,PAK


In [None]:
import pandas as pd
pd.set_option('display.max_rows', 2859)
pd.set_option('display.max_colwidth', None)
df_all_grouped.groupby(["canonical_code"])["raw_response"].count().reset_index()

In [253]:
compression_opts = dict(method='zip',
                        archive_name="hl_final_data_zip.csv")  
hl_final.to_csv("C:/Users/catie/Documents/MFM/Updated Codes Jan22/data/hl_data_final.zip", index=False,
          compression=compression_opts) 

In [254]:
hl_final.to_pickle("C:/Users/catie/Documents/MFM/Updated Codes Jan22/data/hl_data_final.pkl", protocol=3)

In [None]:
#hl_final.sample(1000).to_pickle("data/hl_data_small.pkl", protocol=3)

In [None]:
#hl_final.sample(10000).to_pickle("data/hl_data_10k.pkl", protocol=3)

In [None]:
#hl_final.sample(100000).to_pickle("hl_data_100k.pkl", protocol=3)