# set display

In [15]:
import pandas as pd
# Raise pandas' display limits: up to 500 rows and 500 columns, 1000 characters wide.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# import data

In [16]:
import pandas as pd
# Load the primary dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("data/primary_data.csv")

# Preprocess Data

## processing data (diagnosis, race, body_part)
- renamed hisp to hispanic in other_race
- change other(3) in race according to other race
    - if yes(1) in hispanic and unknown in other_race change other_race according to hispanic
    - else UNKNOWN
- change other(71) in diagnosis according to other_diagnosis
- change other(71) in diagnosis_2 according to other_diagnosis_2
- change nan in body_part_2, diagnosis_2 to no
- remove unused cols: other_race, other_diagnosis, other_diagnosis_2

### preprocess diagnosis function

In [17]:
def fill_diagnosis(diagnosis, other_diagnosis):
    """Replace the "other" diagnosis code (71) with the value of ``other_diagnosis``.

    Args:
        diagnosis: Diagnosis value for one record.
        other_diagnosis: Value to substitute when ``diagnosis`` equals 71.

    Returns:
        ``other_diagnosis`` when ``diagnosis == 71``; otherwise ``diagnosis``
        unchanged.
    """
    return other_diagnosis if diagnosis == 71 else diagnosis

### preprocess race function

In [18]:
def fill_race(race, other_race, hispanic):
    """Resolve the "other" race code (3) using ``other_race`` and ``hispanic``.

    Args:
        race: Race value for one record.
        other_race: Companion value consulted when ``race`` equals 3.
        hispanic: Hispanic flag; 1 and 0 are the values tested here.

    Returns:
        ``race`` unchanged when it is not 3. Otherwise "HISPANIC" when
        ``other_race`` is "UNKNOWN" and ``hispanic`` is 1, "UNKNOWN" when
        ``other_race`` is "UNKNOWN" and ``hispanic`` is 0, and ``other_race``
        in every remaining case.
    """
    # Only the "other" code needs resolving; everything else passes through.
    if race != 3:
        return race

    if hispanic == 1 and other_race == "UNKNOWN":
        return "HISPANIC"
    if hispanic == 0 and other_race == "UNKNOWN":
        return "UNKNOWN"
    return other_race

## preprocess narrative
- lowercase the string
- remove punctuation (except ```.```)
- lemmatize each word
- remove non-English words
- replace multiple spaces with a single space
- strip leading and trailing whitespace

### preprocess narrative function

In [19]:
import nltk
from nltk.corpus import words
from nltk.stem import WordNetLemmatizer
from string import punctuation
import re
import spacy
import os 


# download english word corpus
nltk.download('words')
# NOTE: this rebinds `words` from the nltk corpus reader to a plain set of
# strings; the functions below rely on the set.
words = set(words.words())

# remove punc except `.`
punctuation = punctuation.replace(".", "")
punctuation_pattern = f"[{re.escape(punctuation)}]"

# get spacy lemmatize model; download it only when it is not installed yet
# instead of shelling out on every run
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # NOTE(review): "python" is whatever is first on PATH and may not be the
    # kernel's interpreter -- confirm for your environment.
    os.system("python -m spacy download en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def lemmatizer(narrative:str) -> str:
    """Return ``narrative`` with every token replaced by its lemma.

    The text is passed through the module-level ``nlp`` pipeline and the
    ``lemma_`` of each resulting token is joined with single spaces.
    """
    doc = nlp(narrative)
    lemmas = [token.lemma_ for token in doc]
    return " ".join(lemmas)

def remove_non_eng_word(narrative:str)-> str:
    """Blank out every space-separated token that is not in ``words``.

    Rejected tokens are replaced by an empty string rather than dropped, so
    the result keeps one separator per original token and may contain
    consecutive spaces.
    """
    kept = []
    for token in narrative.split(" "):
        kept.append(token if token in words else "")
    return " ".join(kept)

def process_narrative(narrative:str) -> str:
    """Clean one narrative string.

    Steps, in order: lowercase, remove punctuation matched by
    ``punctuation_pattern``, collapse runs of spaces, lemmatize, blank out
    tokens that are not in the English word set, collapse runs of spaces
    again, and strip leading/trailing whitespace.

    Args:
        narrative: Raw narrative text.

    Returns:
        The cleaned text with single spaces between the remaining words.
    """
    text = narrative.lower()
    text = re.sub(punctuation_pattern, '', text)
    # Collapse the runs left behind by punctuation removal before lemmatizing.
    text = re.sub(r' +', ' ', text)
    text = lemmatizer(text)
    text = remove_non_eng_word(text)
    # remove_non_eng_word replaces rejected tokens with "", which leaves
    # consecutive spaces; collapse them so the output is single-spaced.
    text = re.sub(r' +', ' ', text)
    return text.strip()

[nltk_data] Downloading package words to /Users/user/nltk_data...
[nltk_data]   Package words is already up-to-date!
2023-08-05 02:11:29.247412: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## preprocess data using all functions

In [20]:
# preprocess race and diagnosis
# normalise the abbreviation first so fill_race only ever sees "HISPANIC"
df["other_race"] =  df["other_race"].apply(lambda race: "HISPANIC" if race == "HISP" else race)
df["race"] = df[["race", "other_race", "hispanic"]].apply(lambda row: fill_race(*row), axis=1)
df["diagnosis"] = df[["diagnosis", "other_diagnosis"]].apply(lambda row: fill_diagnosis(*row), axis=1)
df["diagnosis_2"] = df[["diagnosis_2", "other_diagnosis_2"]].apply(lambda row: fill_diagnosis(*row), axis=1)

# preprocess body_part_2 and diagnosis_2
# each column is filled from itself; filling diagnosis_2 from body_part_2
# would discard the diagnosis_2 values computed above
df["body_part_2"] = df["body_part_2"].fillna("no")
df["diagnosis_2"] = df["diagnosis_2"].fillna("no")

# preprocess narrative
df["narrative"] = df["narrative"].apply(process_narrative)

# the helper columns have been merged into race / diagnosis / diagnosis_2.
# NOTE: re-running this cell without reloading df fails, because the dropped
# columns are read again at the top of the cell.
df = df.drop(columns=["other_race", "other_diagnosis", "other_diagnosis_2"])

# mapper from json 
ref: https://www.drivendata.org/competitions/217/cdc-fall-narratives/data/\
Mapping between encoded integers and their string values

what we do:
- strip the numeric code prefix from each mapped label, e.g. "0 - None" becomes "None"

In [None]:
import json
# Read the mapping between encoded integers and their string values.
# The context manager closes the file handle once the JSON has been parsed.
with open("data/variable_mapping.json", encoding="utf-8") as f:
    mapper = json.load(f)

In [None]:
import re

def value_mapper(idx, cols, mapping=None):
    """Translate an encoded value into its label, without the code prefix.

    Args:
        idx: Encoded value as a string, e.g. "57". An integral float string
            such as "57.0" (what ``astype(str)`` yields for a float column)
            is also looked up as "57" when no exact key exists.
        cols: Column name; selects the per-column dictionary of the mapping.
        mapping: Optional ``{column: {code: label}}`` dictionary. Defaults to
            the module-level ``mapper``.

    Returns:
        The label with a leading "<digits> - " prefix removed, or ``idx``
        unchanged when the column's dictionary has no entry for it.
    """
    labels = (mapper if mapping is None else mapping)[cols]

    key = idx
    if key not in labels and isinstance(key, str) and key.endswith(".0"):
        # Fall back to the integer spelling of an integral float string.
        if key[:-2] in labels:
            key = key[:-2]
    if key not in labels:
        return idx

    # Anchor at the start so only the leading code is stripped; an unanchored
    # pattern would also eat "x - " sequences inside the label itself.
    return re.sub(r"^\d+\s-\s", "", labels[key])

In [None]:
# Decode every column that has an entry in the mapping: cast the values to
# strings, then look each one up with value_mapper.
for col in [name for name in df.columns if name in mapper]:
    df[col] = df[col].astype(str).apply(value_mapper, cols=col)
    

# save cleaned df

In [None]:
# index=True is the pandas default: the row index is written as the first,
# unnamed column of the CSV.
df.to_csv("data/cleaned_df.csv", index=True)