In [None]:
import pandas as pd
# Three-row demo frame; the number in the second row is deliberately missing.
data = pd.DataFrame({
    'letter': ['a', 'B', 'C c'],
    'number': [1, None, 3],
})

In [None]:
# Show the frame; the 'number' value in the second row is missing.
data

---
### Fill NaN with a fixed value

In [None]:
# Replace each missing number with the constant 0.
# The result is only displayed; it is not assigned back to `data`.
data['number'].fillna(0)


---
### Fill NaN with a calculated value

In [None]:
# Replace each missing number with the column mean.
# mean() skips missing values by default, so the fill value comes from the numbers that are present.
data['number'].fillna(data['number'].mean())

---
### Fill NaN with values from another column

In [None]:
# Where 'number' is missing, take the value from the same row of 'letter'.
# The fill is aligned row by row on the index, so the result mixes numbers and strings.
data['number'].fillna(data['letter'])

In [None]:
import pandas as pd
# Region is recorded only for AK ('1') and CA ('2'); every other row holds pd.NA.
states = pd.DataFrame({
    'region': ['1', pd.NA, pd.NA, pd.NA, '2', pd.NA],
    'state': ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO'],
    'population': [703423, 5634923, 3029341, 2317412, 13493821, 5434124],
})

In [None]:
# Show the frame before filling: only AK and CA have a region.
states

In [None]:
# Fill each missing region with the state code from the same row and assign the result back to the column.
states['region'] = states['region'].fillna(states['state'])
# Legacy form, kept for reference: states['region'].fillna(states['state'], inplace=True)
# It will not work in pandas 3.0, so assign the filled column back as above.

In [None]:
# Show the result: rows that had no region now carry their state code in 'region'.
states

In [None]:
import pandas as pd
# Rebuild the original frame so the next fill starts from the unfilled data:
# only AK ('1') and CA ('2') have a region, the rest are pd.NA.
states = pd.DataFrame({
    'region': ['1', pd.NA, pd.NA, pd.NA, '2', pd.NA],
    'state': ['AK', 'AL', 'AR', 'AZ', 'CA', 'CO'],
    'population': [703423, 5634923, 3029341, 2317412, 13493821, 5434124],
})

In [None]:
# Show the frame before forward filling: only AK and CA have a region.
states

In [None]:
# Forward fill: each missing region takes the last non-missing region above it,
# so AL, AR and AZ become '1' and CO becomes '2'.
states['region'] = states['region'].ffill() # ffill stands for 'Forward Fill'

In [None]:
# Show the result: each previously missing region now repeats the region above it.
states

---
---

### String Functions

In [None]:
# Demo frame for the string functions; 'Cat c' gives a multi-word, mixed-case value.
data = pd.DataFrame({
    'letter': ['a', 'B', 'Cat c'],
    'number': [1, None, 3],
})

In [None]:
# Show the frame used by the string examples below.
data

In [None]:
# Lower-case every string: 'a', 'b', 'cat c'. The result is displayed only; `data` is unchanged.
data['letter'].str.lower()

In [None]:
# Upper-case every string: 'A', 'B', 'CAT C'. The result is displayed only; `data` is unchanged.
data['letter'].str.upper()

In [None]:
# Title-case every string (first letter of each word capitalised): 'A', 'B', 'Cat C'.
data['letter'].str.title()

In [None]:
# Strip everything up to and including the last space, keeping only the final word
# ('Cat c' -> 'c'; values without a space are unchanged).
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a literal
# string by default, in which case '^.* ' would never match and nothing would be replaced.
data['letter'].str.replace(r'^.* ', '', regex=True)

In [None]:
# Index into each string through the .str accessor: -1 selects the last character of every value.
data['letter'].str[-1]

# Situation / Problem

In this data set, we want to group by the Proprietary Name.  How many drugs are represented here?

* Does every row have a proprietary name? - If not, use the non-proprietary name instead
* Are they formatted consistently? - Make them all upper case or lower case
* Do any of the names have unusual characters that we may want to change?

In [None]:
import pandas as pd

In [None]:
# Load the tab-separated NDC file from its hosted location.
ndc = pd.read_csv(
    'https://hds5210-data.s3.amazonaws.com/ndc.txt',
    sep='\t',
)

In [None]:
# Preview the loaded table.
ndc

In [None]:
# STEP 1: Are any of them blank?

# Build a boolean mask of rows with no proprietary name, select those rows,
# and report how many there are as (rows, columns).
blank_filter = ndc['PROPRIETARYNAME'].isna()
blanks = ndc.loc[blank_filter]
blanks.shape

In [None]:
# List the rows whose PROPRIETARYNAME is missing.
blanks

In [None]:
# Fall back to the non-proprietary name wherever the proprietary name is missing.
# The filled column is assigned back instead of using inplace=True: the older chained form
# ndc['PROPRIETARYNAME'].fillna(ndc['NONPROPRIETARYNAME'], inplace=True) does not update `ndc`
# in pandas 3.0, and plain assignment keeps this cell in the same style as the earlier fill examples.
ndc['PROPRIETARYNAME'] = ndc['PROPRIETARYNAME'].fillna(ndc['NONPROPRIETARYNAME'])


In [None]:
# Re-check after the fill: count the rows that still have no proprietary name.
# A row count of 0 means every blank received a fallback value.
null_filter = ndc['PROPRIETARYNAME'].isna()
blanks = ndc.loc[null_filter]
blanks.shape


In [None]:
# Select every row whose proprietary name is exactly 'Florbetapir F 18' (case-sensitive match).
florbetapir_filter = ndc['PROPRIETARYNAME'].eq('Florbetapir F 18')
ndc.loc[florbetapir_filter]

In [None]:
# STEP 2: Check the formats - how many are all caps, how many are all lowercase

# Match names made up solely of capital letters, digits, spaces and hyphens.
# na=False treats any name that is still missing as a non-match, so the mask is strictly
# boolean; without it a single NaN in the mask makes the row selection below raise.
upper_filter = ndc['PROPRIETARYNAME'].str.contains(r'^[A-Z0-9 \-]+$', na=False)
all_caps = ndc[upper_filter]
all_caps.shape

In [None]:
# Match names made up solely of lower-case letters, digits, spaces and hyphens.
# na=False treats any name that is still missing as a non-match, so the mask is strictly
# boolean; without it a single NaN in the mask makes the row selection below raise.
lower_filter = ndc['PROPRIETARYNAME'].str.contains(r'^[a-z0-9 \-]+$', na=False)
all_lc = ndc[lower_filter]
all_lc.shape

In [None]:
# Let's just make them all lower case

# Overwrite the column with its lower-cased version so that names differing only in case compare equal.
ndc['PROPRIETARYNAME'] = ndc['PROPRIETARYNAME'].str.lower()


In [None]:
# Verify the conversion: a name counts as lower case when lower-casing it again changes nothing.
lowercase_filter = ndc['PROPRIETARYNAME'].eq(ndc['PROPRIETARYNAME'].str.lower())
all_lc = ndc.loc[lowercase_filter]
all_lc.shape

In [None]:
# Total (rows, columns) of the full table, for comparison with all_lc.shape from the previous cell.
ndc.shape

In [None]:
# STEP 3: Look for any that might have strange characters or numbers
# Flag names containing a digit, a hyphen, a backslash or a forward slash anywhere in the string.
# na=False treats any name that is still missing as a non-match, so the mask is strictly
# boolean; without it a single NaN in the mask makes the row selection below raise.
weird_filter = ndc['PROPRIETARYNAME'].str.contains(r'[0-9\-\\\/]', na=False)
weird = ndc[weird_filter]
weird.shape

In [None]:
# Inspect the rows flagged by weird_filter.
weird