<a href="https://colab.research.google.com/github/maya-papaya/ads1-cervical-cancer-analysis/blob/main/Data%20Cleaning%20%26%20Preprocessing%20(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning & Preprocessing (2)

In [None]:
# SETTING UP COLAB AND MODULES
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

import os
os.chdir("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/disease_datasets_1/")

Mounted at /content/drive


### Incidence/Mortality Rate Datasets

In [None]:
# LOADING MAIN DATASETS
data19 = pd.read_csv('cervical_cancer_2019.csv')
data16a = pd.read_csv('cervical_cancer_2016_incidence.csv')
data16b = pd.read_csv('cervical_cancer_2016_mortality.csv')
data10 = pd.read_csv('cervical_cancer_1980_2010.csv')

# NARROWING DOWN DATASETS TO CERVICAL CANCER ONLY
data19 = data19[data19['indicator_name'] == 'Cervical cancer'].drop(['indicator_name', 'indicator_id'], axis=1)
data16 = data16b.merge(data16a, how='outer', on=['Location', 'Sex', 'Age', 'Cancer', 'Metric', 'Year']).rename(columns={'Location':'location', 'Year':'year', 'Age':'age_group', 'Measure':'measure', 'Sex':'sex', 'Metric':'metric', 'Value_x':'mortality_rate', 'Lower 95% uncertainty interval_x':'mort_lower', 'Upper 95% uncertainty interval_x':'mort_upper', 'Value_y':'incidence_rate', 'Lower 95% uncertainty interval_y':'inc_lower', 'Upper 95% uncertainty interval_y':'inc_upper'}).drop(['Measure_x', 'Measure_y'], axis=1)
data16 = data16[data16['Cancer'] == 'Cervical cancer'].drop(['Cancer'], axis=1)
data10 = data10.drop(columns=data10.columns[[4, 6, 8, 10, 12, 14, 16]])

In [None]:
# CHECKING NULL VALUES
print(data19.isnull().sum())
print(data19.duplicated().sum())
print()
print(data16.isnull().sum())
print(data16.duplicated().sum())
print()
print(data10.isnull().sum())
print(data10.duplicated().sum())

location_id           0
location_name         0
year_id               0
age_group_id          0
age_group_name        0
haq_index_age_type    0
measure               0
val                   0
upper                 0
lower                 0
dtype: int64
0

location          0
sex               0
age_group         0
metric            0
year              0
mortality_rate    0
mort_lower        0
mort_upper        0
incidence_rate    0
inc_lower         0
inc_upper         0
dtype: int64
60

Country Name                                               0
GBD Region                                                 0
Year                                                       0
Population (women aged 15-79)                              0
No. of  cervical cancer deaths (women aged 15-49)          0
No. of  cervical cancer cases (women aged 15-49)           0
% deaths in ages 15-49 among women with cervical cancer    0
No. of  cervical cancer deaths (women aged 15 - 79)        0
No. of  cervical ca

In [None]:
# CHECKING DATATYPES
print(data19.dtypes)
print()
print(data16.dtypes)
print()
print(data10.dtypes)

location_id             int64
location_name          object
year_id                 int64
age_group_id            int64
age_group_name         object
haq_index_age_type     object
measure                object
val                   float64
upper                 float64
lower                 float64
dtype: object

location           object
sex                object
age_group          object
metric             object
year                int64
mortality_rate    float64
mort_lower        float64
mort_upper        float64
incidence_rate    float64
inc_lower         float64
inc_upper         float64
dtype: object

Country Name                                                object
GBD Region                                                  object
Year                                                         int64
Population (women aged 15-79)                              float64
No. of  cervical cancer deaths (women aged 15-49)            int64
No. of  cervical cancer cases (women aged 15-49) 

In [None]:
# STANDARDIZING COLUMN NAMES
data19 = data19.rename(columns={'location_name':'country', 'year_id':'year', 'age_group_name':'age_group'}).drop(['location_id'], axis=1)
data10 = data10.rename(columns={'Country Name':'country', 'Year':'year', 'GBD Region':'region', 'Population (women aged 15-79)':'population', 'No. of  cervical cancer deaths (women aged 15-49)':'cancer_death_count_to_49', 'No. of  cervical cancer cases (women aged 15-49)':'cancer_case_count_to_49', '% deaths in ages 15-49 among women with cervical cancer':'percent_deaths_to_49', 'No. of  cervical cancer deaths (women aged 15 - 79)':'cancer_death_count_to_79', 'No. of  cervical cancer cases (women aged 15 - 79)':'cancer_case_count_to_79', 'Risk of mortality cervical cancer (%)':'mortality_risk', 'Risk of incidence cervical cancer (%)':'incidence_risk'})

In [None]:
# LOADING MODIFIED DATASET INTO FOLDER
data19.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/data19.csv", index=False)
data16.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/data16.csv", index=False)
data10.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/data10.csv", index=False)

### Pap Smear Test Datasets

In [50]:
# LOADING PAP SMEAR TEST DATASETS
pap19 = pd.read_csv('pap_smear/pap_smear_2019.csv')[['StateAbbr', 'PlaceName', 'Population2010', 'PAPTEST_CrudePrev', 'PAPTEST_Crude95CI']]
pap18 = pd.read_csv('pap_smear/pap_smear_2018.csv')[['StateAbbr', 'PlaceName', 'Population2010', 'PAPTEST_CrudePrev', 'PAPTEST_Crude95CI']]
pap17 = pd.read_csv('pap_smear/pap_smear_2017.csv')[['StateAbbr', 'PlaceName', 'population_count', 'PAPTEST_CrudePrev', 'PAPTEST_Crude95CI']].rename(columns={'population_count': 'Population2010'})
pap16 = pd.read_csv('pap_smear/pap_smear_2016.csv')[['StateAbbr', 'PlaceName', 'Population2010', 'PAPTEST_CrudePrev', 'PAPTEST_Crude95CI']]

In [51]:
# CHECKING NULL / DUPLICATE VALUES FOR PAP SMEAR TESTS
print(pap19.isnull().sum())
print(pap19.duplicated().sum())
print()
print(pap18.isnull().sum())
print(pap18.duplicated().sum())
print()
print(pap17.isnull().sum())
print(pap17.duplicated().sum())
print()
print(pap16.isnull().sum())
print(pap16.duplicated().sum())

pap19 = pap19.dropna()
pap18 = pap18.dropna()
pap17 = pap17.dropna()
pap16 = pap16.dropna()

StateAbbr               0
PlaceName               0
Population2010          0
PAPTEST_CrudePrev    2202
PAPTEST_Crude95CI    2202
dtype: int64
28

StateAbbr               0
PlaceName               0
Population2010          0
PAPTEST_CrudePrev    2202
PAPTEST_Crude95CI    2202
dtype: int64
28

StateAbbr            0
PlaceName            0
Population2010       0
PAPTEST_CrudePrev    7
PAPTEST_Crude95CI    7
dtype: int64
0

StateAbbr            0
PlaceName            0
Population2010       0
PAPTEST_CrudePrev    7
PAPTEST_Crude95CI    7
dtype: int64
0


In [52]:
# MODIFYING 95% CI COLUMN
def create_95CI_columns(pap):
  lower = np.array([])
  upper = np.array([])
  for row in pap['PAPTEST_Crude95CI']:
    row_tuple = eval(row)
    lower = np.append(lower, row_tuple[0])
    upper = np.append(upper, row_tuple[1])
  pap['lower'] = lower
  pap['upper'] = upper
  return pap.drop(['PAPTEST_Crude95CI'], axis=1)

pap19 = create_95CI_columns(pap19)
pap18 = create_95CI_columns(pap18)
pap17 = create_95CI_columns(pap17)
pap16 = create_95CI_columns(pap16)

In [53]:
# AGGREGATING BY CITY
pap19 = pap19.groupby(['StateAbbr', 'PlaceName']).agg('sum').reset_index()
pap18 = pap18.groupby(['StateAbbr', 'PlaceName']).agg('sum').reset_index()
pap17 = pap17.groupby(['StateAbbr', 'PlaceName']).agg('sum').reset_index()
pap16 = pap16.groupby(['StateAbbr', 'PlaceName']).agg('sum').reset_index()

In [55]:
# ADDING YEAR COLUMN
pap19['year'] = np.array([2019]*len(pap19['StateAbbr']))
pap18['year'] = np.array([2018]*len(pap18['StateAbbr']))
pap17['year'] = np.array([2017]*len(pap17['StateAbbr']))
pap16['year'] = np.array([2016]*len(pap16['StateAbbr']))

In [64]:
# MERGING INTO ONE DATASET
pap = pap19.merge(pap16, how='outer', on=list(pap19.columns)).merge(pap17, how='outer', on=list(pap19.columns)).merge(pap18, how='outer', on=list(pap19.columns))
pap.head(4)

Unnamed: 0,StateAbbr,PlaceName,Population2010,PAPTEST_CrudePrev,lower,upper,year
0,AK,Anchorage,291826,4293.9,4161.1,4412.9,2016
1,AK,Anchorage,291826,4293.9,4161.1,4412.9,2017
2,AK,Anchorage,291826,4339.7,4208.5,4461.3,2018
3,AK,Anchorage,291826,4339.7,4208.5,4461.3,2019


In [65]:
# CHECKING DATATYPES
pap.dtypes

Unnamed: 0,0
StateAbbr,object
PlaceName,object
Population2010,int64
PAPTEST_CrudePrev,float64
lower,float64
upper,float64
year,int64


In [67]:
# STANDARDIZING COLUMN NAMES
pap = pap.rename(columns={'StateAbbr':'state', 'PlaceName':'city', 'Population2010':'pop_2010', 'PAPTEST_CrudePrev':'val'})

In [68]:
# LOADING MODIFIED DATASET INTO FOLDER
pap.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/pap.csv", index=False)

### HPV Vaccination Datasets

In [None]:
# LOADING HPV VACCINATION DATASETS
hpv10 = pd.read_csv('hpv_vaccines/hpv_2010.csv').drop('Unnamed: 0', axis=1)
hpv11 = pd.read_csv('hpv_vaccines/hpv_2011.csv').drop('Unnamed: 0', axis=1)
hpv12 = pd.read_csv('hpv_vaccines/hpv_2012.csv').drop('Unnamed: 0', axis=1)
hpv13 = pd.read_csv('hpv_vaccines/hpv_2013.csv').drop('Unnamed: 0', axis=1)
hpv14 = pd.read_csv('hpv_vaccines/hpv_2014.csv').drop('Unnamed: 0', axis=1)
hpv15 = pd.read_csv('hpv_vaccines/hpv_2015.csv').drop('Unnamed: 0', axis=1)
hpv16 = pd.read_csv('hpv_vaccines/hpv_2016.csv').drop('Unnamed: 0', axis=1)
hpv17 = pd.read_csv('hpv_vaccines/hpv_2017.csv').drop('Unnamed: 0', axis=1)
hpv18 = pd.read_csv('hpv_vaccines/hpv_2018.csv').drop('Unnamed: 0', axis=1)
hpv19 = pd.read_csv('hpv_vaccines/hpv_2019.csv').drop('Unnamed: 0', axis=1)
hpv20 = pd.read_csv('hpv_vaccines/hpv_2020.csv').drop('Unnamed: 0', axis=1)

In [None]:
# ADDING YEAR COLUMN
hpv10['year'] = np.array([2010]*len(hpv10['country']))
hpv11['year'] = np.array([2011]*len(hpv11['country']))
hpv12['year'] = np.array([2012]*len(hpv12['country']))
hpv13['year'] = np.array([2013]*len(hpv13['country']))
hpv14['year'] = np.array([2014]*len(hpv14['country']))
hpv15['year'] = np.array([2015]*len(hpv15['country']))
hpv16['year'] = np.array([2016]*len(hpv16['country']))
hpv17['year'] = np.array([2017]*len(hpv17['country']))
hpv18['year'] = np.array([2018]*len(hpv18['country']))
hpv19['year'] = np.array([2019]*len(hpv19['country']))
hpv20['year'] = np.array([2020]*len(hpv20['country']))

In [None]:
# MERGING DATASETS
hpv = hpv10.merge(hpv11, how='outer', on=list(hpv10.columns))
for df in [hpv12, hpv13, hpv14, hpv15, hpv16, hpv17, hpv18, hpv19, hpv20]:
  hpv = hpv.merge(df, how='outer', on=list(hpv.columns))
hpv.head()

Unnamed: 0,country,cohort_size,current_cov,curr_vacc_cohort_size,future_cov,future_vacc_cohort_size,curr_cc_prev,curr_mort_prev,curr_cost,curr_cost_prev,proj_cc_prev,proj_mort_prev,proj_cost,proj_cost_prev,year,current_net_cost,country_name,region,income_group
0,AFG,458482.38,0.0,355069.8,0.9,355069.8,0.0,0.0,0.0,0.0,2736.52,2116.33,4970977.2,2049164.11,2010,0.0,Afghanistan,South Asia,Low income
1,AFG,473983.89,0.0,368163.9,0.9,368163.9,0.0,0.0,0.0,0.0,2849.51,2204.44,5154294.6,2133772.79,2011,0.0,Afghanistan,South Asia,Low income
2,AFG,483310.36,0.0,380178.9,0.9,380178.9,0.0,0.0,0.0,0.0,2955.8,2287.47,5322504.6,2213365.89,2012,0.0,Afghanistan,South Asia,Low income
3,AFG,497653.08,0.0,392511.6,0.9,392511.6,0.0,0.0,0.0,0.0,3062.26,2370.45,5495162.4,2293087.57,2013,0.0,Afghanistan,South Asia,Low income
4,AFG,510411.1,0.0,403654.5,0.9,403654.5,0.0,0.0,0.0,0.0,3160.75,2447.33,5651163.0,2366835.62,2014,0.0,Afghanistan,South Asia,Low income


In [None]:
# CHECKING NULL / DUPLICATE VALUES
print(hpv.isnull().sum())
print()
print(hpv.duplicated().sum())

country                    0
cohort_size                0
current_cov                0
curr_vacc_cohort_size      0
future_cov                 0
future_vacc_cohort_size    0
curr_cc_prev               0
curr_mort_prev             0
curr_cost                  0
curr_cost_prev             0
proj_cc_prev               0
proj_mort_prev             0
proj_cost                  0
proj_cost_prev             0
year                       0
current_net_cost           0
country_name               0
region                     0
income_group               0
dtype: int64

0


In [None]:
# CHECKING DATATYPES
hpv.dtypes

Unnamed: 0,0
country,object
cohort_size,float64
current_cov,float64
curr_vacc_cohort_size,float64
future_cov,float64
future_vacc_cohort_size,float64
curr_cc_prev,float64
curr_mort_prev,float64
curr_cost,float64
curr_cost_prev,float64


In [None]:
# LOADING MODIFIED DATASET INTO FOLDER
hpv.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/hpv.csv", index=False)

### Extra Datasets

In [None]:
# LOADING EXTRA DATASETS
screening_program = pd.read_csv('countries_with_screening_programs.csv')
medicaid_chip = pd.read_csv('medicaid_chip_vaccine_coverage.csv').drop(['DataQuality'], axis=1)
adolescent = pd.read_csv('adolescent_vaccine_coverage.csv')

In [None]:
# NARROWING DATASETS TO CERVICAL CANCER
medicaid_chip = medicaid_chip[medicaid_chip['VaccineType'] == 'HPV'].drop(['VaccineType'], axis=1)
adolescent = adolescent[adolescent['Vaccine/Sample'] == 'HPV'].drop(['Vaccine/Sample'], axis=1)

In [None]:
# ADJUSTING COLUMN VALUES
adolescent[['lower', 'upper']] = adolescent['95% CI (%)'].str.split(' ', expand=True).drop([1], axis=1).rename(columns={0:'lower', 2:'upper'})
adolescent['lower'] = adolescent['lower'].astype(float)
adolescent['upper'] = adolescent['upper'].astype(float)
adolescent = adolescent.drop(['95% CI (%)'], axis=1)

from datetime import datetime
medicaid_chip = medicaid_chip.reset_index().drop(['index'], axis=1)
for i in range(len(medicaid_chip['Month'])):
  unconverted = str(medicaid_chip['Month'][i])
  date = datetime.strptime(unconverted, '%Y%m')
  medicaid_chip['Month'][i] = date.month

medicaid_chip['ServiceCount'] = pd.Series([x.replace(',', '') for x in medicaid_chip['ServiceCount']]).replace({' -   ': None}).replace({' DS ': None}).astype('float')
medicaid_chip['RatePer1000Beneficiaries'] = medicaid_chip['RatePer1000Beneficiaries'].replace({'DS': None}).astype('float')

In [None]:
# CHECKING DATATYPES
print(screening_program.dtypes)
print()
print(medicaid_chip.dtypes)
print()
print(adolescent.dtypes)

Entity                                                         object
Code                                                           object
Year                                                            int64
Existence of national screening program for cervical cancer    object
dtype: object

State                        object
Year                          int64
Month                         int64
ServiceCount                float64
RatePer1000Beneficiaries    float64
dtype: object

Dose               object
Geography Type     object
Geography          object
Survey Year        object
Dimension Type     object
Dimension          object
Estimate (%)      float64
Sample Size       float64
lower             float64
upper             float64
dtype: object


In [None]:
# STANDARDIZING COLUMN NAMES
screening_program = screening_program.rename(columns={'Entity':'country', 'Code':'code', 'Year':'year', 'Existence of national screening program for cervical cancer':'screening_program'})
medicaid_chip = medicaid_chip.rename(columns={'State':'state', 'Month':'month', 'Year':'year', 'ServiceCount':'service_count', 'RatePer1000Beneficiaries':'rate_per_1000'})
adolescent = adolescent.rename(columns={'Dose':'dose', 'Geography Type':'location_type', 'Geography':'location', 'Survey Year':'year', 'Dimension Type':'dimension_type', 'Dimension':'dimension_val', 'Estimate (%)':'val', 'Sample Size':'sample_size'})

In [None]:
# LOADING MODIFIED DATASET INTO FOLDER
screening_program.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/screening_program.csv", index=False)
medicaid_chip.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/medicaid_chip.csv", index=False)
adolescent.to_csv("/content/drive/My Drive/ADS_Maya_Reddy/projects/disease_project_1/prepped_datasets/adolescent.csv", index=False)