<h1>Prevalence Data</h1>

Data is from the CDC AtlasPlus website.

In [3]:
#Dependencies
import pandas as pd
import glob
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sqlalchemy import create_engine
import pymysql
pymysql.install_as_MySQLdb()

#Config variables
from config import remote_db_endpoint, remote_db_port
from config import remote_hiv_dbname, remote_hiv_dbuser, remote_hiv_dbpwd
# from config import local_hiv_dbuser, local_hiv_dbpwd, local_db_endpoint, local_db_port, local_hiv_dbname

Create the SQLAlchemy engine used to connect to the cloud MySQL database hosted on AWS.

In [4]:
# Connection URL for the cloud MySQL database on AWS, assembled from the
# user, password, endpoint, port and database name imported from config
remote_db_url = f'mysql://{remote_hiv_dbuser}:{remote_hiv_dbpwd}@{remote_db_endpoint}:{remote_db_port}/{remote_hiv_dbname}'
engine = create_engine(remote_db_url)

In [5]:
# Open a connection on the remote engine; `conn` is the handle passed to the
# to_sql / read_sql calls in the database section of this notebook.
# NOTE(review): no conn.close() appears in the cells shown - confirm the connection is released somewhere.
conn = engine.connect()

<h3>Overall Prevalence</h3>

In [6]:
# Collect every overall-prevalence CSV export and put the paths in alphabetical order
prev_all_files = glob.glob("Resources/Prev_All*.csv")
prev_all_files.sort()
prev_all_files

['Resources/Prev_All_County.csv',
 'Resources/Prev_All_National.csv',
 'Resources/Prev_All_State.csv']

In [7]:
# Sanity check: load one raw export and look at how it is laid out.
# The state-level file is previewed here; to inspect another level, point read_csv at
#   'Resources/Prev_All_National.csv'  or  'Resources/Prev_All_County.csv'
prev_test = pd.read_csv('Resources/Prev_All_State.csv')

# First five rows (use prev_test.columns instead to list just the headers)
prev_test.head()

Unnamed: 0,Indicator,Year,Geography,FIPS,Cases,Rate per 100000,Population
0,HIV prevalence,2017,Alabama,1,13124,320.5,4094777
1,HIV prevalence,2016,Alabama,1,12584,308.2,4082972
2,HIV prevalence,2015,Alabama,1,12166,298.9,4070748
3,HIV prevalence,2014,Alabama,1,12339,304.0,4059211
4,HIV prevalence,2013,Alabama,1,11915,294.8,4041871


In [8]:
def parsed(x):
    """Return ``x`` converted to ``int`` when possible, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a float such as ``2008.0`` or a digit string.

    Returns
    -------
    int or object
        ``int(x)`` when the conversion succeeds; the original ``x`` when it
        does not (non-numeric text, NaN, None, infinity, or an object that
        has no integer conversion).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed; anything else (for example a
        # KeyboardInterrupt) now propagates instead of being silently swallowed.
        return x
# Geography levels, in the order their rows are stacked in prev_all_data
geo_levels = ['National', 'State', 'County']

# Source header -> name used in the rest of the notebook
# (Indicator, Year, FIPS and Population keep their original names)
prev_all_columns = {'Geography':'Full Name',
                    'Cases':'All Prev Cases',
                    'Rate per 100000':'All Prev Rates'}

prev_all_by_level = {}
for f in prev_all_files:
    # Take the level from the file name rather than from the file's position
    # in the list, so the result does not depend on how many files matched
    # or on the order in which they were listed.
    level = next((lvl for lvl in geo_levels if f'_{lvl}' in f), None)
    if level is None:
        raise ValueError(f'Cannot tell the geography level of {f!r}; expected one of {geo_levels}')
    if level in prev_all_by_level:
        raise ValueError(f'More than one {level} file matched, e.g. {f!r}')

    prev_all_df = pd.read_csv(f).rename(columns=prev_all_columns)
    # 'Data suppressed' cells become NaN; commas are stripped from text values
    prev_all_df = prev_all_df.replace({'Data suppressed': np.nan, ',': ''}, regex=True)
    # NOTE(review): DataFrame.apply hands parsed() a whole column (Series), not
    # single values - confirm this step does what was intended.
    prev_all_df = prev_all_df.apply(parsed)
    # Element-wise: make Year an integer wherever it can be converted
    prev_all_df['Year'] = prev_all_df['Year'].apply(parsed)
    prev_all_by_level[level] = prev_all_df

missing_levels = [lvl for lvl in geo_levels if lvl not in prev_all_by_level]
if missing_levels:
    raise ValueError(f'No overall-prevalence file found for: {missing_levels}')

# Stack National, then State, then County rows under a fresh 0..n-1 index
prev_all_data = pd.concat([prev_all_by_level[lvl] for lvl in geo_levels]).reset_index(drop=True)
prev_all_data.head()

Unnamed: 0,Indicator,Year,Full Name,FIPS,All Prev Cases,All Prev Rates,Population
0,HIV prevalence,2017,United States,,1001718,367.7,272460904
1,HIV prevalence,2016,United States,,977493,361.6,270301427
2,HIV prevalence,2015,United States,,952705,355.5,268005522
3,HIV prevalence,2014,United States,,927581,349.1,265732678
4,HIV prevalence,2013,United States,,902775,342.8,263356005


In [9]:
# Save the combined overall-prevalence table; index=False keeps the row index out of the file
prev_all_data.to_csv('Output/prev_all.csv',index=False)

<h3>Prevalence Data by Age</h3>

In [10]:
# Collect every prevalence-by-age CSV export and put the paths in alphabetical order
prev_age_files = glob.glob("Resources/Prev_Age*.csv")
prev_age_files.sort()
prev_age_files

['Resources/Prev_Age_County_CandR.csv',
 'Resources/Prev_Age_National_CandR.csv',
 'Resources/Prev_Age_State_CandR.csv']

In [11]:
# Sanity check: load one raw export and look at how it is laid out.
# The state-level file is previewed here; to inspect another level, point read_csv at
#   'Resources/Prev_Age_National_CandR.csv'  or  'Resources/Prev_Age_County_CandR.csv'
prev_age_test = pd.read_csv('Resources/Prev_Age_State_CandR.csv')

# First five rows (use prev_age_test.columns instead to list just the headers)
prev_age_test.head()

Unnamed: 0,Year,Geography,13-24,Unnamed: 3,25-34,Unnamed: 5,35-44,Unnamed: 7,45-54,Unnamed: 9,55+,Unnamed: 11
0,,,Cases,Rate per 100000,Cases,Rate per 100000,Cases,Rate per 100000,Cases,Rate per 100000,Cases,Rate per 100000
1,2008.0,Alabama,535,68.1,1809,295.9,3324,532.9,3059,455,1267,107.5
2,2008.0,Alaska,21,15.7,65,63.2,185,199.8,184,173.8,92,74
3,2008.0,Arizona,361,34.2,1649,174.4,3574,410.7,3719,440.1,1588,104.6
4,2008.0,Arkansas,186,39.7,703,183.2,1366,366.8,1298,327.4,539,73.2


In [12]:
def parsed(x):
    """Return ``x`` converted to ``int`` when possible, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a float such as ``2008.0`` or a digit string.

    Returns
    -------
    int or object
        ``int(x)`` when the conversion succeeds; the original ``x`` when it
        does not (non-numeric text, NaN, None, infinity, or an object that
        has no integer conversion).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed; anything else (for example a
        # KeyboardInterrupt) now propagates instead of being silently swallowed.
        return x
# Geography levels, in the order their rows are stacked in prev_age_data
geo_levels = ['National', 'State', 'County']

# Source header -> name used in the rest of the notebook (Year keeps its name).
# Each age-group column holds the case counts; the 'Unnamed: N' column that
# follows it holds the matching rate per 100000.
prev_age_columns = {'Geography':'Full Name',
                    '13-24':'13-24 Prev Cases','Unnamed: 3':'13-24 Prev Rates',
                    '25-34':'25-34 Prev Cases','Unnamed: 5':'25-34 Prev Rates',
                    '35-44':'35-44 Prev Cases','Unnamed: 7':'35-44 Prev Rates',
                    '45-54':'45-54 Prev Cases','Unnamed: 9':'45-54 Prev Rates',
                    '55+':'55+ Prev Cases','Unnamed: 11':'55+ Prev Rates'}

prev_age_by_level = {}
for f in prev_age_files:
    # Take the level from the file name rather than from the file's position
    # in the list, so the result does not depend on how many files matched
    # or on the order in which they were listed.
    level = next((lvl for lvl in geo_levels if f'_{lvl}' in f), None)
    if level is None:
        raise ValueError(f'Cannot tell the geography level of {f!r}; expected one of {geo_levels}')
    if level in prev_age_by_level:
        raise ValueError(f'More than one {level} file matched, e.g. {f!r}')

    prev_age_df = pd.read_csv(f).rename(columns=prev_age_columns)
    # Row 0 only repeats the 'Cases' / 'Rate per 100000' sub-headers, so skip it;
    # 'Data suppressed' cells become NaN and commas are stripped from text values
    prev_age_df = prev_age_df.iloc[1:].replace({'Data suppressed': np.nan, ',': ''}, regex=True)
    # NOTE(review): DataFrame.apply hands parsed() a whole column (Series), not
    # single values - confirm this step does what was intended.
    prev_age_df = prev_age_df.apply(parsed)
    # Element-wise: make Year an integer wherever it can be converted
    prev_age_df['Year'] = prev_age_df['Year'].apply(parsed)
    prev_age_by_level[level] = prev_age_df

missing_levels = [lvl for lvl in geo_levels if lvl not in prev_age_by_level]
if missing_levels:
    raise ValueError(f'No prevalence-by-age file found for: {missing_levels}')

# Stack National, then State, then County rows under a fresh 0..n-1 index
prev_age_data = pd.concat([prev_age_by_level[lvl] for lvl in geo_levels]).reset_index(drop=True)
prev_age_data.head()

Unnamed: 0,Year,Full Name,13-24 Prev Cases,13-24 Prev Rates,25-34 Prev Cases,25-34 Prev Rates,35-44 Prev Cases,35-44 Prev Rates,45-54 Prev Cases,45-54 Prev Rates,55+ Prev Cases,55+ Prev Rates
0,2008,United States,32936,64.4,108363,264.6,240102,568.4,263589,595.9,126048,174.0
1,2009,United States,35225,68.8,111323,267.8,229044,551.5,279575,627.0,143169,192.5
2,2010,United States,37160,71.5,115693,280.7,219522,535.7,293221,651.8,161910,209.6
3,2011,United States,38210,73.4,119974,287.0,210441,517.8,302421,676.1,182285,229.5
4,2012,United States,38830,74.3,124187,293.6,203600,502.4,307862,695.5,204198,249.9


In [13]:
# Save the combined prevalence-by-age table; index=False keeps the row index out of the file
prev_age_data.to_csv('Output/prev_age.csv',index=False)

<h3>Prevalence Data by Race</h3>

In [14]:
# Collect every prevalence-by-race CSV export.
# NOTE: the list is not sorted here, so the paths come back in whatever order
# glob yields them; code that relies on list positions should be aware of this.
prev_race_files = list(glob.iglob("Resources/Prev_Race*.csv"))
prev_race_files

['Resources/Prev_Race_National_CandR.csv',
 'Resources/Prev_Race_County_CandR.csv',
 'Resources/Prev_Race_State_CandR.csv']

In [15]:
# Sanity check: load one raw export and look at how it is laid out.
# The county-level file is previewed here; to inspect another level, point read_csv at
#   'Resources/Prev_Race_National_CandR.csv'  or  'Resources/Prev_Race_State_CandR.csv'
prev_race_test = pd.read_csv('Resources/Prev_Race_County_CandR.csv')

# Only the last expression of a cell is rendered, so this cell lists the headers;
# end the cell with prev_race_test.head() instead to see sample rows.
prev_race_test.columns

Index(['Year', 'Geography', 'American Indian/Alaska Native', 'Unnamed: 3',
       'Asian', 'Unnamed: 5', 'Black/African American', 'Unnamed: 7',
       'Hispanic/Latino', 'Unnamed: 9', 'Multiple races', 'Unnamed: 11',
       'Native Hawaiian/Other Pacific Islander', 'Unnamed: 13', 'White',
       'Unnamed: 15'],
      dtype='object')

In [16]:
def parsed(x):
    """Return ``x`` converted to ``int`` when possible, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a float such as ``2008.0`` or a digit string.

    Returns
    -------
    int or object
        ``int(x)`` when the conversion succeeds; the original ``x`` when it
        does not (non-numeric text, NaN, None, infinity, or an object that
        has no integer conversion).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed; anything else (for example a
        # KeyboardInterrupt) now propagates instead of being silently swallowed.
        return x
# Geography levels, in the order their rows are stacked in prev_race_data
geo_levels = ['National', 'State', 'County']

# Source header -> shortened name (Year keeps its name). Abbreviations:
#   American Indian/Alaska Native = AI_AN, Black/African American = Black,
#   Hispanic/Latino = Latino, Multiple races = Multi,
#   Native Hawaiian/Other Pacific Islander = NH_OPI
# Each named column is renamed to '... Prev Cases' and the 'Unnamed: N' column
# that follows it to the matching '... Prev Rates'.
prev_race_columns = {'Geography':'Full Name',
                     'American Indian/Alaska Native':'AI_AN Prev Cases','Unnamed: 3':'AI_AN Prev Rates',
                     'Asian':'Asian Prev Cases','Unnamed: 5':'Asian Prev Rates',
                     'Black/African American':'Black Prev Cases','Unnamed: 7':'Black Prev Rates',
                     'Hispanic/Latino':'Latino Prev Cases','Unnamed: 9':'Latino Prev Rates',
                     'Multiple races':'Multi Prev Cases','Unnamed: 11':'Multi Prev Rates',
                     'Native Hawaiian/Other Pacific Islander':'NH_OPI Prev Cases','Unnamed: 13':'NH_OPI Prev Rates',
                     'White':'White Prev Cases','Unnamed: 15':'White Prev Rates'}

prev_race_by_level = {}
for f in prev_race_files:
    # Take the level from the file name rather than from the file's position
    # in the list: prev_race_files may arrive in any order, and a positional
    # mapping would then stack the levels differently from run to run.
    level = next((lvl for lvl in geo_levels if f'_{lvl}' in f), None)
    if level is None:
        raise ValueError(f'Cannot tell the geography level of {f!r}; expected one of {geo_levels}')
    if level in prev_race_by_level:
        raise ValueError(f'More than one {level} file matched, e.g. {f!r}')

    prev_race_df = pd.read_csv(f).rename(columns=prev_race_columns)
    # The first data row is dropped (iloc[1:]);
    # 'Data suppressed' cells become NaN and commas are stripped from text values
    prev_race_df = prev_race_df.iloc[1:].replace({'Data suppressed': np.nan, ',': ''}, regex=True)
    # NOTE(review): DataFrame.apply hands parsed() a whole column (Series), not
    # single values - confirm this step does what was intended.
    prev_race_df = prev_race_df.apply(parsed)
    # Element-wise: make Year an integer wherever it can be converted
    prev_race_df['Year'] = prev_race_df['Year'].apply(parsed)
    prev_race_by_level[level] = prev_race_df

missing_levels = [lvl for lvl in geo_levels if lvl not in prev_race_by_level]
if missing_levels:
    raise ValueError(f'No prevalence-by-race file found for: {missing_levels}')

# Stack National, then State, then County rows under a fresh 0..n-1 index
prev_race_data = pd.concat([prev_race_by_level[lvl] for lvl in geo_levels]).reset_index(drop=True)
prev_race_data.head()

Unnamed: 0,Year,Full Name,AI_AN Prev Cases,AI_AN Prev Rates,Asian Prev Cases,Asian Prev Rates,Black Prev Cases,Black Prev Rates,Latino Prev Cases,Latino Prev Rates,Multi Prev Cases,Multi Prev Rates,NH_OPI Prev Cases,NH_OPI Prev Rates,White Prev Cases,White Prev Rates
0,2008,United States,1997,107.4,7064,63.9,316174,1055.3,156622,452.1,39444,1392.2,467,133.4,248451,145.9
1,2009,United States,2076,110.2,7636,67.4,327839,1082.0,163450,457.9,40815,1397.3,507,141.9,255195,149.4
2,2010,United States,2158,119.4,8321,67.0,340290,1108.1,171228,450.0,42135,1183.0,545,136.5,262009,154.6
3,2011,United States,2235,122.2,8972,69.9,351382,1129.9,177916,456.2,43285,1173.0,591,144.3,268131,157.9
4,2012,United States,2349,126.8,9684,73.0,361885,1148.8,184965,463.0,44325,1159.1,630,150.1,274041,161.0


In [17]:
# Save the combined prevalence-by-race table; index=False keeps the row index out of the file
prev_race_data.to_csv('Output/prev_race.csv',index=False)

<h3>Prevalence Data by Transmission</h3>

In [18]:
# Collect every prevalence-by-transmission CSV export and put the paths in alphabetical order
prev_tran_files = glob.glob("Resources/Prev_Tran*.csv")
prev_tran_files.sort()
prev_tran_files

['Resources/Prev_Tran_County_C.csv',
 'Resources/Prev_Tran_National_C.csv',
 'Resources/Prev_Tran_State_C.csv']

In [19]:
# Sanity check: load one raw export and look at how it is laid out.
# The national file is previewed here; to inspect another level, point read_csv at
#   'Resources/Prev_Tran_State_C.csv'  or  'Resources/Prev_Tran_County_C.csv'
prev_tran_test = pd.read_csv('Resources/Prev_Tran_National_C.csv')

# First five rows (use prev_tran_test.columns instead to list just the headers)
prev_tran_test.head()

Unnamed: 0,Year,Geography,Heterosexual contact,Injection drug use,Male-to-male sexual contact,Male-to-male sexual contact and injection drug use,Other
0,,,Cases,Cases,Cases,Cases,Cases
1,2008.0,United States,198986,129675,377059,53476,11842
2,2009.0,United States,207207,128542,396415,53851,12321
3,2010.0,United States,215621,128187,416426,54398,12874
4,2011.0,United States,222657,126951,435787,54703,13233


In [20]:
def parsed(x):
    """Return ``x`` converted to ``int`` when possible, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a float such as ``2008.0`` or a digit string.

    Returns
    -------
    int or object
        ``int(x)`` when the conversion succeeds; the original ``x`` when it
        does not (non-numeric text, NaN, None, infinity, or an object that
        has no integer conversion).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed; anything else (for example a
        # KeyboardInterrupt) now propagates instead of being silently swallowed.
        return x
# Geography levels, in the order their rows are stacked in prev_tran_data
geo_levels = ['National', 'State', 'County']

# Source header -> shortened name (Year keeps its name). Abbreviations:
#   Heterosexual contact = M2F, Injection drug use = ID,
#   Male-to-male sexual contact = M2M,
#   Male-to-male sexual contact and injection drug use = M2MID, Other = O
prev_tran_columns = {'Geography':'Full Name',
                     'Heterosexual contact':'M2F Prev Cases',
                     'Injection drug use':'ID Prev Cases',
                     'Male-to-male sexual contact':'M2M Prev Cases',
                     'Male-to-male sexual contact and injection drug use':'M2MID Prev Cases',
                     'Other':'O Prev Cases'}

prev_tran_by_level = {}
for f in prev_tran_files:
    # Take the level from the file name rather than from the file's position
    # in the list, so the result does not depend on how many files matched
    # or on the order in which they were listed.
    level = next((lvl for lvl in geo_levels if f'_{lvl}' in f), None)
    if level is None:
        raise ValueError(f'Cannot tell the geography level of {f!r}; expected one of {geo_levels}')
    if level in prev_tran_by_level:
        raise ValueError(f'More than one {level} file matched, e.g. {f!r}')

    prev_tran_df = pd.read_csv(f).rename(columns=prev_tran_columns)
    # Row 0 only repeats the 'Cases' sub-header, so skip it;
    # 'Data suppressed' cells become NaN and commas are stripped from text values
    prev_tran_df = prev_tran_df.iloc[1:].replace({'Data suppressed': np.nan, ',': ''}, regex=True)
    # NOTE(review): DataFrame.apply hands parsed() a whole column (Series), not
    # single values - confirm this step does what was intended.
    prev_tran_df = prev_tran_df.apply(parsed)
    # Element-wise: make Year an integer wherever it can be converted
    prev_tran_df['Year'] = prev_tran_df['Year'].apply(parsed)
    prev_tran_by_level[level] = prev_tran_df

missing_levels = [lvl for lvl in geo_levels if lvl not in prev_tran_by_level]
if missing_levels:
    raise ValueError(f'No prevalence-by-transmission file found for: {missing_levels}')

# Stack National, then State, then County rows under a fresh 0..n-1 index
prev_tran_data = pd.concat([prev_tran_by_level[lvl] for lvl in geo_levels]).reset_index(drop=True)
prev_tran_data.head()

Unnamed: 0,Year,Full Name,M2F Prev Cases,ID Prev Cases,M2M Prev Cases,M2MID Prev Cases,O Prev Cases
0,2008,United States,198986,129675,377059,53476,11842
1,2009,United States,207207,128542,396415,53851,12321
2,2010,United States,215621,128187,416426,54398,12874
3,2011,United States,222657,126951,435787,54703,13233
4,2012,United States,228941,125651,455566,55013,13506


In [21]:
# Save the combined prevalence-by-transmission table; index=False keeps the row index out of the file
prev_tran_data.to_csv('Output/prev_tran.csv',index=False)

<h3>Prevalence Data by Sex</h3>

In [22]:
# Collect every prevalence-by-sex CSV export.
# NOTE: the list is not sorted here, so the paths come back in whatever order
# glob yields them; code that relies on list positions should be aware of this.
prev_sex_files = list(glob.iglob("Resources/Prev_Sex*.csv"))
prev_sex_files

['Resources/Prev_Sex_State_CandR.csv',
 'Resources/Prev_Sex_County_CandR.csv',
 'Resources/Prev_Sex_National_CandR.csv']

In [23]:
# Sanity check: load one raw export and look at how it is laid out.
# The national file is previewed here; to inspect another level, point read_csv at
#   'Resources/Prev_Sex_State_CandR.csv'  or  'Resources/Prev_Sex_County_CandR.csv'
prev_sex_test = pd.read_csv('Resources/Prev_Sex_National_CandR.csv')

# First five rows (use prev_sex_test.columns instead to list just the headers)
prev_sex_test.head()

Unnamed: 0,Year,Geography,Female,Unnamed: 3,Male,Unnamed: 5
0,,,Cases,Rate per 100000,Cases,Rate per 100000
1,2008.0,United States,195248,152.3,575790,468.9
2,2009.0,United States,201010,155.5,597326,481.8
3,2010.0,United States,207315,157.8,620191,496
4,2011.0,United States,212212,160.2,641119,508.1


In [24]:
def parsed(x):
    """Return ``x`` converted to ``int`` when possible, otherwise ``x`` unchanged.

    Parameters
    ----------
    x : object
        Value to convert, e.g. a float such as ``2008.0`` or a digit string.

    Returns
    -------
    int or object
        ``int(x)`` when the conversion succeeds; the original ``x`` when it
        does not (non-numeric text, NaN, None, infinity, or an object that
        has no integer conversion).
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are absorbed; anything else (for example a
        # KeyboardInterrupt) now propagates instead of being silently swallowed.
        return x
# Geography levels, in the order their rows are stacked in prev_sex_data
geo_levels = ['National', 'State', 'County']

# Source header -> shortened name (Year keeps its name): Female = F, Male = M.
# Each named column holds the case counts; the 'Unnamed: N' column that
# follows it holds the matching rate per 100000.
prev_sex_columns = {'Geography':'Full Name',
                    'Female':'F Prev Cases','Unnamed: 3':'F Prev Rates',
                    'Male':'M Prev Cases','Unnamed: 5':'M Prev Rates'}

prev_sex_by_level = {}
for f in prev_sex_files:
    # Take the level from the file name rather than from the file's position
    # in the list: prev_sex_files may arrive in any order, and a positional
    # mapping would then stack the levels differently from run to run.
    level = next((lvl for lvl in geo_levels if f'_{lvl}' in f), None)
    if level is None:
        raise ValueError(f'Cannot tell the geography level of {f!r}; expected one of {geo_levels}')
    if level in prev_sex_by_level:
        raise ValueError(f'More than one {level} file matched, e.g. {f!r}')

    prev_sex_df = pd.read_csv(f).rename(columns=prev_sex_columns)
    # Row 0 only repeats the 'Cases' / 'Rate per 100000' sub-headers, so skip it;
    # 'Data suppressed' cells become NaN and commas are stripped from text values
    prev_sex_df = prev_sex_df.iloc[1:].replace({'Data suppressed': np.nan, ',': ''}, regex=True)
    # NOTE(review): DataFrame.apply hands parsed() a whole column (Series), not
    # single values - confirm this step does what was intended.
    prev_sex_df = prev_sex_df.apply(parsed)
    # Element-wise: make Year an integer wherever it can be converted
    prev_sex_df['Year'] = prev_sex_df['Year'].apply(parsed)
    prev_sex_by_level[level] = prev_sex_df

missing_levels = [lvl for lvl in geo_levels if lvl not in prev_sex_by_level]
if missing_levels:
    raise ValueError(f'No prevalence-by-sex file found for: {missing_levels}')

# Stack National, then State, then County rows under a fresh 0..n-1 index
prev_sex_data = pd.concat([prev_sex_by_level[lvl] for lvl in geo_levels]).reset_index(drop=True)
prev_sex_data.head()

Unnamed: 0,Year,Full Name,F Prev Cases,F Prev Rates,M Prev Cases,M Prev Rates
0,2008,United States,195248,152.3,575790,468.9
1,2009,United States,201010,155.5,597326,481.8
2,2010,United States,207315,157.8,620191,496.0
3,2011,United States,212212,160.2,641119,508.1
4,2012,United States,216508,162.0,662169,519.7


In [25]:
# Save the combined prevalence-by-sex table; index=False keeps the row index out of the file
prev_sex_data.to_csv('Output/prev_sex.csv',index=False)

<h3>Database Prevalence</h3>
Merge all of the prevalence tables into a single 'master_prev.csv' file to load into the database.

In [26]:
# Put Population ahead of the case/rate measures; the same seven columns are kept, only their order changes
prev_all_column_order = ['Indicator','Year','Full Name','FIPS','Population','All Prev Cases','All Prev Rates']
prev_all_data = prev_all_data[prev_all_column_order]
prev_all_data.head()

Unnamed: 0,Indicator,Year,Full Name,FIPS,Population,All Prev Cases,All Prev Rates
0,HIV prevalence,2017,United States,,272460904,1001718,367.7
1,HIV prevalence,2016,United States,,270301427,977493,361.6
2,HIV prevalence,2015,United States,,268005522,952705,355.5
3,HIV prevalence,2014,United States,,265732678,927581,349.1
4,HIV prevalence,2013,United States,,263356005,902775,342.8


In [27]:
# Join the breakdown tables onto the overall table one at a time (sex, age,
# race, transmission). Outer joins on Year + Full Name keep rows that are
# present in only some of the tables.
merge_keys = ['Year','Full Name']
prev_merge = prev_all_data.merge(prev_sex_data, how='outer', on=merge_keys)
prev_merge1 = prev_merge.merge(prev_age_data, how='outer', on=merge_keys)
prev_merge2 = prev_merge1.merge(prev_race_data, how='outer', on=merge_keys)
prev_master_data = prev_merge2.merge(prev_tran_data, how='outer', on=merge_keys)
prev_master_data.head()

Unnamed: 0,Indicator,Year,Full Name,FIPS,Population,All Prev Cases,All Prev Rates,F Prev Cases,F Prev Rates,M Prev Cases,...,Multi Prev Rates,NH_OPI Prev Cases,NH_OPI Prev Rates,White Prev Cases,White Prev Rates,M2F Prev Cases,ID Prev Cases,M2M Prev Cases,M2MID Prev Cases,O Prev Cases
0,HIV prevalence,2017,United States,,272460904,1001718,367.7,236589,169.9,765129,...,1027.7,836,177.8,300370,175.2,258162,119245,554159,55318,14834
1,HIV prevalence,2016,United States,,270301427,977493,361.6,232429,168.2,745064,...,1061.7,792,172.0,295439,172.5,252679,120127,534878,55255,14554
2,HIV prevalence,2015,United States,,268005522,952705,355.5,228278,166.6,724427,...,1091.2,765,170.0,290572,169.8,246636,121420,515113,55210,14328
3,HIV prevalence,2014,United States,,265732678,927581,349.1,224285,165.0,703296,...,1117.5,699,158.9,285242,166.9,240788,122629,494900,55179,14085
4,HIV prevalence,2013,United States,,263356005,902775,342.8,220431,163.6,682344,...,1140.9,665,154.5,279730,164.0,234919,124193,474770,55110,13783


In [28]:
# Save the merged master prevalence table; index=False keeps the row index out of the file
prev_master_data.to_csv('Output/master_prev.csv',index=False)

<h3>Load dataframe to database on AWS</h3>
The column data types were then adjusted manually in MySQL Workbench to improve efficiency.

In [29]:
# Write the merged dataframe to the remote database as table 'prev_data',
# sending rows in batches of 500 and leaving the dataframe index out.
# if_exists='replace' overwrites any existing 'prev_data' table on every run.
# NOTE(review): presumably column types adjusted by hand in MySQL Workbench are
# lost each time the table is replaced - confirm.
# TODO: is there a way to append only the rows that are not already present?
prev_master_data.to_sql(
    name='prev_data',
    con=conn,
    if_exists='replace',
    index=False,
    chunksize=500,
)

In [30]:
# Read the table back from the remote database to confirm the upload:
# print how many rows came back, then preview the first five
remote_prev_data = pd.read_sql(sql="SELECT * FROM prev_data", con=conn)
print(remote_prev_data.shape[0])
remote_prev_data.head()

31940


Unnamed: 0,Indicator,Year,Full Name,FIPS,Population,All Prev Cases,All Prev Rates,F Prev Cases,F Prev Rates,M Prev Cases,...,Multi Prev Rates,NH_OPI Prev Cases,NH_OPI Prev Rates,White Prev Cases,White Prev Rates,M2F Prev Cases,ID Prev Cases,M2M Prev Cases,M2MID Prev Cases,O Prev Cases
0,HIV prevalence,2017,United States,,272460904,1001718,367.7,236589,169.9,765129,...,1027.7,836,177.8,300370,175.2,258162,119245,554159,55318,14834
1,HIV prevalence,2016,United States,,270301427,977493,361.6,232429,168.2,745064,...,1061.7,792,172.0,295439,172.5,252679,120127,534878,55255,14554
2,HIV prevalence,2015,United States,,268005522,952705,355.5,228278,166.6,724427,...,1091.2,765,170.0,290572,169.8,246636,121420,515113,55210,14328
3,HIV prevalence,2014,United States,,265732678,927581,349.1,224285,165.0,703296,...,1117.5,699,158.9,285242,166.9,240788,122629,494900,55179,14085
4,HIV prevalence,2013,United States,,263356005,902775,342.8,220431,163.6,682344,...,1140.9,665,154.5,279730,164.0,234919,124193,474770,55110,13783
