In [44]:
import numpy as np
import pandas as pd
import re
# grab NYT githhub data
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
data = pd.read_csv(url)

In [45]:
# see most recent date entry
data.tail()

Unnamed: 0,date,county,state,fips,cases,deaths
297914,2020-07-04,Sweetwater,Wyoming,56037.0,102,0
297915,2020-07-04,Teton,Wyoming,56039.0,137,1
297916,2020-07-04,Uinta,Wyoming,56041.0,183,0
297917,2020-07-04,Washakie,Wyoming,56043.0,38,5
297918,2020-07-04,Weston,Wyoming,56045.0,2,0


In [46]:
# select entries pertaining to the lone star state
texas = data.loc[data['state'] =='Texas'].sort_values(['fips','date'], ascending=[True, True])
print('number of counties:' , len(texas['fips'].unique()))
# find nan's
print(texas[texas.isna().any(axis=1)])

number of counties: 247
             date   county  state  fips  cases  deaths
10678  2020-03-24  Unknown  Texas   nan      0       1


In [47]:
#drop nan's
texas=texas.dropna()
texas['fips']=texas['fips'].astype(int)


In [48]:
'''
NYT interactive map nytimes.com/interactive/2020/us/coronavirus-us-cases.html
Disclaimer :the map shows where the average number of reported cases over the past two weeks is increasing, decreasing
or about the same. Counties with an increase in the rate of cases are shaded darker. Counties with fewer than 20 cases
over the past two weeks and parts of a county with a population density lower than 10 people per square mile are not shaded
'''
county_daily_cases=pd.DataFrame()
counties=[]
t=0 # count counties
i=0 # count counties with minimum 2 weeks of data

for fips in texas['fips'].unique():
    t=t+1
    county=pd.DataFrame(texas.loc[data['fips']==fips].sort_values(['date'], ascending=True)).reset_index(drop=True)
    
    if len(county['cases'])>14:
        i=i+1
        daily_cases = np.zeros(len(county['cases']))
        daily_cases[0] = county['cases'].iloc[0]

        # for loop checks to see if data reports a negative daily cases number.  If it does, the number is recorded as zero instead
        for idx in np.arange(1,len(county['cases'])):
            daily_cases[idx] = max(county['cases'].iloc[idx]-county['cases'].iloc[idx-1],0)
            
        daily_cases=pd.DataFrame(daily_cases,columns=['daily_cases'])
        county['daily_cases']=daily_cases['daily_cases'].values
        
        # collect previous 2 weeks worth of data 
        prev2wk=county.iloc[-14:]
        county_daily_cases=county_daily_cases.append(prev2wk)

        # calculate averages
        sevenMA = daily_cases['daily_cases'].rolling(window=7).mean()
        sevenMA1diff = sevenMA.diff(periods=7)
        sevenMA2diff = sevenMA.diff(periods=14)
        twoweekMA = daily_cases['daily_cases'].rolling(window=14).mean()
        twoweekMAdiff = twoweekMA.diff(periods=14)
        
        # gather information
        counties.append((county['date'].iloc[-1],
                         county['state'].iloc[-1],
                         county['county'].iloc[-1],
                         fips,
                         daily_cases['daily_cases'].iloc[-1],
                         sevenMA.iloc[-1],
                         sevenMA.iloc[-8],
                         sevenMA.iloc[-15],
                         sevenMA1diff.iloc[-1],
                         sevenMA2diff.iloc[-1],
                         twoweekMA.iloc[-1],
                         twoweekMA.iloc[-15],
                         twoweekMAdiff.iloc[-1]
                        ))   


counties=pd.DataFrame(counties,columns=['date','state','county','fips','daily_cases','sevenMA','sevenMA1wktrailing',
                                        'sevenMA2wktrailing','sevenMA1wkdiff','sevenMA2wkdiff',
                                        'twoweekMA','twoweekMAtrailing','twoweekMAdiff'])

# counties = counties.loc[counties['twoweekMA']>1.4]
# counties = counties.loc[counties['daily_cases']>100]

# creating categorical values based on averages
counties['avg_7_1wk_change_dir']=counties['sevenMA1wkdiff'].map(lambda x: 'UP' if x>0 else('SAME' if x==0 else 'DOWN'))
counties['avg_7_2wk_change_dir']=counties['sevenMA2wkdiff'].map(lambda x: 'UP' if x>0 else('SAME' if x==0 else 'DOWN'))
# address dividing by zero
counties['sevenMA1wktrailing']=counties['sevenMA1wktrailing'].replace(0,1e-6)
counties['sevenMA2wktrailing']=counties['sevenMA2wktrailing'].replace(0,1e-6)

counties['avg_14_change_dir']=counties['twoweekMAdiff'].map(lambda x: 'UP' if x>0 else('SAME' if x==0 else 'DOWN'))
counties['twoweekMAtrailing']=counties['twoweekMAtrailing'].replace(0,1e-6)



counties['avg_7_1wk_pct_change']= 100*counties['sevenMA1wkdiff'].abs() / counties['sevenMA1wktrailing']
counties['avg_7_2wk_pct_change']= 100*counties['sevenMA2wkdiff'].abs() / counties['sevenMA2wktrailing']
counties['avg_14_pct_change']= 100*counties['twoweekMAdiff'].abs() / counties['twoweekMAtrailing']

def classification_average_cases_change(updown,x):
    if updown=='UP':
        if x <=25:
            return '0-25% increase'
        elif x >25 and x <=50:
            return '26-50% increase'
        elif x >50 and x <=75:
            return '51-75% increase'
        elif x >75 and x <=100:
            return '76-100% increase'
        else:
            return 'more than 100% increase'
        
    elif updown=='SAME':
        return 'same'
    
    else:
        return 'decrease'

counties['case7_1wk_change'] = counties.apply(lambda x: classification_average_cases_change(x['avg_7_1wk_change_dir'], x['avg_7_1wk_pct_change']), axis=1)
counties['case7_2wk_change'] = counties.apply(lambda x: classification_average_cases_change(x['avg_7_2wk_change_dir'], x['avg_7_2wk_pct_change']), axis=1)
counties['case14_change'] = counties.apply(lambda x: classification_average_cases_change(x['avg_14_change_dir'], x['avg_14_pct_change']), axis=1)
print(counties.shape)

(240, 22)


In [49]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)
counties.head()


Unnamed: 0,date,state,county,fips,daily_cases,sevenMA,sevenMA1wktrailing,sevenMA2wktrailing,sevenMA1wkdiff,sevenMA2wkdiff,...,twoweekMAdiff,avg_7_1wk_change_dir,avg_7_2wk_change_dir,avg_14_change_dir,avg_7_1wk_pct_change,avg_7_2wk_pct_change,avg_14_pct_change,case7_1wk_change,case7_2wk_change,case14_change
0,2020-07-04,Texas,Anderson,48001,0.0,8.0,1.57,129.29,6.43,-121.29,...,-61.07,UP,DOWN,DOWN,409.09,93.81,92.73,more than 100% increase,decrease,decrease
1,2020-07-04,Texas,Andrews,48003,8.0,5.0,3.0,2.0,2.0,3.0,...,2.93,UP,UP,UP,66.67,150.0,273.33,51-75% increase,more than 100% increase,more than 100% increase
2,2020-07-04,Texas,Angelina,48005,0.0,11.0,12.0,10.14,-1.0,0.86,...,1.29,DOWN,UP,UP,8.33,8.45,12.59,decrease,0-25% increase,0-25% increase
3,2020-07-04,Texas,Aransas,48007,3.0,2.57,1.29,0.14,1.29,2.43,...,1.79,UP,UP,UP,100.0,1700.0,1250.0,76-100% increase,more than 100% increase,more than 100% increase
4,2020-07-04,Texas,Archer,48009,0.0,0.29,0.43,0.0,-0.14,0.29,...,0.29,DOWN,UP,UP,33.33,28571428.57,400.0,decrease,more than 100% increase,more than 100% increase


In [50]:
# read texas population estimates
# https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/
pop2019=pd.read_csv('co-est2019-alldata.csv', encoding='latin-1')
pop2019.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.92,0.58,1.19,1.52,0.56,0.63,0.75,1.09,1.77,2.48
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.85,6.02,-6.23,-3.9,1.97,-1.71,4.78,0.85,0.54,4.56
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.02,16.64,17.49,22.75,20.18,17.73,21.28,22.4,24.73,24.38
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.69,0.29,-6.9,-8.13,-5.14,-15.72,-18.24,-25.0,-8.75,-5.17
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.39,-5.0,-3.79,-5.8,1.33,1.33,-0.71,-3.23,-6.86,1.83


In [51]:
# grab texas
txpop2019=pop2019.loc[pop2019['STNAME']=='Texas'][['CTYNAME','POPESTIMATE2019']]
txpop2019['CTYNAME']=txpop2019['CTYNAME'].astype(str)
# top row is entire state population
texaspop=txpop2019.iloc[0]
print(texaspop)
txpop2019=txpop2019.drop(txpop2019.index[0])
new_name= [string.replace(' County', '') for string in txpop2019['CTYNAME']]
txpop2019['county']=new_name
txpop2019=txpop2019.drop(['CTYNAME'],axis=1)
txpop2019=txpop2019.rename(columns={'POPESTIMATE2019':'pop_est_2019'})
txpop2019.head()

Unnamed: 0,pop_est_2019,county
2567,57735,Anderson
2568,18705,Andrews
2569,86715,Angelina
2570,23510,Aransas
2571,8553,Archer


In [52]:
# http://www.texascounties.net/statistics/landarea.htm
# copy and paste data into csv file
tx_landsize = pd.read_csv('txlandarea.csv')

In [53]:
tx_landsize.head()

Unnamed: 0,county,size
0,Anderson,1062.6
1,Andrews,1500.7
2,Angelina,797.8
3,Aransas,252.1
4,Archer,903.3


In [54]:
texas_counties=pd.merge(pd.merge(counties,txpop2019, on='county'),tx_landsize, on='county')


In [55]:
texas_counties.head()

Unnamed: 0,date,state,county,fips,daily_cases,sevenMA,sevenMA1wktrailing,sevenMA2wktrailing,sevenMA1wkdiff,sevenMA2wkdiff,...,avg_7_2wk_change_dir,avg_14_change_dir,avg_7_1wk_pct_change,avg_7_2wk_pct_change,avg_14_pct_change,case7_1wk_change,case7_2wk_change,case14_change,pop_est_2019,size
0,2020-07-04,Texas,Anderson,48001,0.0,8.0,1.57,129.29,6.43,-121.29,...,DOWN,DOWN,409.09,93.81,92.73,more than 100% increase,decrease,decrease,57735,1062.6
1,2020-07-04,Texas,Andrews,48003,8.0,5.0,3.0,2.0,2.0,3.0,...,UP,UP,66.67,150.0,273.33,51-75% increase,more than 100% increase,more than 100% increase,18705,1500.7
2,2020-07-04,Texas,Angelina,48005,0.0,11.0,12.0,10.14,-1.0,0.86,...,UP,UP,8.33,8.45,12.59,decrease,0-25% increase,0-25% increase,86715,797.8
3,2020-07-04,Texas,Aransas,48007,3.0,2.57,1.29,0.14,1.29,2.43,...,UP,UP,100.0,1700.0,1250.0,76-100% increase,more than 100% increase,more than 100% increase,23510,252.1
4,2020-07-04,Texas,Archer,48009,0.0,0.29,0.43,0.0,-0.14,0.29,...,UP,UP,33.33,28571428.57,400.0,decrease,more than 100% increase,more than 100% increase,8553,903.3


In [56]:
# calculate population density
texas_counties['pop_density'] = texas_counties['pop_est_2019'] / texas_counties['size']

In [57]:
# save files
texas_counties.to_csv('texas_counties.csv', index=False)
county_daily_cases.to_csv('texas_daily_cases.csv', index=False)