In [71]:
import numpy as np
import pandas as pd
import re
# grab NYT githhub data
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
data = pd.read_csv(url)

In [72]:
# see most recent date entry
data.tail()

Unnamed: 0,date,county,state,fips,cases,deaths
313343,2020-07-09,Sweetwater,Wyoming,56037.0,126,0
313344,2020-07-09,Teton,Wyoming,56039.0,149,1
313345,2020-07-09,Uinta,Wyoming,56041.0,198,0
313346,2020-07-09,Washakie,Wyoming,56043.0,42,5
313347,2020-07-09,Weston,Wyoming,56045.0,1,0


In [73]:
# select entries pertaining to the lone star state
texas = data.loc[data['state'] =='Texas'].sort_values(['fips','date'], ascending=[True, True])
print('number of counties:' , len(texas['fips'].unique()))
# find nan's
print(texas[texas.isna().any(axis=1)])

number of counties: 247
             date   county  state  fips  cases  deaths
10678  2020-03-24  Unknown  Texas   NaN      0       1


In [74]:
#drop nan's
texas=texas.dropna()
texas['fips']=texas['fips'].astype(int)


In [75]:
'''
NYT interactive map nytimes.com/interactive/2020/us/coronavirus-us-cases.html
Disclaimer :the map shows where the average number of reported cases over the past two weeks is increasing, decreasing
or about the same. Counties with an increase in the rate of cases are shaded darker. Counties with fewer than 20 cases
over the past two weeks and parts of a county with a population density lower than 10 people per square mile are not shaded
'''

def classification_average_cases_change(updown,x):
    if updown=='decrease' and x>=15:
        return 'falling'
    
    elif updown=='decrease' and x<15:
        return 'about the same'
    
    elif updown=='change':
        return 'about the same'
    
    elif updown=='increase' and x<50:
        return 'about the same'
    
    elif updown =='increase':
        if x >=50 and x<200:
            return 'great'
        if x >=200 and x<300:
            return 'greater'
        if x >=300:
            return 'greatest'
    else:
        return

county_daily_cases=pd.DataFrame()
county_total=pd.DataFrame()
counties=[]
t=0 # count counties
i=0 # count counties with minimum 2 weeks of data

for fips in texas['fips'].unique():
    t=t+1
    county=pd.DataFrame(texas.loc[data['fips']==fips].sort_values(['date'], ascending=True)).reset_index(drop=True)
#     county['cumulative_deaths']=county['deaths'].cumsum()
    county['cases_per_day']=county['cases'].diff()
    if len(county['cases'])>14:
        i=i+1
        daily_cases = np.zeros(len(county['cases']))
        daily_cases[0] = county['cases'].iloc[0]

        # for loop checks to see if data reports a negative daily cases number.  If it does, the number is recorded as zero instead
        for idx in np.arange(1,len(county['cases'])):
            daily_cases[idx] = max(county['cases'].iloc[idx]-county['cases'].iloc[idx-1],0)
            
        daily_cases=pd.DataFrame(daily_cases,columns=['daily_cases'])
        county['daily_cases']=daily_cases['daily_cases'].values
        
        # collect previous 2 weeks worth of data 
        prev2wk=county.iloc[-14:]
        county_daily_cases=county_daily_cases.append(prev2wk)
        county_total = county_total.append(county)
        # calculate averages
        sevenMA = daily_cases['daily_cases'].rolling(window=7).mean()
        sevenMA1diff = sevenMA.diff(periods=6)
        sevenMA2diff = sevenMA.diff(periods=13)
        twoweekMA = daily_cases['daily_cases'].rolling(window=14).mean()
        twoweekMAdiff = twoweekMA.diff(periods=13)
        county['sevenMA']=sevenMA
        county_total = county_total.append(county)

        # gather information
        counties.append((county['date'].iloc[-1],
                         county['state'].iloc[-1],
                         county['county'].iloc[-1],
                         fips,
                         daily_cases['daily_cases'].iloc[-1],
                         sevenMA.iloc[-1],
                         sevenMA.iloc[-7],
                         sevenMA.iloc[-14],
                         sevenMA1diff.iloc[-1],
                         sevenMA2diff.iloc[-1],
                         twoweekMA.iloc[-1],
                         twoweekMA.iloc[-14],
                         twoweekMAdiff.iloc[-1]
                        ))   
    else:
        print('pass')


counties=pd.DataFrame(counties,columns=['date','state','county','fips','daily_cases','sevenMA','sevenMA1wktrailing',
                                        'sevenMA2wktrailing','sevenMA1wkdiff','sevenMA2wkdiff',
                                        'twoweekMA','twoweekMAtrailing','twoweekMAdiff'])

# counties = counties.loc[counties['twoweekMA']>1.4]
# counties = counties.loc[counties['daily_cases']>100]

# creating categorical values based on averages
counties['avg_7_1wk_change_dir']=counties['sevenMA1wkdiff'].map(lambda x: 'increase' if x>0 else('change' if x==0 else 'decrease'))
counties['avg_7_2wk_change_dir']=counties['sevenMA2wkdiff'].map(lambda x: 'increase' if x>0 else('change' if x==0 else 'decrease'))
# address dividing by zero
# counties['sevenMA1wktrailing']=counties['sevenMA1wktrailing'].replace(0,1e-6)
# counties['sevenMA2wktrailing']=counties['sevenMA2wktrailing'].replace(0,1e-6)

counties['avg_14_change_dir']=counties['twoweekMAdiff'].map(lambda x: 'increase' if x>0 else('change' if x==0 else 'decrease'))
# counties['twoweekMAtrailing']=counties['twoweekMAtrailing'].replace(0,1e-6)



counties['avg_7_1wk_pct_change']= 100*counties['sevenMA1wkdiff'].abs() / counties['sevenMA1wktrailing']
counties['avg_7_2wk_pct_change']= 100*counties['sevenMA2wkdiff'].abs() / counties['sevenMA2wktrailing']
counties['avg_14_pct_change']= 100*counties['twoweekMAdiff'].abs() / counties['twoweekMAtrailing']

counties['case7_1wk_change'] = counties.apply(lambda x: classification_average_cases_change(x['avg_7_1wk_change_dir'], x['avg_7_1wk_pct_change']), axis=1)
counties['case7_2wk_change'] = counties.apply(lambda x: classification_average_cases_change(x['avg_7_2wk_change_dir'], x['avg_7_2wk_pct_change']), axis=1)
counties['case14_change'] = counties.apply(lambda x: classification_average_cases_change(x['avg_14_change_dir'], x['avg_14_pct_change']), axis=1)
print(counties.shape)

pass
pass
pass
(243, 22)


In [76]:
counties.head()


Unnamed: 0,date,state,county,fips,daily_cases,sevenMA,sevenMA1wktrailing,sevenMA2wktrailing,sevenMA1wkdiff,sevenMA2wkdiff,...,twoweekMAdiff,avg_7_1wk_change_dir,avg_7_2wk_change_dir,avg_14_change_dir,avg_7_1wk_pct_change,avg_7_2wk_pct_change,avg_14_pct_change,case7_1wk_change,case7_2wk_change,case14_change
0,2020-07-09,Texas,Anderson,48001,381.0,67.428571,8.0,1.571429,59.428571,65.857143,...,-28.642857,increase,increase,decrease,742.857143,4190.909091,43.445287,greatest,greatest,falling
1,2020-07-09,Texas,Andrews,48003,12.0,6.142857,4.857143,3.0,1.285714,3.142857,...,3.714286,increase,increase,increase,26.470588,104.761905,179.310345,about the same,great,great
2,2020-07-09,Texas,Angelina,48005,38.0,20.285714,11.0,14.0,9.285714,6.285714,...,5.928571,increase,increase,increase,84.415584,44.897959,53.548387,great,about the same,great
3,2020-07-09,Texas,Aransas,48007,3.0,3.0,2.714286,0.714286,0.285714,2.285714,...,2.357143,increase,increase,increase,10.526316,320.0,550.0,about the same,greatest,greatest
4,2020-07-09,Texas,Archer,48009,0.0,0.428571,0.428571,0.285714,0.0,0.142857,...,0.285714,change,increase,increase,0.0,50.0,200.0,about the same,great,greater


In [89]:
len(county_total.loc[county_total['cases_per_day']<0]) # number of negative daily reported cases

436

In [78]:
# read texas population estimates
# https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/
pop2019=pd.read_csv('co-est2019-alldata.csv', encoding='latin-1')
pop2019.head()

Unnamed: 0,SUMLEV,REGION,DIVISION,STATE,COUNTY,STNAME,CTYNAME,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,...,RDOMESTICMIG2019,RNETMIG2011,RNETMIG2012,RNETMIG2013,RNETMIG2014,RNETMIG2015,RNETMIG2016,RNETMIG2017,RNETMIG2018,RNETMIG2019
0,40,3,6,1,0,Alabama,Alabama,4779736,4780125,4785437,...,1.917501,0.578434,1.186314,1.522549,0.563489,0.626357,0.745172,1.090366,1.773786,2.483744
1,50,3,6,1,1,Alabama,Autauga County,54571,54597,54773,...,4.84731,6.018182,-6.226119,-3.902226,1.970443,-1.712875,4.777171,0.849656,0.540916,4.560062
2,50,3,6,1,3,Alabama,Baldwin County,182265,182265,183112,...,24.017829,16.64187,17.488579,22.751474,20.184334,17.725964,21.279291,22.398256,24.727215,24.380567
3,50,3,6,1,5,Alabama,Barbour County,27457,27455,27327,...,-5.690302,0.292676,-6.897817,-8.132185,-5.140431,-15.724575,-18.238016,-24.998528,-8.754922,-5.165664
4,50,3,6,1,7,Alabama,Bibb County,22915,22915,22870,...,1.385134,-4.998356,-3.787545,-5.797999,1.331144,1.329817,-0.708717,-3.234669,-6.857092,1.831952


In [79]:
# grab texas
txpop2019=pop2019.loc[pop2019['STNAME']=='Texas'][['CTYNAME','POPESTIMATE2019']]
txpop2019['CTYNAME']=txpop2019['CTYNAME'].astype(str)
# top row is entire state population
texaspop=txpop2019.iloc[0]
print(texaspop)
txpop2019=txpop2019.drop(txpop2019.index[0])
new_name= [string.replace(' County', '') for string in txpop2019['CTYNAME']]
txpop2019['county']=new_name
txpop2019=txpop2019.drop(['CTYNAME'],axis=1)
txpop2019=txpop2019.rename(columns={'POPESTIMATE2019':'pop_est_2019'})
txpop2019.head()

CTYNAME               Texas
POPESTIMATE2019    28995881
Name: 2566, dtype: object


Unnamed: 0,pop_est_2019,county
2567,57735,Anderson
2568,18705,Andrews
2569,86715,Angelina
2570,23510,Aransas
2571,8553,Archer


In [80]:
# http://www.texascounties.net/statistics/landarea.htm
# copy and paste data into csv file
tx_landsize = pd.read_csv('txlandarea.csv')

In [81]:
tx_landsize.head()

Unnamed: 0,county,size
0,Anderson,1062.6
1,Andrews,1500.7
2,Angelina,797.8
3,Aransas,252.1
4,Archer,903.3


In [82]:
# merge dataframes
texas_counties=pd.merge(pd.merge(counties,txpop2019, on='county'),tx_landsize, on='county')


In [83]:
texas_counties.head()

Unnamed: 0,date,state,county,fips,daily_cases,sevenMA,sevenMA1wktrailing,sevenMA2wktrailing,sevenMA1wkdiff,sevenMA2wkdiff,...,avg_7_2wk_change_dir,avg_14_change_dir,avg_7_1wk_pct_change,avg_7_2wk_pct_change,avg_14_pct_change,case7_1wk_change,case7_2wk_change,case14_change,pop_est_2019,size
0,2020-07-09,Texas,Anderson,48001,381.0,67.428571,8.0,1.571429,59.428571,65.857143,...,increase,decrease,742.857143,4190.909091,43.445287,greatest,greatest,falling,57735,1062.6
1,2020-07-09,Texas,Andrews,48003,12.0,6.142857,4.857143,3.0,1.285714,3.142857,...,increase,increase,26.470588,104.761905,179.310345,about the same,great,great,18705,1500.7
2,2020-07-09,Texas,Angelina,48005,38.0,20.285714,11.0,14.0,9.285714,6.285714,...,increase,increase,84.415584,44.897959,53.548387,great,about the same,great,86715,797.8
3,2020-07-09,Texas,Aransas,48007,3.0,3.0,2.714286,0.714286,0.285714,2.285714,...,increase,increase,10.526316,320.0,550.0,about the same,greatest,greatest,23510,252.1
4,2020-07-09,Texas,Archer,48009,0.0,0.428571,0.428571,0.285714,0.0,0.142857,...,increase,increase,0.0,50.0,200.0,about the same,great,greater,8553,903.3


In [84]:
# calculate population density
texas_counties['pop_density'] = texas_counties['pop_est_2019'] / texas_counties['size']

In [88]:
# texas_counties[texas_counties.isna().any(axis=1)]
# county_total.loc[county_total['county']=='Culberson']
600/7

85.71428571428571

In [86]:
# save files
texas_counties.to_csv('texas_counties.csv', index=False)
county_daily_cases.to_csv('texas_daily_cases.csv', index=False)