# SAA

In [352]:
import pandas as pd
import itertools

##  Intro

### SAA t-1
Let's say we are at time t, we have our fifty states for which we want to prescribe tests with SAA using information of tests sold in time t-1.
Two approches:
- scalar: for a state, take the average on all states for previous time
- vector: points are now vector in dimensions R^50 (for 50 states for instance). Take the vector average, which basicaly mean as we only have a single vector - the one for t-1 - to take exactly the same info as t-1 for each state 

Observe: for scalar: all states will prescribe the same value, namely the average for t-1, whereas it's not the case for vector SAA.


### SAA cumulative

Again there are the two approches: 
- scalar: for each state take the average for all states on all previous step. Again all prescription will thus be the same
- vector: vector average, i.e. here for a given state take the average on all previous step of test stold FOR THIS GIVEN state

## SAA t-1: SAA prescription with training data from t-1 only

In [353]:
data = pd.read_csv('data/final_df_drop.csv')
data.head()

Unnamed: 0,State,Date,population,confirmed,deaths,incident_rate,mortality_rate,testing_rate,TestsReported
0,Alabama,2020-04-12,4903185.0,3563,93,75.99,2.61,460.3,2165
1,Alabama,2020-04-13,4903185.0,3734,99,79.63,2.65,622.36,2626
2,Alabama,2020-04-14,4903185.0,3953,114,84.31,2.88,706.29,2942
3,Alabama,2020-04-15,4903185.0,4075,118,86.91,2.9,726.76,3289
4,Alabama,2020-04-16,4903185.0,4345,133,92.67,3.06,776.11,3079


In [354]:
# Sort by date
data.sort_values(by=['Date', 'State'], inplace=True)

In [355]:
### df for SAA t-1 prescription

# Scalar SAA t-1

SAA_1 = data[['State', 'Date', 'TestsReported']]
# Shift Date to compute the average on t-1
SAA_1['Date'] = pd.to_datetime(SAA_1['Date'])
SAA_1['Shift_Date'] = SAA_1['Date'] + pd.Timedelta(days=1)
average = SAA_1.groupby('Shift_Date')['TestsReported'].mean().round().rename('SAA_t_1_scalar')
# Merge df and averages
SAA_1 = SAA_1.merge(average, left_on='Date', right_on='Shift_Date', how='left')
SAA_1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SAA_1['Date'] = pd.to_datetime(SAA_1['Date'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  SAA_1['Shift_Date'] = SAA_1['Date'] + pd.Timedelta(days=1)


Unnamed: 0,State,Date,TestsReported,Shift_Date,SAA_t_1_scalar
0,Alabama,2020-04-12,2165,2020-04-13,
1,Alaska,2020-04-12,262,2020-04-13,
2,Arizona,2020-04-12,1119,2020-04-13,
3,Arkansas,2020-04-12,625,2020-04-13,
4,California,2020-04-12,10816,2020-04-13,
...,...,...,...,...,...
7171,Virginia,2020-08-27,17503,2020-08-28,19515.0
7172,Washington,2020-08-27,14511,2020-08-28,19515.0
7173,West Virginia,2020-08-27,5681,2020-08-28,19515.0
7174,Wisconsin,2020-08-27,28825,2020-08-28,19515.0


In [356]:
# Vector SAA t-1
SAA_1.sort_values(by=['State', 'Date'], inplace=True)
SAA_1['SAA_t_1_vector'] = SAA_1.groupby('State')['TestsReported'].shift(1)
SAA_1.sort_index(axis=0, inplace=True)
SAA_1

Unnamed: 0,State,Date,TestsReported,Shift_Date,SAA_t_1_scalar,SAA_t_1_vector
0,Alabama,2020-04-12,2165,2020-04-13,,
1,Alaska,2020-04-12,262,2020-04-13,,
2,Arizona,2020-04-12,1119,2020-04-13,,
3,Arkansas,2020-04-12,625,2020-04-13,,
4,California,2020-04-12,10816,2020-04-13,,
...,...,...,...,...,...,...
7171,Virginia,2020-08-27,17503,2020-08-28,19515.0,20820.0
7172,Washington,2020-08-27,14511,2020-08-28,19515.0,12517.0
7173,West Virginia,2020-08-27,5681,2020-08-28,19515.0,4762.0
7174,Wisconsin,2020-08-27,28825,2020-08-28,19515.0,21167.0


## SAA cumul: SAA prescription with cumulative training data

In [357]:
# Scalar SAA cumul

SAA_1['Day_before'] = SAA_1['Date'] + pd.Timedelta(days=-1)
n = len(SAA_1[SAA_1.Date == '2020-04-12'])
cumulative_average_per_date = SAA_1.groupby('Date')['TestsReported'].sum().rename('SAA_cumul_scalar')/n
cumulative_average_per_date.reset_index()
SAA_1 = SAA_1.merge(cumulative_average_per_date, left_on ='Day_before', right_on='Date', how='left')
SAA_1['SAA_cumul_scalar'] = SAA_1['SAA_cumul_scalar'].round()

In [358]:
### df for SAA cumulative prescription

# vectorial SAA cumul

l = []
for date in SAA_1.Date.unique():
    df = SAA_1[SAA_1.Date < date]
    mean_vector = list(df.groupby('State')['TestsReported'].mean().round())
    l.append(mean_vector)

flat_list = list(itertools.chain.from_iterable(l))
SAA_1.dropna(inplace=True)
SAA_1['SAA_cumul_vector'] = flat_list
SAA_1.drop(columns=['Shift_Date', 'Day_before'], inplace=True)
SAA_1 = SAA_1.reset_index(drop=True)


In [359]:
SAA_1

Unnamed: 0,State,Date,TestsReported,SAA_t_1_scalar,SAA_t_1_vector,SAA_cumul_scalar,SAA_cumul_vector
0,Alabama,2020-04-13,2626,2151.0,2165.0,2151.0,2165.0
1,Alaska,2020-04-13,399,2151.0,262.0,2151.0,262.0
2,Arizona,2020-04-13,947,2151.0,1119.0,2151.0,1119.0
3,Arkansas,2020-04-13,293,2151.0,625.0,2151.0,625.0
4,California,2020-04-13,9150,2151.0,10816.0,2151.0,10816.0
...,...,...,...,...,...,...,...
7119,Virginia,2020-08-27,17503,19515.0,20820.0,19515.0,12330.0
7120,Washington,2020-08-27,14511,19515.0,12517.0,19515.0,8037.0
7121,West Virginia,2020-08-27,5681,19515.0,4762.0,19515.0,3275.0
7122,Wisconsin,2020-08-27,28825,19515.0,21167.0,19515.0,13429.0


In [360]:
#SAA_1.to_csv('data/prescription/saa.csv', index=None)