## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Relative path to the combined WHO/NREVSS flu surveillance CSV
WHO_results_path = "FluViewPhase2Data/WHO_NREVSS_Combined_prior_to_2015_16.csv"

# Drop the first line of the file so that the following line is read as the
# column header (REGION TYPE, REGION, YEAR, ...)
WHO_df = pd.read_csv(WHO_results_path, skiprows=[0])

# Show column names, non-null counts and dtypes of the raw frame
WHO_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14094 entries, 0 to 14093
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   REGION TYPE                  14094 non-null  object
 1   REGION                       14094 non-null  object
 2   YEAR                         14094 non-null  int64 
 3   WEEK                         14094 non-null  int64 
 4   TOTAL SPECIMENS              14094 non-null  object
 5   PERCENT POSITIVE             14094 non-null  object
 6   A (2009 H1N1)                14094 non-null  object
 7   A (H1)                       14094 non-null  object
 8   A (H3)                       14094 non-null  object
 9   A (Subtyping not Performed)  14094 non-null  object
 10  A (Unable to Subtype)        14094 non-null  object
 11  B                            14094 non-null  object
 12  H3N2v                        14094 non-null  object
dtypes: int64(2), object(11)
memory 

In [11]:
# The result columns are read in as strings because some cells hold the
# literal "X" instead of a number, which blocks a direct numeric conversion.
# NOTE(review): "X" presumably marks data that was not reported - TODO confirm.
# Replacing it with 0 means those rows count as zero specimens/positives in
# every statistic computed later (mean, median, ...).
RESULT_COLUMNS = [
    'TOTAL SPECIMENS',
    'PERCENT POSITIVE',
    'A (2009 H1N1)',
    'A (H1)',
    'A (H3)',
    'A (Subtyping not Performed)',
    'A (Unable to Subtype)',
    'B',
    'H3N2v',
]

# Work on a copy so the raw frame stays untouched and this cell can be re-run.
WHO_df_clean = WHO_df.copy()

# Swap "X" for 0 and convert to numbers, one result column at a time. Only the
# result columns are touched, so the text columns (REGION TYPE, REGION) can
# never be altered by the "X" replacement.
for column in RESULT_COLUMNS:
    WHO_df_clean[column] = pd.to_numeric(WHO_df_clean[column].replace('X', 0))

WHO_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14094 entries, 0 to 14093
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   REGION TYPE                  14094 non-null  object 
 1   REGION                       14094 non-null  object 
 2   YEAR                         14094 non-null  int64  
 3   WEEK                         14094 non-null  int64  
 4   TOTAL SPECIMENS              14094 non-null  int64  
 5   PERCENT POSITIVE             14094 non-null  float64
 6   A (2009 H1N1)                14094 non-null  int64  
 7   A (H1)                       14094 non-null  int64  
 8   A (H3)                       14094 non-null  int64  
 9   A (Subtyping not Performed)  14094 non-null  int64  
 10  A (Unable to Subtype)        14094 non-null  int64  
 11  B                            14094 non-null  int64  
 12  H3N2v                        14094 non-null  int64  
dtypes: float64(1), int64(10), object(2)

In [16]:
# Per-row positive counts: all influenza A columns, influenza B, and both combined
a_type_columns = [
    'A (2009 H1N1)',
    'A (H1)',
    'A (H3)',
    'A (Subtyping not Performed)',
    'A (Unable to Subtype)',
    'H3N2v',
]

# skipna=False keeps the behaviour of plain "+": a missing value in any
# source column gives a missing total instead of being silently skipped
WHO_df_clean['Total A Positives'] = WHO_df_clean[a_type_columns].sum(axis=1, skipna=False)

# Only one B column exists, so the B total is that column as-is
WHO_df_clean['Total B Positives'] = WHO_df_clean['B']

WHO_df_clean['Total Positives'] = WHO_df_clean['Total A Positives'].add(
    WHO_df_clean['Total B Positives']
)

WHO_df_clean.head()

Unnamed: 0,REGION TYPE,REGION,YEAR,WEEK,TOTAL SPECIMENS,PERCENT POSITIVE,A (2009 H1N1),A (H1),A (H3),A (Subtyping not Performed),A (Unable to Subtype),B,H3N2v,Total A Positives,Total B Positives,Total Positives
0,States,Alabama,2010,40,54,0.0,0,0,0,0,0,0,0,0,0,0
1,States,Alaska,2010,40,40,0.0,0,0,0,0,0,0,0,0,0,0
2,States,Arizona,2010,40,40,2.5,0,0,1,0,0,0,0,1,0,1
3,States,Arkansas,2010,40,15,0.0,0,0,0,0,0,0,0,0,0,0
4,States,California,2010,40,183,3.28,2,0,3,0,0,1,0,5,1,6


In [18]:
# States kept for the analysis; edit this list to change the selection
STATES_OF_INTEREST = [
    'New York',
    'Illinois',
    'Texas',
    'Arizona',
    'Georgia',
    'Pennsylvania',
    'California',
    'Washington',
    'Colorado',
]

# Keep only rows whose REGION is one of the selected states. isin() replaces
# the chain of "==" / "|" comparisons and selects exactly the same rows.
# .copy() makes filtered_df an independent frame, so adding or changing
# columns on it later cannot trigger SettingWithCopyWarning.
filtered_df = WHO_df_clean.loc[WHO_df_clean['REGION'].isin(STATES_OF_INTEREST)].copy()

# Non-null row count per column for the filtered selection
filtered_df.count()

REGION TYPE                    2349
REGION                         2349
YEAR                           2349
WEEK                           2349
TOTAL SPECIMENS                2349
PERCENT POSITIVE               2349
A (2009 H1N1)                  2349
A (H1)                         2349
A (H3)                         2349
A (Subtyping not Performed)    2349
A (Unable to Subtype)          2349
B                              2349
H3N2v                          2349
Total A Positives              2349
Total B Positives              2349
Total Positives                2349
dtype: int64

## Summary Statistics

In [20]:
# Per-state summary of total positives: mean, median, variance,
# standard deviation and standard error of the mean
state_group = filtered_df.groupby(by=['REGION'])
stat_names = ['mean', 'median', 'var', 'std', 'sem']
summary_statistics = state_group.agg({'Total Positives': stat_names})
summary_statistics

Unnamed: 0_level_0,Total Positives,Total Positives,Total Positives,Total Positives,Total Positives
Unnamed: 0_level_1,mean,median,var,std,sem
REGION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Arizona,37.153257,6,4296.184114,65.545283,4.057151
California,64.168582,9,29831.417625,172.71774,10.69096
Colorado,42.789272,6,8932.951577,94.514293,5.850288
Georgia,66.62069,12,13545.120955,116.383508,7.203958
Illinois,21.881226,2,3112.520454,55.789967,3.453312
New York,98.065134,9,31073.945741,176.278035,10.911336
Pennsylvania,41.727969,4,6486.421869,80.538325,4.985197
Texas,117.1341,14,52042.201179,228.127598,14.120744
Washington,29.099617,7,2392.551577,48.913716,3.027683
