# Indicators of Anxiety and Depression Based on Reported Frequency of Symptoms

## Clean & Prepare Data

In [1]:
#import libraries

import pandas as pd   
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

In [2]:
#load dataset

df = pd.read_csv('AnxietyData.csv', parse_dates=['Time Period Start Date', 'Time Period End Date'])

In [3]:
#display data

df.head()

Unnamed: 0,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Label,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.5,22.7,24.3,22.7 - 24.3,
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,32.7,30.2,35.2,30.2 - 35.2,
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,25.7,24.1,27.3,24.1 - 27.3,
3,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,24.8,23.3,26.2,23.3 - 26.2,
4,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.2,21.5,25.0,21.5 - 25.0,


In [5]:
#check for missing values

df.isnull().sum()

Indicator                    0
Group                        0
State                        0
Subgroup                     0
Phase                        0
Time Period                  0
Time Period Label            0
Time Period Start Date       0
Time Period End Date         0
Value                      297
Low CI                     297
High CI                    297
Confidence Interval        297
Quartile Range            2862
dtype: int64

In [6]:
#groups that have missing values in the Value column

na_groups = df['Group'].unique()

na_groups

array(['National Estimate', 'By Age', 'By Sex',
       'By Race/Hispanic ethnicity', 'By Education', 'By State',
       'By Disability status', 'By Gender identity',
       'By Sexual orientation'], dtype=object)

In [7]:
#create function that checks the missing values of any group  

def check_na_count(group):
    groupdf = df[df['Group'] == group]
    length_of_na = len(groupdf[groupdf['Value'].isna()])

    length_of_df = len(groupdf)

    na_as_percentage = round((length_of_na/length_of_df),2)

    outputstring = f"In {group} out of {length_of_df} values, there are {length_of_na} missing values. {na_as_percentage}% missing" 

    return outputstring

In [8]:

check_na_count('National Estimate')

##in the original EDA notebook, I ran a cell for each group. 
##to make this cleaner, I want to create a loop 
##that loops through each group and outputs its own f-string 

##it was determined that the missing values were not significant enough

'In National Estimate out of 138 values, there are 15 missing values. 0.11% missing'

In [9]:
df['Phase'].unique()

#values are inconsistent

array(['1', '-1', '2', '3 (Oct 28 � Dec 21)', '3 (Jan 6 � Mar 29)', '3.1',
       '3.2', '3.3'], dtype=object)

In [11]:
#create a new column with renamed / correct Phase values (call it Phases)

#create a list of conditions
conditions = [ 
    (df['Time Period Start Date'] >= '2020-04-23') & (df['Time Period End Date'] <='2020-07-21'), #1. Phase 1   04/23/2020 - 07/21/2020
    ((df['Time Period Start Date'] >= '2020-08-19') & (df['Time Period End Date'] <='2020-10-26')), #2. Phase 2   08/19/2020 - 10/26/2020
    ((df['Time Period Start Date'] >= '2020-10-28') & (df['Time Period End Date'] <='2020-12-21')), #3. Phase 3   10/28/2020 - 12/21/2020
    ((df['Time Period Start Date'] >= '2021-01-06') & (df['Time Period End Date'] <= '2021-03-29')), #4. Phase 3 (con't) 01/06/2021 - 03/29/2021   
    ((df['Time Period Start Date'] >= '2021-04-14') & (df['Time Period End Date'] <='2021-07-05')), #5. Phase 3.1 04/14/2021 - 07/05/2021
    ((df['Time Period Start Date'] >= '2021-07-21') & (df['Time Period End Date'] <='2021-10-11')), #6. Phase 3.2 07/21/2021 - 10/11/2021
    ((df['Time Period Start Date'] >= '2021-12-01') & (df['Time Period End Date'] <='2022-02-07'))  #7 Phase 3.3 Phase 3.3 12/01/2021 - 02/07/2022
    
    ]
#create a list of the values we want to assign to each condition
values = ['Phase 1','Phase 2','Phase 3','Phase 4', 'Phase 5','Phase 6', 'Phase 7']

#create a new column and use np.select to assign values from the list as arguments
df['Phases'] = np.select(conditions, values)

#display updated dataframe
df.head()

Unnamed: 0,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Label,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range,Phases
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.5,22.7,24.3,22.7 - 24.3,,Phase 1
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,32.7,30.2,35.2,30.2 - 35.2,,Phase 1
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,25.7,24.1,27.3,24.1 - 27.3,,Phase 1
3,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,24.8,23.3,26.2,23.3 - 26.2,,Phase 1
4,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.2,21.5,25.0,21.5 - 25.0,,Phase 1


In [12]:
#display the renamed Phases
df['Phases'].unique()

array(['Phase 1', '0', 'Phase 2', 'Phase 3', 'Phase 4', 'Phase 5',
       'Phase 6', 'Phase 7'], dtype=object)

In [13]:
#what is element '0' 

zero_phase_value = df[df['Phases'] == '0']

In [14]:
zero_phase_value['Time Period Start Date'].unique()

#these dates correspond to Break periods

array(['2020-07-22T00:00:00.000000000', '2020-12-22T00:00:00.000000000',
       '2021-03-30T00:00:00.000000000', '2021-07-06T00:00:00.000000000',
       '2021-10-12T00:00:00.000000000'], dtype='datetime64[ns]')

In [15]:
zero_phase_value['Time Period End Date'].unique()

#these dates correspond to Break periods

array(['2020-08-18T00:00:00.000000000', '2021-01-05T00:00:00.000000000',
       '2021-04-13T00:00:00.000000000', '2021-07-20T00:00:00.000000000',
       '2021-11-30T00:00:00.000000000'], dtype='datetime64[ns]')

In [16]:
#remove any rows that correspond to survey breaks

#removing unwanted rows by filtering
df = df[df['Phases'] != '0']

#display updated dataframe
df.head()

Unnamed: 0,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Label,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range,Phases
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.5,22.7,24.3,22.7 - 24.3,,Phase 1
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,32.7,30.2,35.2,30.2 - 35.2,,Phase 1
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,25.7,24.1,27.3,24.1 - 27.3,,Phase 1
3,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,24.8,23.3,26.2,23.3 - 26.2,,Phase 1
4,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.2,21.5,25.0,21.5 - 25.0,,Phase 1


In [17]:
#remove rows that are National Estimates

df = df[df['Group'] != 'National Estimate']

#display df
df.head()

Unnamed: 0,Indicator,Group,State,Subgroup,Phase,Time Period,Time Period Label,Time Period Start Date,Time Period End Date,Value,Low CI,High CI,Confidence Interval,Quartile Range,Phases
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,32.7,30.2,35.2,30.2 - 35.2,,Phase 1
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,25.7,24.1,27.3,24.1 - 27.3,,Phase 1
3,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,24.8,23.3,26.2,23.3 - 26.2,,Phase 1
4,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,23.2,21.5,25.0,21.5 - 25.0,,Phase 1
5,Symptoms of Depressive Disorder,By Age,United States,60 - 69 years,1,1,"Apr 23 - May 5, 2020",2020-04-23,2020-05-05,18.4,17.0,19.7,17.0 - 19.7,,Phase 1


In [18]:
#create a new df with relevant columns
clean_df = df[['Indicator','State','Group', 'Subgroup','Phases', 'Time Period Label', 'Value']]

#display new df
clean_df

Unnamed: 0,Indicator,State,Group,Subgroup,Phases,Time Period Label,Value
1,Symptoms of Depressive Disorder,United States,By Age,18 - 29 years,Phase 1,"Apr 23 - May 5, 2020",32.7
2,Symptoms of Depressive Disorder,United States,By Age,30 - 39 years,Phase 1,"Apr 23 - May 5, 2020",25.7
3,Symptoms of Depressive Disorder,United States,By Age,40 - 49 years,Phase 1,"Apr 23 - May 5, 2020",24.8
4,Symptoms of Depressive Disorder,United States,By Age,50 - 59 years,Phase 1,"Apr 23 - May 5, 2020",23.2
5,Symptoms of Depressive Disorder,United States,By Age,60 - 69 years,Phase 1,"Apr 23 - May 5, 2020",18.4
...,...,...,...,...,...,...,...
9130,Symptoms of Anxiety Disorder or Depressive Dis...,Virginia,By State,Virginia,Phase 7,"Dec 29, 2021 - Jan 10, 2022",35.4
9131,Symptoms of Anxiety Disorder or Depressive Dis...,Washington,By State,Washington,Phase 7,"Dec 29, 2021 - Jan 10, 2022",32.7
9132,Symptoms of Anxiety Disorder or Depressive Dis...,West Virginia,By State,West Virginia,Phase 7,"Dec 29, 2021 - Jan 10, 2022",34.6
9133,Symptoms of Anxiety Disorder or Depressive Dis...,Wisconsin,By State,Wisconsin,Phase 7,"Dec 29, 2021 - Jan 10, 2022",25.5


In [19]:
#create a df filtered by United States only

us_only = clean_df[clean_df['State'] == 'United States']

#display 
us_only.head()

Unnamed: 0,Indicator,State,Group,Subgroup,Phases,Time Period Label,Value
1,Symptoms of Depressive Disorder,United States,By Age,18 - 29 years,Phase 1,"Apr 23 - May 5, 2020",32.7
2,Symptoms of Depressive Disorder,United States,By Age,30 - 39 years,Phase 1,"Apr 23 - May 5, 2020",25.7
3,Symptoms of Depressive Disorder,United States,By Age,40 - 49 years,Phase 1,"Apr 23 - May 5, 2020",24.8
4,Symptoms of Depressive Disorder,United States,By Age,50 - 59 years,Phase 1,"Apr 23 - May 5, 2020",23.2
5,Symptoms of Depressive Disorder,United States,By Age,60 - 69 years,Phase 1,"Apr 23 - May 5, 2020",18.4


## Create df for each phase

In [21]:
#create df for each Phase 

#create a list of phases
phases_list = []

for phase in us_only['Phases'].unique().tolist():
    if phase not in phases_list:
        phases_list.append(phase)

#create a list of dataframes
phases_df = [] 
for phase in phases_list:
    us_only_df = us_only[us_only['Phases'] == phase]
    phases_df.append(us_only_df)

In [25]:
#call on the first dataframe; this should output Phase 1 only

phases_df[0].head()

Unnamed: 0,Indicator,State,Group,Subgroup,Phases,Time Period Label,Value
1,Symptoms of Depressive Disorder,United States,By Age,18 - 29 years,Phase 1,"Apr 23 - May 5, 2020",32.7
2,Symptoms of Depressive Disorder,United States,By Age,30 - 39 years,Phase 1,"Apr 23 - May 5, 2020",25.7
3,Symptoms of Depressive Disorder,United States,By Age,40 - 49 years,Phase 1,"Apr 23 - May 5, 2020",24.8
4,Symptoms of Depressive Disorder,United States,By Age,50 - 59 years,Phase 1,"Apr 23 - May 5, 2020",23.2
5,Symptoms of Depressive Disorder,United States,By Age,60 - 69 years,Phase 1,"Apr 23 - May 5, 2020",18.4


In [29]:
#assign phase 1 dataframe to a easier to read variable

phase1df = phases_df[0]

In [31]:
#from Phase 1, filter by age

phase1df_age = phase1df[phase1df['Group'] == 'By Age']

phase1df_age.head()

Unnamed: 0,Indicator,State,Group,Subgroup,Phases,Time Period Label,Value
1,Symptoms of Depressive Disorder,United States,By Age,18 - 29 years,Phase 1,"Apr 23 - May 5, 2020",32.7
2,Symptoms of Depressive Disorder,United States,By Age,30 - 39 years,Phase 1,"Apr 23 - May 5, 2020",25.7
3,Symptoms of Depressive Disorder,United States,By Age,40 - 49 years,Phase 1,"Apr 23 - May 5, 2020",24.8
4,Symptoms of Depressive Disorder,United States,By Age,50 - 59 years,Phase 1,"Apr 23 - May 5, 2020",23.2
5,Symptoms of Depressive Disorder,United States,By Age,60 - 69 years,Phase 1,"Apr 23 - May 5, 2020",18.4


In [42]:
# #create bar chart that breaks down by age group

# #prepare the data to plot
# ages = phase1df_age['Subgroup']
# percent = phase1df_age['Value']

# #create bar chart
# plt.bar(ages, percent, color= ('b', 'g', 'r', 'c', 'm', 'y','w'))

# #adjust labels
# plt.title('Percentage of people showing symptoms in Phase 1')
# plt.xlabel('Age Ranges')
# plt.xticks(rotation= 55)
# plt.ylabel('Percentage')
# plt.show()

## Function - provide parameters

#goal: write a function that allows me to pass a state or subcategory as a parameter and outputs the pertaining information 

In [43]:
def lookup_phase_stats(phases, group):
    print(f"Hey, you looked up {phases} by {group}!")

In [45]:
#call the function
lookup_phase_stats(phases="Phase 1", group="By Age")

Hey, you looked up Phase 1 by By Age!
