# Unemployment Rate by Age Group in CA
https://data.ca.gov/dataset/unemployment-rate-by-age-groups

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math

In [2]:
unemploy_df = pd.read_csv('../data/Raw/adult_unemployment.csv')

In [3]:
unemploy_df

Unnamed: 0,Area Type,Area Name,Date,Year,Month,Age 16-19,Age 20-24,Age 25-34,Age 35-44,Age 45-54,Age 55-64,Age 65+
0,California,State,05/01/2019,2019,May,0.2,0.1,0.0,0.0,0.0,0.0,0.0
1,California,State,04/01/2019,2019,April,0.2,0.1,0.0,0.0,0.0,0.0,0.0
2,California,State,03/01/2019,2019,March,0.2,0.1,0.0,0.0,0.0,0.0,0.0
3,California,State,02/01/2019,2019,February,0.1,0.1,0.0,0.0,0.0,0.0,0.0
4,California,State,01/01/2019,2019,January,0.2,0.1,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
148,California,State,06/01/2020,2020,June,0.2,0.1,0.1,0.1,0.1,0.1,0.1
149,California,State,07/01/2020,2020,July,0.2,0.1,0.1,0.1,0.1,0.1,0.1
150,California,State,08/01/2020,2020,August,0.2,0.1,0.1,0.1,0.1,0.1,0.1
151,California,State,09/01/2020,2020,September,0.2,0.1,0.1,0.1,0.1,0.1,0.1


In [4]:
unemploy_df.columns

Index(['Area Type', 'Area Name', 'Date', 'Year', 'Month', 'Age 16-19',
       'Age 20-24', 'Age 25-34', 'Age 35-44', 'Age 45-54', 'Age 55-64',
       'Age 65+'],
      dtype='object')

Interesting; there's an Area type and Area Name but let's see if there are actually different areas in our data or if it's just the state of CA overall:

In [5]:
unemploy_df.sample(30)

Unnamed: 0,Area Type,Area Name,Date,Year,Month,Age 16-19,Age 20-24,Age 25-34,Age 35-44,Age 45-54,Age 55-64,Age 65+
123,California,State,02/01/2009,2009,February,0.3,0.1,0.1,0.1,0.1,0.0,0.1
5,California,State,12/01/2018,2018,December,0.2,0.1,0.0,0.0,0.0,0.0,0.0
152,California,State,10/01/2020,2020,October,0.2,0.2,0.1,0.1,0.1,0.1,0.1
74,California,State,03/01/2013,2013,March,0.3,0.2,0.1,0.1,0.1,0.1,0.1
29,California,State,12/01/2016,2016,December,0.2,0.1,0.1,0.0,0.0,0.0,0.0
84,California,State,05/01/2012,2012,May,0.4,0.2,0.1,0.1,0.1,0.1,0.1
38,California,State,03/01/2016,2016,March,0.2,0.1,0.1,0.0,0.0,0.0,0.0
18,California,State,11/01/2017,2017,November,0.2,0.1,0.0,0.0,0.0,0.0,0.0
43,California,State,10/01/2015,2015,October,0.2,0.1,0.1,0.0,0.0,0.0,0.1
87,California,State,02/01/2012,2012,February,0.4,0.2,0.1,0.1,0.1,0.1,0.1


In [6]:
unemploy_df.nunique()

Area Type      1
Area Name      1
Date         153
Year          13
Month         12
Age 16-19      4
Age 20-24      2
Age 25-34      2
Age 35-44      2
Age 45-54      2
Age 55-64      2
Age 65+        2
dtype: int64

In [8]:
unemploy_df.columns.unique()

Index(['Area Type', 'Area Name', 'Date', 'Year', 'Month', 'Age 16-19',
       'Age 20-24', 'Age 25-34', 'Age 35-44', 'Age 45-54', 'Age 55-64',
       'Age 65+'],
      dtype='object')

Taking a sample several times and looking at the number of unique columns, it seems like the data is covering the entire state of CA rather than showing us specific areas. 

I'm going to rename the columns to make them more convenient for me and easier to work with:

In [10]:
# renaming columns:

cname_dict = {
    'Year' : 'year',
    'Month' : 'month',
    'Age 16-19' : '16-19',
    'Age 20-24' : '20-24',
    'Age 25-34' : '25-34',
    'Age 35-44' : '35-44',
    'Age 45-54' : '45-54',
    'Age 55-64' : '55-64',
    'Age 65+' : '65+'
}

In [11]:
unemploy_df = unemploy_df.rename(columns=cname_dict)

In [12]:
unemploy_df

Unnamed: 0,Area Type,Area Name,Date,year,month,16-19,20-24,25-34,35-44,45-54,55-64,65+
0,California,State,05/01/2019,2019,May,0.2,0.1,0.0,0.0,0.0,0.0,0.0
1,California,State,04/01/2019,2019,April,0.2,0.1,0.0,0.0,0.0,0.0,0.0
2,California,State,03/01/2019,2019,March,0.2,0.1,0.0,0.0,0.0,0.0,0.0
3,California,State,02/01/2019,2019,February,0.1,0.1,0.0,0.0,0.0,0.0,0.0
4,California,State,01/01/2019,2019,January,0.2,0.1,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
148,California,State,06/01/2020,2020,June,0.2,0.1,0.1,0.1,0.1,0.1,0.1
149,California,State,07/01/2020,2020,July,0.2,0.1,0.1,0.1,0.1,0.1,0.1
150,California,State,08/01/2020,2020,August,0.2,0.1,0.1,0.1,0.1,0.1,0.1
151,California,State,09/01/2020,2020,September,0.2,0.1,0.1,0.1,0.1,0.1,0.1


In [13]:
# Choosing columns with the data I want
cols_to_use = [
    'year',
    'month',
    '16-19',
    '20-24',
    '35-44',
    '45-54',
    '55-64',
    '65+'
]

unemploy_df = unemploy_df[cols_to_use].rename(columns=cname_dict)

In [14]:
unemploy_df

Unnamed: 0,year,month,16-19,20-24,35-44,45-54,55-64,65+
0,2019,May,0.2,0.1,0.0,0.0,0.0,0.0
1,2019,April,0.2,0.1,0.0,0.0,0.0,0.0
2,2019,March,0.2,0.1,0.0,0.0,0.0,0.0
3,2019,February,0.1,0.1,0.0,0.0,0.0,0.0
4,2019,January,0.2,0.1,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
148,2020,June,0.2,0.1,0.1,0.1,0.1,0.1
149,2020,July,0.2,0.1,0.1,0.1,0.1,0.1
150,2020,August,0.2,0.1,0.1,0.1,0.1,0.1
151,2020,September,0.2,0.1,0.1,0.1,0.1,0.1


In [15]:
unemploy_df.sample(20)

Unnamed: 0,year,month,16-19,20-24,35-44,45-54,55-64,65+
122,2009,March,0.3,0.1,0.1,0.1,0.1,0.1
121,2009,April,0.3,0.1,0.1,0.1,0.1,0.1
108,2010,May,0.3,0.2,0.1,0.1,0.1,0.1
27,2017,February,0.2,0.1,0.0,0.0,0.0,0.0
77,2012,December,0.3,0.2,0.1,0.1,0.1,0.1
17,2017,December,0.2,0.1,0.0,0.0,0.0,0.0
41,2015,December,0.2,0.1,0.0,0.0,0.0,0.0
88,2012,January,0.4,0.2,0.1,0.1,0.1,0.1
134,2008,March,0.2,0.1,0.0,0.0,0.0,0.0
58,2014,July,0.3,0.1,0.1,0.1,0.1,0.1


In [16]:
unemploy_df.corr?

### Wondering if I can find any correlation data between unemployment and age group? And see if I can make any notable observations from this correlation data? I'm trying to see if age group and unemployment has any relationship.

In [17]:
unemploy_df.corr(method='pearson', min_periods=1)

Unnamed: 0,year,16-19,20-24,35-44,45-54,55-64,65+
year,1.0,-0.607598,-0.521799,-0.567346,-0.548129,-0.455411,-0.536495
16-19,-0.607598,1.0,0.726541,0.764963,0.781692,0.7454,0.724972
20-24,-0.521799,0.726541,1.0,0.642578,0.659816,0.659816,0.600915
35-44,-0.567346,0.764963,0.642578,1.0,0.973875,0.894449,0.935162
45-54,-0.548129,0.781692,0.659816,0.973875,1.0,0.89441,0.910731
55-64,-0.455411,0.7454,0.659816,0.894449,0.89441,1.0,0.910731
65+,-0.536495,0.724972,0.600915,0.935162,0.910731,0.910731,1.0


In [18]:
col_to_use = [
    'year',
    '55-64'
]

unemploy_df1 = unemploy_df[col_to_use]

In [19]:
unemploy_df1

Unnamed: 0,year,55-64
0,2019,0.0
1,2019,0.0
2,2019,0.0
3,2019,0.0
4,2019,0.0
...,...,...
148,2020,0.1
149,2020,0.1
150,2020,0.1
151,2020,0.1


### ^Wondering if focusing on the 55-64 age group will help me gain insights?

In [32]:
depress_df = pd.read_csv('../data/depress_CLEANED.csv')

In [33]:
depress_df.sample(15)

Unnamed: 0,year,category,category_name,count,weighted_count,percent,lower_cl,upper_cl
19,2012,Age,35 to 44,240,576639.0,11.13,9.57,12.69
10,2012,Education,Some College or Tech School,563,947473.0,13.25,11.95,14.55
5,2012,Race-Ethnicity,Hispanic,412,923174.0,9.98,8.91,11.05
147,2018,Education,High School Graduate or GED Certificate,337,830587.0,15.53,11.4,19.65
126,2017,Education,College Graduate or Post Grad,618,1852870.0,15.84,13.47,18.21
104,2016,Income,"< $20,000",396,956963.0,20.8,17.11,24.49
154,2018,Income,"$75,000 - $99,999",323,848592.0,14.98,11.25,18.72
34,2013,Education,College Graduate or Post Grad,583,1170903.0,11.45,10.31,12.59
116,2017,Sex,Male,525,2001004.0,14.61,11.78,17.43
20,2012,Age,45 to 54,409,770238.0,14.67,13.14,16.2


In [35]:
# age filter
age_filter = depress_df['category'] == 'Age'

age_depress_df = depress_df[age_filter]

age_grouped = age_depress_df.groupby(['year', 'category_name'])['percent'].sum().unstack()

age_grouped

category_name,18 to 34,35 to 44,45 to 54,55 to 64,65+ years
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012,7.63,11.13,14.67,17.19,12.63
2013,9.96,10.42,16.81,18.39,13.41
2014,10.18,12.45,14.62,19.91,13.19
2015,9.81,14.21,13.27,16.99,13.85
2016,11.84,11.85,14.59,18.65,14.61
2017,19.8,19.53,16.4,22.32,16.94
2018,17.69,14.56,20.06,21.44,15.6


In [37]:
# using only age category column
col_use = [
    '55 to 64'
]

age_depress_df = age_grouped[col_use]

age_depress_df

category_name,55 to 64
year,Unnamed: 1_level_1
2012,17.19
2013,18.39
2014,19.91
2015,16.99
2016,18.65
2017,22.32
2018,21.44


In [38]:
# merging the two data sets to show unemployment and depression side by side 
age_unemploy_df = pd.merge(unemploy_df1, age_depress_df, on='year', how='left')

In [39]:
age_unemploy_df

Unnamed: 0,year,55-64,55 to 64
0,2019,0.0,
1,2019,0.0,
2,2019,0.0,
3,2019,0.0,
4,2019,0.0,
...,...,...,...
148,2020,0.1,
149,2020,0.1,
150,2020,0.1,
151,2020,0.1,


In [40]:
age_unemploy_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 153 entries, 0 to 152
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   year      153 non-null    int64  
 1   55-64     153 non-null    float64
 2   55 to 64  84 non-null     float64
dtypes: float64(2), int64(1)
memory usage: 4.8 KB


* Nothing too meaningful, since the unemployment rates are not telling us about any big spikes or useful information (it's been pretty static over the years)
* However, I can still mention unemployment in a short sentence in my data story to indicate these sorts of socioeconomic factors as possible predictors for mental health issues like depression