In [1]:
import pandas as pd

In [8]:
df = pd.read_csv('co-est00int-tot.csv', encoding='ISO-8859-1')

In [9]:
# Zero-pad the state and county codes to ensure correct lengths
df['STATE'] = df['STATE'].apply(lambda x: f'{x:02}')
df['COUNTY'] = df['COUNTY'].apply(lambda x: f'{x:03}')

# Concatenate the two columns to form the FIPS code
df['fips'] = df['STATE'] + df['COUNTY']

In [10]:
df.to_csv('census_county_pop.csv', index=False)

_____

In [13]:
signal = pd.read_csv('signal_final.csv')

In [28]:
signal['depression'] = signal.iloc[:, 5:15].sum(axis=1).apply(lambda x: 1 if x > 0 else 0)

In [30]:
county_depression = signal.groupby(['geoid', 'CENSUS2010POP']).agg({
    'depression': 'sum',
    'user_id': 'count'
}).reset_index()

In [41]:
county_depression

Unnamed: 0,geoid,CENSUS2010POP,depression,user_id
0,1003,182265,0,2
1,1007,22915,0,1
2,1015,118572,0,1
3,1021,43643,0,1
4,1033,54428,0,1
...,...,...,...,...
731,55079,947735,1,25
732,55101,195408,0,2
733,55117,115507,0,1
734,55127,102228,0,2


In [133]:
df_msa = pd.read_csv("https://data.nber.org/cbsa-csa-fips-county-crosswalk/cbsa2fipsxw.csv")
# Concatenate the two columns to form the FIPS code
df_msa['geoid'] = (df_msa['fipsstatecode'].apply(lambda x: f'{x:02}') + 
                   df_msa['fipscountycode'].apply(lambda x: f'{x:03}')).astype(int)
df_msa['fipscode'] = df_msa['geoid']

In [69]:
cbsa_depression = county_depression.merge(df_msa[['cbsatitle','cbsacode','metropolitanmicropolitanstatis','geoid']],how='left')
cbsa_depression = cbsa_depression.groupby(['cbsatitle','cbsacode','metropolitanmicropolitanstatis']).agg({
    'depression': 'sum',
    'user_id': 'sum'
}).reset_index()
cbsa_depression['depression_rate'] = cbsa_depression['depression'] / cbsa_depression['user_id'] 
cbsa_depression = cbsa_depression.loc[(cbsa_depression['depression_rate']>0) & (cbsa_depression['user_id']>= 10)]

In [70]:
cbsa_depression.shape

(91, 6)

In [71]:
cbsa_depression

Unnamed: 0,cbsatitle,cbsacode,metropolitanmicropolitanstatis,depression,user_id,depression_rate
1,"Akron, OH",10420.0,Metropolitan Statistical Area,8,39,0.205128
2,"Albany, GA",10500.0,Metropolitan Statistical Area,4,15,0.266667
4,"Albany-Schenectady-Troy, NY",10580.0,Metropolitan Statistical Area,1,24,0.041667
19,"Atlanta-Sandy Springs-Roswell, GA",12060.0,Metropolitan Statistical Area,48,486,0.098765
20,"Atlantic City-Hammonton, NJ",12100.0,Metropolitan Statistical Area,1,10,0.100000
...,...,...,...,...,...,...
367,"Valdosta, GA",46660.0,Metropolitan Statistical Area,4,13,0.307692
368,"Vallejo, CA",46700.0,Metropolitan Statistical Area,2,17,0.117647
372,"Virginia Beach-Chesapeake-Norfolk, VA-NC",47260.0,Metropolitan Statistical Area,27,170,0.158824
377,"Washington-Arlington-Alexandria, DC-VA-MD-WV",47900.0,Metropolitan Statistical Area,45,401,0.112219


---

In [123]:
df_pop = pd.read_csv('census_county_pop.csv')
pop_df_drop = df_pop[['CENSUS2010POP', 'fips']].copy()
pop_df_drop.columns = ['CENSUS2010POP', 'fipscode']

In [142]:
column_select = ['fipscode','v002_rawvalue','v037_rawvalue','v009_rawvalue','v011_rawvalue','v012_rawvalue',
                 'v045_rawvalue','v004_rawvalue','v007_rawvalue','v021_rawvalue','v022_rawvalue',
                 'v023_rawvalue','v028_rawvalue','v043_rawvalue','v041_rawvalue','v003_rawvalue']
df_factor = df_factor[column_select]
df_factor = df_factor.merge(pop_df_drop)

In [143]:
df_factor['v045_rawvalue'] = df_factor['v045_rawvalue']/100000 #Chlamydia cases per 100,000 people
df_factor['v004_rawvalue'] = df_factor['v004_rawvalue']/100000 #Primary care provider rate per 100,000 population
df_factor['v004_rawvalue'] = df_factor['v004_rawvalue']/100000 # Violent Crimes per 100,000 People 

In [153]:
df_factor_cbsa = df_factor.merge(df_msa[['cbsatitle','cbsacode','metropolitanmicropolitanstatis','fipscode']],how='left')
df_factor_cbsa = df_factor_cbsa.loc[~df_factor_cbsa['cbsacode'].isna()]

In [157]:
# Identify all columns that follow the pattern vXXX_rawvalue
value_columns = [col for col in df_factor_cbsa.columns if col.startswith('v') and col.endswith('_rawvalue')]

# Function to calculate the weighted value for each column
def calculate_weighted(group):
    result = {}
    for col in value_columns:
        weighted_sum = (group[col].fillna(0) * group['CENSUS2010POP']).sum()
        sum_population_non_na = group.loc[group[col].notna(), 'CENSUS2010POP'].sum()
        total_population = group['CENSUS2010POP'].sum()
        
        # Calculate the desired formula
        result[col] = (weighted_sum / sum_population_non_na) * total_population if sum_population_non_na != 0 else 0
    return pd.Series(result)

# Apply the function for each group and reset index
weighted_results = df_factor_cbsa.groupby(['cbsatitle','cbsacode','metropolitanmicropolitanstatis']).apply(calculate_weighted).reset_index()

# Display the result
weighted_results

Unnamed: 0,cbsatitle,cbsacode,metropolitanmicropolitanstatis,v002_rawvalue,v037_rawvalue,v009_rawvalue,v011_rawvalue,v012_rawvalue,v045_rawvalue,v004_rawvalue,v007_rawvalue,v021_rawvalue,v022_rawvalue,v023_rawvalue,v028_rawvalue,v043_rawvalue,v041_rawvalue,v003_rawvalue
0,"Aberdeen, SD",10100.0,Micropolitan Statistical Area,5322.000000,2241.2304,8028.9180,11202.081,7691.88150,103.230333,0.000303,36724.7973,35098.231049,9251.634420,986.661,2740.168455,0.000000e+00,41583.261650,4756.017
1,"Aberdeen, WA",10140.0,Micropolitan Statistical Area,12812.272000,4258.6245,19487.7569,24605.386,10766.67630,130.161036,0.000311,62707.3358,52314.689962,9270.989153,5605.369,8273.696321,1.364215e+07,51384.183180,10045.986
2,"Abilene, TX",10180.0,Metropolitan Statistical Area,37786.460799,13953.0436,40453.6896,44800.770,17573.04852,656.176240,0.001116,124688.1388,125011.108850,32853.874583,6508.456,15368.343648,7.031154e+07,124815.404703,36100.208
3,"Ada, OK",10220.0,Micropolitan Statistical Area,8506.934800,2999.3600,10977.6576,11697.504,5140.15320,149.555588,0.000286,24770.9644,31555.088936,9200.490797,1199.744,3314.825659,1.837417e+07,20907.874191,7310.940
4,"Adrian, MI",10300.0,Micropolitan Statistical Area,12426.564800,6912.5264,18470.0308,30167.384,16721.92080,144.643616,0.000426,85517.5412,82252.597671,18507.848103,10188.984,9225.036529,2.187368e+07,49270.015393,10688.444
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,"Youngstown-Warren, OH",49660.0,Metropolitan Statistical Area,72038.772300,40846.2945,109967.2016,125786.311,70835.03050,1437.548274,0.003886,358715.4324,358226.929339,79763.210499,32729.833,45195.615530,0.000000e+00,332987.543627,51832.326
914,"Yuba City, CA",49700.0,Metropolitan Statistical Area,27770.828800,10432.4062,0.0000,42981.035,26869.61200,435.044654,0.001155,128658.7414,117705.517795,25321.597134,20311.251,18279.954689,7.593015e+07,114381.071179,31141.595
915,"Yuma, AZ",49740.0,Metropolitan Statistical Area,39091.474700,11353.5580,30752.4821,50308.007,33375.54550,516.586889,0.000967,159204.2883,0.000000,24493.789757,33473.421,20727.312271,0.000000e+00,73732.760808,38562.947
916,"Zanesville, OH",49780.0,Micropolitan Statistical Area,11284.301400,7763.8748,26097.6368,25908.274,8968.91080,227.837878,0.000506,71716.8568,71871.076283,10988.912044,7402.364,9326.344481,0.000000e+00,70469.941521,9382.066


In [162]:
pop_cbsa = df_factor_cbsa.groupby(['cbsatitle','cbsacode','metropolitanmicropolitanstatis']).agg({'CENSUS2010POP':'sum'}).reset_index()

---

In [167]:
cbsa_depression.merge(weighted_results,how='left').merge(pop_cbsa,how='left').to_csv('tweet2020_analysis.csv', index=False)