In [19]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import pandas_profiling as profile
from collections import OrderedDict, Counter
import plotly.express as px
import plotly.graph_objects as go
import re 
import sys
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
sys.path.append('../scripts/')
pd.set_option('max_colwidth', 1000)
import hcmst_lib as lib

In [21]:
data = pd.read_stata('../data/HCMST 2017 fresh sample for public sharing draft v1.1.dta')
data.set_index('CaseID', inplace=True)

In [22]:
data_dict = pd.read_csv('../data/final_data_dict.csv')
data_dict.set_index('Column Name', inplace=True)

In [23]:
baseline_fields = ['ppgender', 'ppethm', 'ppeduc', 'ppincimp', 'ppmarit'
                   , 'ppreg9', 'ppmsacat', 'ppwork', 'ppage']
data['ppage'] = data['ppage'].astype('float')

In [24]:
data_dict.loc[baseline_fields]

Unnamed: 0_level_0,Description
Column Name,Unnamed: 1_level_1
ppgender,Gender
ppethm,Race / Ethnicity
ppeduc,Education (Highest Degree Received)
ppincimp,Household Income
ppmarit,Marital Status
ppreg9,Region 9 - Based on State of Residence
ppmsacat,MSA Status PPREG4
ppwork,Current Employment Status
ppage,Age ppagecat


In [25]:
# Have useable dictionary for data set percentages 
baseline_pcts = OrderedDict()
for group in baseline_fields:
    baseline_pcts[group] = data[group].value_counts(normalize=True)

In [26]:
groups = [(18, 24),
 (25, 29),
 (30, 34),
 (35, 39),
 (40, 44),
 (45, 49),
 (50, 54),
 (55, 59),
 (60, 64),
 (65, 69),
 (70, 74),
 (75, 79),
 (80, 84)]

def assign_group(n, grp_set_list, grp_index, n_index):
    
    grp = grp_set_list[grp_index]
    
    rslt = np.where(grp[0] <= n <= grp[1], str(grp), False).tolist()
    if rslt[0] == '(':
        return(rslt)
    elif grp_index < (n_index-1) :
        grp_index = grp_index + 1
        rslt = assign_group(n, grp_set_list, grp_index, len(grp_set_list))
        return(rslt)
    else:
        return "Not in group list"

In [27]:
data['age_group'] = data['ppage'].apply(assign_group, grp_set_list=groups, grp_index=0, n_index=len(groups))
data['age_group'].value_counts()

(55, 59)             417
(25, 29)             410
(60, 64)             379
(50, 54)             335
(40, 44)             272
(35, 39)             272
(65, 69)             267
(18, 24)             254
(30, 34)             251
(45, 49)             233
(70, 74)             222
(75, 79)             124
(80, 84)              47
Not in group list     27
Name: age_group, dtype: int64

In [28]:
q_name = 'w6_q32'
data_dict.loc[q_name].str.replace('\s{2,}', ' ')

Description    did you use an Internet service to meet partner? 
Name: w6_q32, dtype: object

In [29]:
data.shape

(3510, 285)

In [30]:
# # Filter
# if q_name == 'w6_q32':
#     data = data[data['S2'].isin(['Yes, I have a sexual partner (boyfriend or girlfriend)'
#                                   , 'I have a romantic partner who is not yet a sexual partner'])]
# else:
#     data = data.copy()

In [31]:
data.shape

(3510, 285)

In [32]:
data["flag internet service to meet partner"] = np.where(data['w6_q32'].str.startswith('No, ') == True, 1, 0)


In [33]:
600/2900

0.20689655172413793

In [34]:
def bivar_analysis(df):
    out = {}
    out['Count'] = df.shape[1]
    out['Flag Internet Service to Meet Partner'] = np.round(df["flag internet service to meet partner"].mean(), 3)
    out['Count Internet Service to Meet Partner'] = df["flag internet service to meet partner"].sum()
    
    out_series = pd.Series(out)
    return(out_series)

In [35]:
avg_partner_meet_by_age = data.groupby(['age_group', 'ppgender']).apply(lambda x:bivar_analysis(x))
avg_partner_meet_by_age.reset_index(inplace=True)
avg_partner_meet_by_age.head()


Unnamed: 0,age_group,ppgender,Count,Flag Internet Service to Meet Partner,Count Internet Service to Meet Partner
0,"(18, 24)",Male,286.0,0.582,64.0
1,"(18, 24)",Female,286.0,0.632,91.0
2,"(25, 29)",Male,286.0,0.618,105.0
3,"(25, 29)",Female,286.0,0.746,179.0
4,"(30, 34)",Male,286.0,0.719,82.0


In [36]:
px.line(avg_partner_meet_by_age, 'age_group', 'Flag Internet Service to Meet Partner'
       , color='ppgender')