In [3]:
import numpy as np
import csv
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import random
from scipy.optimize import check_grad
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
import numdifftools as nd
import collections
import pickle as pkl
from sklearn.calibration import CalibratedClassifierCV
import matplotlib
import warnings
import itertools
from scipy.stats.stats import pearsonr
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import normalized_mutual_info_score

# Load data

In [4]:
# Load the table that links each referral (REFER_ID) to the IDs of the call workers attached to it
data_ids = pd.read_csv("../../data/ChildWelfare/refer_ids_appended.csv")
data_ids.head()

Unnamed: 0,REFER_ID,SCREENER_SSN_NBR,RQST_SSN_NBR,APRV_SSN_NBR
0,514235,T094595,T071543,T071543
1,514236,T094595,T071543,T071543
2,514237,T094595,T071543,T071543
3,514239,T096208,T096208,T071543
4,514240,T096208,T096208,T071543


In [5]:
# Load the per-call records, order them by referral date and keep only the
# first row encountered for every MCI_ID.
# NOTE(review): the default sort kind of sort_values is not a stable sort, so
# which row survives among same-date duplicates of an MCI_ID is not
# guaranteed -- confirm this is acceptable.
dat_sorted = (
    pd.read_csv("../../data/ChildWelfare/child_welfare_with_call.csv")
    .sort_values(['REFER_DT'])
    .drop_duplicates(subset=['MCI_ID'])
)
dat_sorted.head()

Unnamed: 0.1,Unnamed: 0,X,RUN_ID,REFER_DT,REFER_ID,MCI_ID,PL_SCORE,RR_SCORE,AGE_AT_RFRL_CHLD,AGE_AT_RFRL_OTH,...,Terrorizing_Child,Throwing,Truancy,Tying_OR_Close_Confinement,Ungovernable,Unlawful_Contact_With_A_Minor,Unreasonably_RestrainingORConfining,WeltsOREcchymosis,Failure_to_Thrive,Inadequate_Clothing
64,65,82,10000743,2010-03-31,514286,1000615786,-1.612429,0.475528,0,0,...,0,0,0,0,0,0,0,0,0,0
63,64,81,10000743,2010-03-31,514286,1000119855,-1.6345,0.23207,0,0,...,0,0,0,0,0,0,0,0,0,0
62,63,80,10000743,2010-03-31,514286,1000615785,-1.823429,0.313528,0,0,...,0,0,0,0,0,0,0,0,0,0
61,62,79,10000742,2010-04-01,514285,1000630239,-1.7399,-1.316,0,0,...,0,0,0,0,0,0,0,0,0,0
44280,44281,46592,10000830,2010-04-01,514375,1000630509,-1.652,-0.32676,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Keep the last 190 columns of the call data as a separate array, together with their column names.
# NOTE(review): 190 is used here but 195 is used when building col_no_use further down -- confirm both counts are intended.
X_call = np.array(dat_sorted.iloc[:,-190:])
cols_call = dat_sorted.columns[-190:]


In [14]:
# (rows, columns) of the de-duplicated call data
dat_sorted.shape

(46544, 1043)

In [8]:
# Columns that must not be used for prediction: a fixed list of named columns,
# followed by the last 195 columns of dat_sorted.
col_no_use = [
    "SCREEN_OUT_NULL",
    "RUN_ID",
    "REFER_DT",
    "REFER_ID",
    "PL_SCORE",
    "RR_SCORE",
    "prediction_random_forests",
    "REF_2HRS_NULL",
    "REF_24HRS_NULL",
    "REF_MISTIME_NULL",
    "REF_LOW_NULL",
    "REF_MOD_NULL",
    "REF_HIGH_NULL",
    "REF_NORISK_NULL",
    "REF_MISRISK_NULL",
    "REF_DGR_IMPEND_NULL",
    "REF_DGR_PRESENT_NULL",
    "REF_DGR_NO_NULL",
    "REF_DGR_MISSING_NULL",
    "VERIFIEDOUTCOMES_NULL",
    "PENDINGCOURT_NULL",
    "ACCEPTFORSERVICE_NULL",
    "train_test",
    "MCI_ID",
    "DAYS_FROM_LASTRFRL_CHLD",
    "PREVIOUS_RFRL_CHLD",
    "RFRL_RANK_CHLD",
    "RFRL_RANK_OTH",
    "RFRL_RANK_PERP",
    "RFRL_RANK_PRNT",
    "RFRL_RANK_VICT_SELF",
    "RFRL_RANK_VICT_OTHR",
    "plsm_nxt730_dummy",
    'X',
    'Unnamed: 0',
]
col_no_use.extend(dat_sorted.columns[-195:])

In [9]:
# Identifier columns, extracted as (n_rows, 1) arrays
refer_ids = dat_sorted[["REFER_ID"]].values
mci_ids = dat_sorted[["MCI_ID"]].values

# Outcome and decision columns; Y_human is the complement of SCREEN_OUT_NULL
Y_observed = dat_sorted[['plsm_nxt730_dummy']].values
Y_human = 1 - dat_sorted[["SCREEN_OUT_NULL"]].values
Y_serv = dat_sorted['ACCEPTFORSERVICE_NULL'].values
Y_sub = dat_sorted['VERIFIEDOUTCOMES_NULL'].values

# Drop the columns we do not want to use as features
dat_model = dat_sorted.drop(columns=col_no_use)
# Keep only columns that have at least one entry different from 0
dat_model = dat_model.loc[:, dat_model.ne(0).any(axis=0)]
dat_model.head()

Unnamed: 0,AGE_AT_RFRL_VICT_SELF,INFANT_VIC_NULL,TOD_VIC_NULL,PRESC_VIC_NULL,SC1_VIC_NULL,SC2_VIC_NULL,TEEN_VIC_NULL,ADT_VIC_NULL,FEMALE_NULL,POVERYRATE_NULL,...,BIO_MOM_NULL,TOTAL_SUBSTANCE_CHLD,TOTAL_SUBSTANCE_OTH,TOTAL_SUBSTANCE_PERP,TOTAL_SUBSTANCE_PRNT,TOTAL_SUBSTANCE_VICT_SELF,TOTAL_SUBSTANCE_VICT_OTHR,BH_SUBSTANCE,BH_C_20,REF_PAST548_SERV
64,3,0,0,1,0,0,0,0,1,19.6,...,1,0,0,0,0,0,0,0,0,3
63,4,0,0,1,0,0,0,0,1,19.6,...,1,0,0,0,0,0,0,0,0,2
62,1,0,1,0,0,0,0,0,0,19.6,...,1,0,0,0,0,0,0,0,0,2
61,14,0,0,0,0,0,1,0,0,17.0,...,0,0,0,0,0,0,0,0,0,0
44280,8,0,0,0,1,0,0,0,0,11.4,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Create a dictionary indicating which decision-maker (screener) saw each case.
#
# The REFER_ID -> SCREENER_SSN_NBR lookup is built once instead of running a
# full boolean scan of data_ids for every referral (and doing so twice).
# drop_duplicates keeps the first row per REFER_ID, matching the previous
# `.values[0]` "first match" behaviour. A REFER_ID that is absent from
# data_ids still fails loudly (KeyError instead of the former IndexError).
screener_by_refer = (
    data_ids.drop_duplicates(subset='REFER_ID')
    .set_index('REFER_ID')['SCREENER_SSN_NBR']
    .to_dict()
)
# One screener per row of refer_ids, in row order
screener_ids = [screener_by_refer[rid] for rid in refer_ids[:, 0]]
# Same information keyed by referral id
screener_ids_dic = dict(zip(refer_ids[:, 0], screener_ids))
collections.Counter(screener_ids)

Counter({'T095467': 4119,
         'T096208': 4973,
         'T060875': 2483,
         'T084791': 3499,
         'T091442': 2400,
         'T094595': 2384,
         'T044996': 1642,
         'T066265': 4137,
         'T071543': 142,
         'T096966': 2834,
         'T090937': 6,
         'T098306': 1245,
         'T055473': 4877,
         'T079344': 4547,
         'T069459': 642,
         'T092753': 1123,
         'T096969': 4,
         'T098337': 701,
         'T078534': 27,
         'T087566': 382,
         'X001435': 3,
         'T102928': 444,
         'T102948': 59,
         'T098938': 33,
         'T098007': 161,
         'T083431': 2,
         'T101142': 2135,
         'T104299': 196,
         'T083062': 405,
         'T101854': 181,
         'T105008': 43,
         'T104369': 77,
         'T105222': 42,
         'T105208': 30,
         'T099724': 44,
         'T105784': 1,
         'T099665': 495,
         'T105837': 26})

### Select features of X to reduce correlations across features

In [11]:
#Greedily delete correlated features

def corr_matrix(X):
    """Compute pairwise Pearson correlations between the columns of X.

    Parameters
    ----------
    X : 2-D array
        Data matrix; each column is treated as one variable.

    Returns
    -------
    M_corr, M_pvalue : arrays of shape (n_columns, n_columns)
        Correlation coefficients and their p-values. Only the entries above
        the diagonal (row index < column index) are filled in; the diagonal
        and the lower triangle are left at zero.
    """
    n_cols = X.shape[1]
    M_corr = np.zeros((n_cols, n_cols))
    M_pvalue = np.zeros((n_cols, n_cols))
    for first in range(n_cols - 1):
        for second in range(first + 1, n_cols):
            coef, pval = pearsonr(X[:, first], X[:, second])
            M_corr[first, second] = coef
            M_pvalue[first, second] = pval
    return M_corr, M_pvalue

def remove_corr(X, C, columns, tau, verbose=True):
    """Greedily drop features until no absolute correlation exceeds tau.

    At every step the entry of C with the largest absolute value is located
    and the feature given by that entry's *column* index is removed from X,
    from C (row and column) and from columns.

    Parameters
    ----------
    X : 2-D array
        Data matrix with one column per feature.
    C : 2-D square array
        Correlation matrix aligned with the columns of X.
    columns : 1-D array
        Feature names aligned with the columns of X.
    tau : float
        Features are removed while max(|C|) > tau.
    verbose : bool, default True
        Print the index and name of every removed feature.

    Returns
    -------
    X, C, columns
        The reduced data matrix, correlation matrix and feature names.

    Notes
    -----
    NaN correlations are treated as 0: they neither trigger a removal nor
    stop the procedure (a plain max over a matrix containing NaN is NaN,
    which would end the loop before anything is removed). An empty C is
    returned unchanged instead of raising.
    """
    while C.size:
        abs_corr = np.abs(C)
        # Neutralise undefined correlations so max/argmax ignore them
        abs_corr = np.where(np.isnan(abs_corr), 0.0, abs_corr)
        if not (abs_corr.max() > tau):
            break
        # Column index of the strongest remaining correlation
        idx_maxcor = np.unravel_index(abs_corr.argmax(), abs_corr.shape)[1]
        if verbose:
            print(idx_maxcor)
            print(columns[idx_maxcor])
        columns = np.delete(columns, idx_maxcor)
        C = np.delete(C, idx_maxcor, 0)
        C = np.delete(C, idx_maxcor, 1)
        X = np.delete(X, idx_maxcor, 1)
    return X, C, columns
    
    

In [12]:
# Raw feature matrix and the matching column names, both taken from dat_model
X = dat_model.values
colnames = np.array(list(dat_model))



In [13]:
# (rows, features) of the feature matrix before any feature selection
X.shape

(46544, 779)

In [10]:
# Feature selection. Both the column names and the matrix are taken afresh
# from dat_model, so re-running this cell does not apply the selector to an
# already-reduced X while colnames has been reset to the full column list
# (which fails with a mask/length mismatch).
colnames = np.array(list(dat_model))
# Drop features whose variance is below 0.9 * (1 - 0.9) = 0.09; for a 0/1
# feature this means the same value in more than 90% of the rows.
sel = VarianceThreshold(threshold=(.9* (1 - .9)))
X = sel.fit_transform(dat_model.values)
colnames = colnames[sel.get_support()]
# Pairwise correlations, then greedy removal until no |correlation| exceeds 0.5
M_corr, M_pvalue = corr_matrix(X)
X, C, colnames = remove_corr(X, M_corr, colnames, 0.5)

248
DPW_OM_1_PER_VICT_SELF
245
DPW_OM_2_PER_VICT_SELF
203
DPW_FS_1_PER_PRNT
194
DPW_FS_3_PER_VICT_SELF
199
DPW_FS_2_PER_VICT_OTHR
199
DPW_FS_1_PER_CHLD
201
DPW_FS_1_PER_VICT_OTHR
195
DPW_FS_2_PER_CHLD
198
DPW_FS_1_PER_PERP
198
DPW_FS_1_PER_VICT_SELF
238
DPW_OM_1_PER_PRNT
281
DPW_TANF_1_PER_PERP
194
DPW_FS_3_PER_VICT_OTHR
194
DPW_FS_2_PER_PERP
235
DPW_OM_1_PER_PERP
235
DPW_OM_1_PER_VICT_OTHR
234
DPW_OM_2_PER_PERP
194
DPW_FS_2_PER_PRNT
194
DPW_FS_2_PER_VICT_SELF
231
DPW_OM_3_PER_PERP
192
DPW_FS_3_PER_PERP
192
DPW_FS_3_PER_PRNT
118
PLSM_PAST548_LOS_NULL
103
FNDG_PAST180_COUNT_VICT_SELF
102
FNDG_PAST180_COUNT_PERP
102
FNDG_PAST180_COUNT_VICT_OTHR
237
DPW_SSI_2_OTH
177
DPW_FS_2_OTH
107
FNDG_PAST548_COUNT_VICT_OTHR
234
DPW_SSI_2_CHLD
237
DPW_SSI_2_VICT_OTHR
175
DPW_FS_2_CHLD
233
DPW_SSI_2_PERP
233
DPW_SSI_2_PRNT
233
DPW_SSI_2_VICT_SELF
178
DPW_FS_2_VICT_OTHR
175
DPW_FS_2_PERP
176
DPW_FS_2_VICT_SELF
193
DPW_GA_2_CHLD
103
FNDG_PAST365_COUNT_VICT_SELF
391
TOTAL_SUBSTANCE_CHLD
206
DPW_OM_2_OTH
2

In [11]:

# Standardize the features first and append the constant (all-ones) column
# afterwards. Appending it before scaling makes StandardScaler centre it to
# all zeros, which destroys the constant term.
# NOTE(review): this cell still overwrites X, so running it twice scales again
# and appends a second constant column -- run it once after the selection cell.
scaler = sklearn.preprocessing.StandardScaler()
X = scaler.fit_transform(X)
X = np.hstack((X, np.ones((X.shape[0],1))))

In [12]:
# Number of non-zero entries of every retained feature, most frequent first
df_count = dat_model[colnames].ne(0).sum(axis=0)
df_count.to_frame().sort_values(by=0, ascending=False)

Unnamed: 0,0
VIC_COUNT_NULL,46544
POVERYRATE_NULL,45516
PERP_COUNT_NULL,43476
AGE_AT_RFRL_VICT_SELF,40268
NO_BH_VICT_SELF,37683
PRNT_COUNT_NULL,36548
PERP_1_NULL,35232
PERP_FEMALES_NULL,32180
NO_BH_PERP,28743
NO_BH_PRNT,26849


In [13]:
# (rows, columns) of the preprocessed matrix
X.shape

(46544, 216)

In [16]:
# Persist the preprocessing results as one pickled list, in this order:
# X, screener_ids, refer_ids, Y_observed, Y_human, Y_serv, Y_sub, colnames
with open('../../data/ChildWelfare/X_preprocess.pkl', 'wb') as file:
    pkl.dump([X,screener_ids,refer_ids,Y_observed,Y_human,Y_serv,Y_sub,colnames],file)